From 74384c634aa5b1ef9acb360d6616987fd64b7697 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 6 Jun 2018 13:22:01 +0200 Subject: [PATCH 01/31] added tests for constant impute strategy in simpleImputer --- sklearn/tests/test_impute.py | 133 ++++++++++++++++++++++++++++++++--- 1 file changed, 125 insertions(+), 8 deletions(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 954a016a835bb..6457b9ec83a27 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -24,16 +24,14 @@ def _check_statistics(X, X_true, strategy, statistics, missing_values): """Utility function for testing imputation for a given strategy. - Test: - - along the two axes - - with dense and sparse arrays + Test with dense and sparse arrays Check that: - the statistics (mean, median, mode) are correct - the missing values are imputed correctly""" err_msg = "Parameters: strategy = %s, missing_values = %s, " \ - "axis = {0}, sparse = {1}" % (strategy, missing_values) + "sparse = {0}" % (strategy, missing_values) assert_ae = assert_array_equal if X.dtype.kind == 'f' or X_true.dtype.kind == 'f': @@ -43,8 +41,8 @@ def _check_statistics(X, X_true, imputer = SimpleImputer(missing_values, strategy=strategy) X_trans = imputer.fit(X).transform(X.copy()) assert_ae(imputer.statistics_, statistics, - err_msg=err_msg.format(0, False)) - assert_ae(X_trans, X_true, err_msg=err_msg.format(0, False)) + err_msg=err_msg.format(False)) + assert_ae(X_trans, X_true, err_msg=err_msg.format(False)) # Sparse matrix imputer = SimpleImputer(missing_values, strategy=strategy) @@ -55,8 +53,8 @@ def _check_statistics(X, X_true, X_trans = X_trans.toarray() assert_ae(imputer.statistics_, statistics, - err_msg=err_msg.format(0, True)) - assert_ae(X_trans, X_true, err_msg=err_msg.format(0, True)) + err_msg=err_msg.format(True)) + assert_ae(X_trans, X_true, err_msg=err_msg.format(True)) def test_imputation_shape(): @@ -210,6 +208,125 @@ def test_imputation_most_frequent(): _check_statistics(X, X_true, "most_frequent", [np.nan, 2, 3, 3], -1) +def test_imputation_constant_integer(): + # Test imputation using the constant strategy + # on integers + X = np.array([ + [-1, 2, 3, -1], + [4, -1, 5, -1], + [6, 7, -1, -1], + [8, 9, 0, -1] + ]) + + X_true = np.array([ + [0, 2, 3], + [4, 0, 5], + [6, 7, 0], + [8, 9, 0] + ]) + + imputer = SimpleImputer(missing_value=-1, strategy="constant", + fill_value=0) + X_trans = imputer.fit(X).transform(X) + + assert_array_equal(X_trans, X_true) + + +def test_imputation_constant_float(): + # Test imputation using the constant strategy + # on floats + X = np.array([ + [np.nan, 1.1, 2.2, np.nan], + [3.3, np.nan, 4.4, np.nan], + [5.5, 6.6, np.nan, np.nan], + [7.7, 8.8, 9.9, np.nan] + ]) + + X_true = np.array([ + [0, 1.1, 2.2], + [3.3, 0, 4.4], + [5.5, 6.6, 0], + [7.7, 8.8, 9.9] + ]) + + imputer = SimpleImputer(strategy="constant", fill_value=0) + X_trans = imputer.fit(X).transform(X) + + assert_allclose(X_trans, X_true) + + +def test_imputation_constant_object(): + # Test imputation using the constant strategy + # on objects + X = np.array([ + [None, "a", "b", None], + ["c", None, "d", None], + ["e", "f", None, None], + ["g", "h", "i", None] + ], dtype=object) + + X_true = np.array([ + ["Z", "a", "b"], + ["c", "Z", "d"], + ["e", "f", "Z"], + ["g", "h", "i"] + ]) + + imputer = SimpleImputer(None, strategy="constant", fill_value="Z") + X_trans = imputer.fit(X).transform(X) + + assert_array_equal(X_trans, X_true) + + +def test_imputation_constant_object_nan(): + # Test 
imputation using the constant strategy + # on objects + X = np.array([ + [np.nan, "a", "b", np.nan], + ["c", np.nan, "d", np.nan], + ["e", "f", np.nan, np.nan], + ["g", "h", "i", np.nan] + ], dtype=object) + + X_true = np.array([ + ["missing", "a", "b"], + ["c", "missing", "d"], + ["e", "f", "missing"], + ["g", "h", "i"] + ], dtype=object) + + imputer = SimpleImputer(None, strategy="constant", fill_value="missing") + X_trans = imputer.fit(X).transform(X) + + assert_array_equal(X_trans, X_true) + + +def test_imputation_constant_pandas(): + # Test imputation using the constant strategy + # on pandas df + pd = pytest.importskip("pandas") + + for dtype in [object, "category"]: + df = pd.DataFrame([ + [np.nan, "a", "b", np.nan], + ["c", np.nan, "d", np.nan], + ["e", "f", np.nan, np.nan], + ["g", "h", "i", np.nan] + ], dtype=dtype) + + X_true = np.array([ + ["missing", "a", "b"], + ["c", "missing", "d"], + ["e", "f", "missing"], + ["g", "h", "i"] + ], dtype=object) + + imputer = SimpleImputer(strategy="constant", fill_value="missing") + X_trans = imputer.fit(df).transform(df) + + assert_array_equal(X_trans, X_true) + + def test_imputation_pipeline_grid_search(): # Test imputation within a pipeline + gridsearch. pipeline = Pipeline([('imputer', SimpleImputer(missing_values=0)), From 6dd6a5ef3da6d0ee1343ae4823f0ce4dd81f6fa8 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 6 Jun 2018 13:30:57 +0200 Subject: [PATCH 02/31] typos --- sklearn/tests/test_impute.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 6457b9ec83a27..93859a0a9e0c2 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -225,7 +225,7 @@ def test_imputation_constant_integer(): [8, 9, 0] ]) - imputer = SimpleImputer(missing_value=-1, strategy="constant", + imputer = SimpleImputer(missing_values=-1, strategy="constant", fill_value=0) X_trans = imputer.fit(X).transform(X) @@ -266,13 +266,13 @@ def test_imputation_constant_object(): ], dtype=object) X_true = np.array([ - ["Z", "a", "b"], - ["c", "Z", "d"], - ["e", "f", "Z"], + ["missing", "a", "b"], + ["c", "missing", "d"], + ["e", "f", "missing"], ["g", "h", "i"] ]) - imputer = SimpleImputer(None, strategy="constant", fill_value="Z") + imputer = SimpleImputer(missing_values=None, strategy="constant", fill_value="missing") X_trans = imputer.fit(X).transform(X) assert_array_equal(X_trans, X_true) @@ -295,7 +295,7 @@ def test_imputation_constant_object_nan(): ["g", "h", "i"] ], dtype=object) - imputer = SimpleImputer(None, strategy="constant", fill_value="missing") + imputer = SimpleImputer(missing_values=None, strategy="constant", fill_value="missing") X_trans = imputer.fit(X).transform(X) assert_array_equal(X_trans, X_true) @@ -323,7 +323,7 @@ def test_imputation_constant_pandas(): imputer = SimpleImputer(strategy="constant", fill_value="missing") X_trans = imputer.fit(df).transform(df) - + assert_array_equal(X_trans, X_true) From 2b101fbbee1175da653f793c2a47eb28c11fb18e Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 6 Jun 2018 13:37:53 +0200 Subject: [PATCH 03/31] typos --- sklearn/tests/test_impute.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 93859a0a9e0c2..e46ca7e9837df 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -250,7 +250,7 @@ def test_imputation_constant_float(): ]) imputer = 
SimpleImputer(strategy="constant", fill_value=0) - X_trans = imputer.fit(X).transform(X) + X_trans = imputer.fit(X).transform(X) assert_allclose(X_trans, X_true) @@ -272,8 +272,9 @@ def test_imputation_constant_object(): ["g", "h", "i"] ]) - imputer = SimpleImputer(missing_values=None, strategy="constant", fill_value="missing") - X_trans = imputer.fit(X).transform(X) + imputer = SimpleImputer(missing_values=None, strategy="constant", + fill_value="missing") + X_trans = imputer.fit(X).transform(X) assert_array_equal(X_trans, X_true) @@ -295,8 +296,9 @@ def test_imputation_constant_object_nan(): ["g", "h", "i"] ], dtype=object) - imputer = SimpleImputer(missing_values=None, strategy="constant", fill_value="missing") - X_trans = imputer.fit(X).transform(X) + imputer = SimpleImputer(missing_values=None, strategy="constant", + fill_value="missing") + X_trans = imputer.fit(X).transform(X) assert_array_equal(X_trans, X_true) From 6e13e68eef49eec22d1116ce896f591c69cb481e Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 6 Jun 2018 13:40:39 +0200 Subject: [PATCH 04/31] typos --- sklearn/tests/test_impute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index e46ca7e9837df..22e53a56f73c8 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -306,7 +306,7 @@ def test_imputation_constant_object_nan(): def test_imputation_constant_pandas(): # Test imputation using the constant strategy # on pandas df - pd = pytest.importskip("pandas") + pd = pytest.importorskip("pandas") for dtype in [object, "category"]: df = pd.DataFrame([ From f300fe24e959c07f9a72a35007d06d905544c0d7 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 6 Jun 2018 18:05:42 +0200 Subject: [PATCH 05/31] added constant strategy to the SimpleImputer. --- sklearn/impute.py | 139 +++++++++++++++++++++++++---------- sklearn/tests/test_impute.py | 77 ++++++++++--------- 2 files changed, 143 insertions(+), 73 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index fe772d6a3a0cb..5711eb55c3196 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -7,6 +7,7 @@ import warnings from time import time +import numbers import numpy as np import numpy.ma as ma @@ -36,11 +37,20 @@ 'MICEImputer', ] +def _is_scalar_nan(x): + """Work around limitations of numpy ufuncs""" + return False if x is None else np.isnan(x) + def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" - if value_to_mask == "NaN" or np.isnan(value_to_mask): - return np.isnan(X) + if value_to_mask == "NaN" or _is_scalar_nan(value_to_mask): + if X.dtype.kind == "O": + # np.isnan does not work for dtype objects. We use the trick that + # nan values are never equal to themselves. + return np.logical_not(X == X) + else: + return np.isnan(X) else: return X == value_to_mask @@ -94,6 +104,13 @@ class SimpleImputer(BaseEstimator, TransformerMixin): each column. - If "most_frequent", then replace missing using the most frequent value along each column. + - If "constant", then replace missing values with fill_value + + fill_value : string or numerical value, optional (default=None) + When strategy == "constant", fill_value is used to replace all + occurrences of missing_values. + If left to the default, fill_value will be 0 when imputing numerical + data and "missing_value" for strings or object data types. verbose : integer, optional (default=0) Controls the verbosity of the imputer. 
@@ -115,16 +132,41 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Notes ----- Columns which only contained missing values at `fit` are discarded upon - `transform`. + `transform` is strategy is not "constant" """ def __init__(self, missing_values="NaN", strategy="mean", - verbose=0, copy=True): + fill_value=None, verbose=0, copy=True): self.missing_values = missing_values self.strategy = strategy + self.fill_value = fill_value self.verbose = verbose self.copy = copy + def _validate_input(self, X): + allowed_strategies = ["mean", "median", "most_frequent", "constant"] + if self.strategy not in allowed_strategies: + raise ValueError("Can only use these strategies: {0} " + " got strategy={1}".format(allowed_strategies, + self.strategy)) + + if self.strategy in ("most_frequent", "constant"): + dtype = None + else: + dtype = FLOAT_DTYPES + + if self.missing_values is None: + force_all_finite = "allow-nan" + else: + if self.missing_values == "NaN" or np.isnan(self.missing_values): + force_all_finite = "allow-nan" + else: + force_all_finite = True + + return check_array(X, accept_sparse='csc', dtype=dtype, + force_all_finite=force_all_finite) + + def fit(self, X, y=None): """Fit the imputer on X. @@ -138,30 +180,37 @@ def fit(self, X, y=None): ------- self : SimpleImputer """ - # Check parameters - allowed_strategies = ["mean", "median", "most_frequent"] - if self.strategy not in allowed_strategies: - raise ValueError("Can only use these strategies: {0} " - " got strategy={1}".format(allowed_strategies, - self.strategy)) - - X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, - force_all_finite='allow-nan' - if self.missing_values == 'NaN' - or np.isnan(self.missing_values) else True) + X = self._validate_input(X) + + if self.strategy == "constant": + if (X.dtype.kind in ("i", "f") + and not isinstance(self.fill_value, numbers.Real)): + raise ValueError( + "fill_value={0} is invalid. Expected a numerical value " + "to numerical data".format(self.fill_value)) + + if self.fill_value is None: + if X.dtype.kind in ("i", "f"): + fill_value = 0 + else: + fill_value = "missing_value" + else: + fill_value = self.fill_value if sparse.issparse(X): self.statistics_ = self._sparse_fit(X, self.strategy, - self.missing_values) + self.missing_values, + fill_value) else: self.statistics_ = self._dense_fit(X, self.strategy, - self.missing_values) + self.missing_values, + fill_value) return self - def _sparse_fit(self, X, strategy, missing_values): + def _sparse_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on sparse data.""" # Count the zeros if missing_values == 0: @@ -233,12 +282,14 @@ def _sparse_fit(self, X, strategy, missing_values): n_zeros_axis[i]) return most_frequent + + # Constant + elif strategy == "constant": + + return np.full(X.shape[0], fill_value) - def _dense_fit(self, X, strategy, missing_values): + def _dense_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on dense data.""" - X = check_array(X, force_all_finite='allow-nan' - if self.missing_values == 'NaN' - or np.isnan(self.missing_values) else True) mask = _get_mask(X, missing_values) masked_X = ma.masked_array(X, mask=mask) @@ -280,6 +331,16 @@ def _dense_fit(self, X, strategy, missing_values): return most_frequent + # Constant + elif strategy == "constant": + if isinstance(fill_value, numbers.Real): + dtype = None + else: + dtype = object + + return np.full(X.shape[0], fill_value, dtype=dtype) + + def transform(self, X): """Impute all missing values in X. 
@@ -289,27 +350,29 @@ def transform(self, X): The input data to complete. """ check_is_fitted(self, 'statistics_') - X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, - force_all_finite='allow-nan' - if self.missing_values == 'NaN' - or np.isnan(self.missing_values) else True, - copy=self.copy) + + X = self._validate_input(X) + statistics = self.statistics_ if X.shape[1] != statistics.shape[0]: raise ValueError("X has %d features per sample, expected %d" % (X.shape[1], self.statistics_.shape[0])) - # Delete the invalid columns - invalid_mask = np.isnan(statistics) - valid_mask = np.logical_not(invalid_mask) - valid_statistics = statistics[valid_mask] - valid_statistics_indexes = np.flatnonzero(valid_mask) - missing = np.arange(X.shape[1])[invalid_mask] - - if invalid_mask.any(): - if self.verbose: - warnings.warn("Deleting features without " - "observed values: %s" % missing) + # Delete the invalid columns if strategy is not constant + if self.strategy == "constant": + valid_statistics = statistics + else: + invalid_mask = np.isnan(statistics) + valid_mask = np.logical_not(invalid_mask) + + if invalid_mask.any(): + missing = np.arange(X.shape[1])[invalid_mask] + if self.verbose: + warnings.warn("Deleting features without " + "observed values: %s" % missing) + + valid_statistics = statistics[valid_mask] + valid_statistics_indexes = np.flatnonzero(valid_mask) X = X[:, valid_statistics_indexes] # Do actual imputation diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 22e53a56f73c8..9e788012b85c2 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -219,10 +219,10 @@ def test_imputation_constant_integer(): ]) X_true = np.array([ - [0, 2, 3], - [4, 0, 5], - [6, 7, 0], - [8, 9, 0] + [0, 2, 3, 0], + [4, 0, 5, 0], + [6, 7, 0, 0], + [8, 9, 0, 0] ]) imputer = SimpleImputer(missing_values=-1, strategy="constant", @@ -235,24 +235,32 @@ def test_imputation_constant_integer(): def test_imputation_constant_float(): # Test imputation using the constant strategy # on floats - X = np.array([ - [np.nan, 1.1, 2.2, np.nan], - [3.3, np.nan, 4.4, np.nan], - [5.5, 6.6, np.nan, np.nan], - [7.7, 8.8, 9.9, np.nan] - ]) - - X_true = np.array([ - [0, 1.1, 2.2], - [3.3, 0, 4.4], - [5.5, 6.6, 0], - [7.7, 8.8, 9.9] - ]) + for format in ["csr", "array"]: + X = np.array([ + [np.nan, 1.1, 2.2, np.nan], + [3.3, np.nan, 4.4, np.nan], + [5.5, 6.6, np.nan, np.nan], + [7.7, 8.8, 9.9, np.nan] + ]) - imputer = SimpleImputer(strategy="constant", fill_value=0) - X_trans = imputer.fit(X).transform(X) + X = sparse.csr_matrix(X) if format == "csr" else X - assert_allclose(X_trans, X_true) + X_true = np.array([ + [0, 1.1, 2.2, 0], + [3.3, 0, 4.4, 0], + [5.5, 6.6, 0, 0], + [7.7, 8.8, 9.9, 0] + ]) + + X_true = sparse.csr_matrix(X_true) if format == "csr" else X_true + + imputer = SimpleImputer(strategy="constant", fill_value=0) + X_trans = imputer.fit(X).transform(X) + + if format == "csr": + assert_allclose(X_trans.toarray(), X_true.toarray()) + else: + assert_allclose(X_trans, X_true) def test_imputation_constant_object(): @@ -266,11 +274,11 @@ def test_imputation_constant_object(): ], dtype=object) X_true = np.array([ - ["missing", "a", "b"], - ["c", "missing", "d"], - ["e", "f", "missing"], - ["g", "h", "i"] - ]) + ["missing", "a", "b", "missing"], + ["c", "missing", "d", "missing"], + ["e", "f", "missing", "missing"], + ["g", "h", "i", "missing"] + ], dtype=object) imputer = SimpleImputer(missing_values=None, strategy="constant", fill_value="missing") @@ -290,14 +298,13 
@@ def test_imputation_constant_object_nan(): ], dtype=object) X_true = np.array([ - ["missing", "a", "b"], - ["c", "missing", "d"], - ["e", "f", "missing"], - ["g", "h", "i"] + ["missing_value", "a", "b", "missing_value"], + ["c", "missing_value", "d", "missing_value"], + ["e", "f", "missing_value", "missing_value"], + ["g", "h", "i", "missing_value"] ], dtype=object) - imputer = SimpleImputer(missing_values=None, strategy="constant", - fill_value="missing") + imputer = SimpleImputer(strategy="constant") X_trans = imputer.fit(X).transform(X) assert_array_equal(X_trans, X_true) @@ -317,10 +324,10 @@ def test_imputation_constant_pandas(): ], dtype=dtype) X_true = np.array([ - ["missing", "a", "b"], - ["c", "missing", "d"], - ["e", "f", "missing"], - ["g", "h", "i"] + ["missing", "a", "b", "missing"], + ["c", "missing", "d", "missing"], + ["e", "f", "missing", "missing"], + ["g", "h", "i", "missing"] ], dtype=object) imputer = SimpleImputer(strategy="constant", fill_value="missing") From 35e30ac4f26d66fcc5cd9ca398c333531990be27 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 7 Jun 2018 15:29:54 +0200 Subject: [PATCH 06/31] bug fixes on the SimpleImputer and change for default value to np.nan on MICEImputer --- sklearn/impute.py | 87 +++++++++++++++--------------------- sklearn/tests/test_impute.py | 71 +++++++++++------------------ 2 files changed, 62 insertions(+), 96 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 5711eb55c3196..390b2abcb9879 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -37,20 +37,12 @@ 'MICEImputer', ] -def _is_scalar_nan(x): - """Work around limitations of numpy ufuncs""" - return False if x is None else np.isnan(x) - def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" - if value_to_mask == "NaN" or _is_scalar_nan(value_to_mask): - if X.dtype.kind == "O": - # np.isnan does not work for dtype objects. We use the trick that - # nan values are never equal to themselves. - return np.logical_not(X == X) - else: - return np.isnan(X) + if value_to_mask is np.nan: + # nan values are never equal to themselves + return X != X else: return X == value_to_mask @@ -90,10 +82,10 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : integer or "NaN", optional (default="NaN") + missing_values : real number, string, np.nan or None, + optional (default=np.nan). The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. For missing values encoded as np.nan, - use the string value "NaN". + `missing_values` will be imputed. strategy : string, optional (default="mean") The imputation strategy. @@ -104,7 +96,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): each column. - If "most_frequent", then replace missing using the most frequent value along each column. - - If "constant", then replace missing values with fill_value + - If "constant", then replace missing values with fill_value. 
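A short aside (not part of the patch) on why _get_mask above switches to the X != X comparison: np.isnan rejects object arrays, while NaN is the only value that compares unequal to itself on recent NumPy (the series later adds a fixes.py workaround for NumPy < 1.13):

    import numpy as np

    X = np.array([["a", np.nan], [np.nan, "b"]], dtype=object)
    # np.isnan(X) would raise TypeError on this object array, but:
    print(X != X)
    # [[False  True]
    #  [ True False]]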
fill_value : string or numerical value, optional (default=None) When strategy == "constant", fill_value is used to replace all @@ -132,10 +124,10 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Notes ----- Columns which only contained missing values at `fit` are discarded upon - `transform` is strategy is not "constant" + `transform` if strategy is not "constant" """ - def __init__(self, missing_values="NaN", strategy="mean", + def __init__(self, missing_values=np.nan, strategy="mean", fill_value=None, verbose=0, copy=True): self.missing_values = missing_values self.strategy = strategy @@ -151,21 +143,17 @@ def _validate_input(self, X): self.strategy)) if self.strategy in ("most_frequent", "constant"): - dtype = None + dtype = None else: dtype = FLOAT_DTYPES - if self.missing_values is None: - force_all_finite = "allow-nan" + if self.missing_values is not np.nan: + force_all_finite = True else: - if self.missing_values == "NaN" or np.isnan(self.missing_values): - force_all_finite = "allow-nan" - else: - force_all_finite = True + force_all_finite = "allow-nan" return check_array(X, accept_sparse='csc', dtype=dtype, - force_all_finite=force_all_finite) - + force_all_finite=force_all_finite, copy=self.copy) def fit(self, X, y=None): """Fit the imputer on X. @@ -182,13 +170,8 @@ def fit(self, X, y=None): """ X = self._validate_input(X) - if self.strategy == "constant": - if (X.dtype.kind in ("i", "f") - and not isinstance(self.fill_value, numbers.Real)): - raise ValueError( - "fill_value={0} is invalid. Expected a numerical value " - "to numerical data".format(self.fill_value)) - + # default missing_values is 0 for numerical input and "missing_value" + # otherwise if self.fill_value is None: if X.dtype.kind in ("i", "f"): fill_value = 0 @@ -197,6 +180,14 @@ def fit(self, X, y=None): else: fill_value = self.fill_value + # fill_value should be numerical in case of numerical input + if self.strategy == "constant": + if (X.dtype.kind in ("i", "f") + and not isinstance(fill_value, numbers.Real)): + raise ValueError( + "fill_value={0} is invalid. Expected a numerical value " + "to numerical data".format(fill_value)) + if sparse.issparse(X): self.statistics_ = self._sparse_fit(X, self.strategy, @@ -282,11 +273,11 @@ def _sparse_fit(self, X, strategy, missing_values, fill_value): n_zeros_axis[i]) return most_frequent - + # Constant elif strategy == "constant": - return np.full(X.shape[0], fill_value) + return np.full(X.shape[1], fill_value) def _dense_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on dense data.""" @@ -338,8 +329,7 @@ def _dense_fit(self, X, strategy, missing_values, fill_value): else: dtype = object - return np.full(X.shape[0], fill_value, dtype=dtype) - + return np.full(X.shape[1], fill_value, dtype=dtype) def transform(self, X): """Impute all missing values in X. 
@@ -364,16 +354,15 @@ def transform(self, X): else: invalid_mask = np.isnan(statistics) valid_mask = np.logical_not(invalid_mask) + valid_statistics = statistics[valid_mask] + valid_statistics_indexes = np.flatnonzero(valid_mask) if invalid_mask.any(): missing = np.arange(X.shape[1])[invalid_mask] if self.verbose: warnings.warn("Deleting features without " - "observed values: %s" % missing) - - valid_statistics = statistics[valid_mask] - valid_statistics_indexes = np.flatnonzero(valid_mask) - X = X[:, valid_statistics_indexes] + "observed values: %s" % missing) + X = X[:, valid_statistics_indexes] # Do actual imputation if sparse.issparse(X) and self.missing_values != 0: @@ -390,7 +379,6 @@ def transform(self, X): mask = _get_mask(X, self.missing_values) n_missing = np.sum(mask, axis=0) values = np.repeat(valid_statistics, n_missing) - coordinates = np.where(mask.transpose())[::-1] X[coordinates] = values @@ -409,10 +397,9 @@ class MICEImputer(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : int or "NaN", optional (default="NaN") + missing_values : int, np.nan, optional (default=np.nan) The placeholder for the missing values. All occurrences of - ``missing_values`` will be imputed. For missing values encoded as - np.nan, use the string value "NaN". + ``missing_values`` will be imputed. imputation_order : str, optional (default="ascending") The order in which the features will be imputed. Possible values: @@ -507,7 +494,7 @@ class MICEImputer(BaseEstimator, TransformerMixin): """ def __init__(self, - missing_values='NaN', + missing_values=np.nan, imputation_order='ascending', n_imputations=100, n_burn_in=10, @@ -757,10 +744,10 @@ def _initial_imputation(self, X): Input data's missing indicator matrix, where "n_samples" is the number of samples and "n_features" is the number of features. 
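An illustrative sketch (not part of the patch) of the force_all_finite="allow-nan" option relied on above: check_array then accepts NaN entries but still rejects infinite values:

    import numpy as np
    from sklearn.utils import check_array

    check_array(np.array([[1.0, np.nan]]), force_all_finite="allow-nan")  # accepted
    # check_array(np.array([[1.0, np.inf]]), force_all_finite="allow-nan")  # would raise ValueError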
""" + X = check_array(X, dtype=FLOAT_DTYPES, order="F", - force_all_finite='allow-nan' - if self.missing_values == 'NaN' - or np.isnan(self.missing_values) else True) + force_all_finite="allow-nan" + if self.missing_values is np.nan else True) mask_missing_values = _get_mask(X, self.missing_values) if self.initial_imputer_ is None: diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 9e788012b85c2..a9d2e189bf179 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -62,7 +62,7 @@ def test_imputation_shape(): X = np.random.randn(10, 2) X[::2] = np.nan - for strategy in ['mean', 'median', 'most_frequent']: + for strategy in ['mean', 'median', 'most_frequent', "constant"]: imputer = SimpleImputer(strategy=strategy) X_imputed = imputer.fit_transform(sparse.csr_matrix(X)) assert X_imputed.shape == (10, 2) @@ -99,9 +99,10 @@ def test_imputation_mean_median(): values = np.arange(1, shape[0] + 1) values[4::2] = - values[4::2] - tests = [("mean", "NaN", lambda z, v, p: safe_mean(np.hstack((z, v)))), + tests = [("mean", np.nan, lambda z, v, p: safe_mean(np.hstack((z, v)))), ("mean", 0, lambda z, v, p: np.mean(v)), - ("median", "NaN", lambda z, v, p: safe_median(np.hstack((z, v)))), + ("median", np.nan, + lambda z, v, p: safe_median(np.hstack((z, v)))), ("median", 0, lambda z, v, p: np.median(v))] for strategy, test_missing_values, true_value_fun in tests: @@ -182,7 +183,7 @@ def test_imputation_median_special_cases(): statistics_median = [0, 5, 0, -2.5, 2.5, 4.5, -4.5, .5] _check_statistics(X, X_imputed_median, "median", - statistics_median, 'NaN') + statistics_median, np.nan) def test_imputation_most_frequent(): @@ -225,7 +226,7 @@ def test_imputation_constant_integer(): [8, 9, 0, 0] ]) - imputer = SimpleImputer(missing_values=-1, strategy="constant", + imputer = SimpleImputer(missing_values=-1, strategy="constant", fill_value=0) X_trans = imputer.fit(X).transform(X) @@ -251,7 +252,7 @@ def test_imputation_constant_float(): [5.5, 6.6, 0, 0], [7.7, 8.8, 9.9, 0] ]) - + X_true = sparse.csr_matrix(X_true) if format == "csr" else X_true imputer = SimpleImputer(strategy="constant", fill_value=0) @@ -266,48 +267,26 @@ def test_imputation_constant_float(): def test_imputation_constant_object(): # Test imputation using the constant strategy # on objects - X = np.array([ - [None, "a", "b", None], - ["c", None, "d", None], - ["e", "f", None, None], - ["g", "h", "i", None] - ], dtype=object) - - X_true = np.array([ - ["missing", "a", "b", "missing"], - ["c", "missing", "d", "missing"], - ["e", "f", "missing", "missing"], - ["g", "h", "i", "missing"] - ], dtype=object) - - imputer = SimpleImputer(missing_values=None, strategy="constant", - fill_value="missing") - X_trans = imputer.fit(X).transform(X) - - assert_array_equal(X_trans, X_true) - - -def test_imputation_constant_object_nan(): - # Test imputation using the constant strategy - # on objects - X = np.array([ - [np.nan, "a", "b", np.nan], - ["c", np.nan, "d", np.nan], - ["e", "f", np.nan, np.nan], - ["g", "h", "i", np.nan] - ], dtype=object) + for marker in (None, np.nan, "NAN", 0): + X = np.array([ + [marker, "a", "b", marker], + ["c", marker, "d", marker], + ["e", "f", marker, marker], + ["g", "h", "i", marker] + ], dtype=object) - X_true = np.array([ - ["missing_value", "a", "b", "missing_value"], - ["c", "missing_value", "d", "missing_value"], - ["e", "f", "missing_value", "missing_value"], - ["g", "h", "i", "missing_value"] - ], dtype=object) + X_true = np.array([ + ["missing", "a", "b", "missing"], 
+ ["c", "missing", "d", "missing"], + ["e", "f", "missing", "missing"], + ["g", "h", "i", "missing"] + ], dtype=object) - imputer = SimpleImputer(strategy="constant") - X_trans = imputer.fit(X).transform(X) + imputer = SimpleImputer(missing_values=marker, strategy="constant", + fill_value="missing") + X_trans = imputer.fit(X).transform(X) - assert_array_equal(X_trans, X_true) + assert_array_equal(X_trans, X_true) def test_imputation_constant_pandas(): @@ -331,7 +310,7 @@ def test_imputation_constant_pandas(): ], dtype=object) imputer = SimpleImputer(strategy="constant", fill_value="missing") - X_trans = imputer.fit(df).transform(df) + X_trans = imputer.fit(df).transform(df) assert_array_equal(X_trans, X_true) From ea4a929358f1dd309cbd535520061421dc09b2e6 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 7 Jun 2018 17:35:54 +0200 Subject: [PATCH 07/31] object dtypes support for "most_frequent" strategy in SimpleImputer --- sklearn/impute.py | 13 +++++++++---- sklearn/tests/test_impute.py | 27 ++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 390b2abcb9879..31f1e652d2425 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -41,7 +41,8 @@ def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" if value_to_mask is np.nan: - # nan values are never equal to themselves + # nan values are never equal to themselves. We use this trick because + # np.isnan does not work on object dtypes. return X != X else: return X == value_to_mask @@ -82,7 +83,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : real number, string, np.nan or None, + missing_values : real number, string, np.nan or None, \ optional (default=np.nan). The placeholder for the missing values. All occurrences of `missing_values` will be imputed. @@ -313,7 +314,10 @@ def _dense_fit(self, X, strategy, missing_values, fill_value): X = X.transpose() mask = mask.transpose() - most_frequent = np.empty(X.shape[0]) + if X.dtype.kind == "O": + most_frequent = np.empty(X.shape[0], dtype=object) + else: + most_frequent = np.empty(X.shape[0]) for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): row_mask = np.logical_not(row_mask).astype(np.bool) @@ -352,7 +356,8 @@ def transform(self, X): if self.strategy == "constant": valid_statistics = statistics else: - invalid_mask = np.isnan(statistics) + # same as np.isnan but also works for object dtypes + invalid_mask = statistics != statistics valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.flatnonzero(valid_mask) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index a9d2e189bf179..035bb3c2923ba 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -34,6 +34,7 @@ def _check_statistics(X, X_true, "sparse = {0}" % (strategy, missing_values) assert_ae = assert_array_equal + if X.dtype.kind == 'f' or X_true.dtype.kind == 'f': assert_ae = assert_array_almost_equal @@ -209,6 +210,30 @@ def test_imputation_most_frequent(): _check_statistics(X, X_true, "most_frequent", [np.nan, 2, 3, 3], -1) +def test_imputation_most_frequent_objects(): + # Test imputation using the most-frequent strategy. 
+ for marker in (None, np.nan, "NAN", "", 0): + X = np.array([ + [marker, marker, "a", "f"], + [marker, "c", marker, "d"], + [marker, "b", "d", marker], + [marker, "c", "d", "h"], + ], dtype=object) + + X_true = np.array([ + ["c", "a", "f"], + ["c", "d", "d"], + ["b", "d", "d"], + ["c", "d", "h"], + ], dtype=object) + + imputer = SimpleImputer(missing_values=marker, + strategy="most_frequent") + X_trans = imputer.fit(X).transform(X) + + assert_array_equal(X_trans, X_true) + + def test_imputation_constant_integer(): # Test imputation using the constant strategy # on integers @@ -267,7 +292,7 @@ def test_imputation_constant_float(): def test_imputation_constant_object(): # Test imputation using the constant strategy # on objects - for marker in (None, np.nan, "NAN", 0): + for marker in (None, np.nan, "NAN", "", 0): X = np.array([ [marker, "a", "b", marker], ["c", marker, "d", marker], From 10f165b6d64da70ad0447c7506c549f999ac28ab Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 11 Jun 2018 09:49:39 +0200 Subject: [PATCH 08/31] minor fixes regarding the change of default missing_values="NaN" to np.nan --- doc/modules/impute.rst | 14 +++++++++----- sklearn/impute.py | 16 ++++++++-------- sklearn/model_selection/tests/test_search.py | 2 +- sklearn/model_selection/tests/test_validation.py | 4 ++-- sklearn/tests/test_impute.py | 12 ++++-------- sklearn/utils/estimator_checks.py | 1 + 6 files changed, 25 insertions(+), 24 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index f161825105975..a28e0d4b47e38 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -20,9 +20,10 @@ Univariate feature imputation ============================= The :class:`SimpleImputer` class provides basic strategies for imputing missing -values, either using the mean, the median or the most frequent value of -the row or column in which the missing values are located. This class -also allows for different missing values encodings. +values. Missing values can be imputed with a provided value, or using the +statistics (mean, median or most frequent) of each column in which the missing +values are located. This class also allows for different missing values +encodings. The following snippet demonstrates how to replace missing values, encoded as ``np.nan``, using the mean value of the columns (axis 0) @@ -30,9 +31,9 @@ that contain the missing values:: >>> import numpy as np >>> from sklearn.impute import SimpleImputer - >>> imp = SimpleImputer(missing_values='NaN', strategy='mean') + >>> imp = SimpleImputer(missing_values=np.nan, strategy='mean') >>> imp.fit([[1, 2], [np.nan, 3], [7, 6]]) # doctest: +NORMALIZE_WHITESPACE - SimpleImputer(copy=True, missing_values='NaN', strategy='mean', verbose=0) + SimpleImputer(copy=True, missing_values=nan, strategy='mean', verbose=0) >>> X = [[np.nan, 2], [6, np.nan], [7, 6]] >>> print(imp.transform(X)) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS [[4. 2. ] @@ -52,6 +53,9 @@ The :class:`SimpleImputer` class also supports sparse matrices:: [6. 3.666...] [7. 6. ]] +Object + + Note that, here, missing values are encoded by 0 and are thus implicitly stored in the matrix. This format is thus suitable when there are many more missing values than observed values. 
diff --git a/sklearn/impute.py b/sklearn/impute.py index 31f1e652d2425..c2832a35c0c48 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -185,7 +185,7 @@ def fit(self, X, y=None): if self.strategy == "constant": if (X.dtype.kind in ("i", "f") and not isinstance(fill_value, numbers.Real)): - raise ValueError( + raise TypeError( "fill_value={0} is invalid. Expected a numerical value " "to numerical data".format(fill_value)) @@ -244,7 +244,7 @@ def _sparse_fit(self, X, strategy, missing_values, fill_value): with np.errstate(all="ignore"): return np.ravel(sums) / np.ravel(n_non_missing) - # Median + Most frequent + # Median + Most frequent + Constant else: # Remove the missing values, for each column columns_all = np.hsplit(X.data, X.indptr[1:-1]) @@ -277,7 +277,6 @@ def _sparse_fit(self, X, strategy, missing_values, fill_value): # Constant elif strategy == "constant": - return np.full(X.shape[1], fill_value) def _dense_fit(self, X, strategy, missing_values, fill_value): @@ -328,12 +327,12 @@ def _dense_fit(self, X, strategy, missing_values, fill_value): # Constant elif strategy == "constant": - if isinstance(fill_value, numbers.Real): + """if isinstance(fill_value, numbers.Real): dtype = None else: dtype = object - - return np.full(X.shape[1], fill_value, dtype=dtype) + """ + return np.full(X.shape[1], fill_value, dtype=X.dtype) def transform(self, X): """Impute all missing values in X. @@ -749,10 +748,11 @@ def _initial_imputation(self, X): Input data's missing indicator matrix, where "n_samples" is the number of samples and "n_features" is the number of features. """ + force_all_finite = "allow-nan" if self.missing_values is np.nan \ + else True X = check_array(X, dtype=FLOAT_DTYPES, order="F", - force_all_finite="allow-nan" - if self.missing_values is np.nan else True) + force_all_finite=force_all_finite) mask_missing_values = _get_mask(X, self.missing_values) if self.initial_imputer_ is None: diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index f436c7b55cf36..876a5af11fe3e 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1313,7 +1313,7 @@ def test_grid_search_allows_nans(): X[2, :] = np.nan y = [0, 0, 1, 1, 1] p = Pipeline([ - ('imputer', SimpleImputer(strategy='mean', missing_values='NaN')), + ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)), ('classifier', MockClassifier()), ]) GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 2929916619769..92d3b5988629c 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -744,7 +744,7 @@ def test_permutation_test_score_allow_nans(): X[2, :] = np.nan y = np.repeat([0, 1], X.shape[0] / 2) p = Pipeline([ - ('imputer', SimpleImputer(strategy='mean', missing_values='NaN')), + ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)), ('classifier', MockClassifier()), ]) permutation_test_score(p, X, y, cv=5) @@ -756,7 +756,7 @@ def test_cross_val_score_allow_nans(): X[2, :] = np.nan y = np.repeat([0, 1], X.shape[0] / 2) p = Pipeline([ - ('imputer', SimpleImputer(strategy='mean', missing_values='NaN')), + ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)), ('classifier', MockClassifier()), ]) cross_val_score(p, X, y, cv=5) diff --git a/sklearn/tests/test_impute.py 
b/sklearn/tests/test_impute.py index 035bb3c2923ba..8522f6f50b1cd 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -235,8 +235,7 @@ def test_imputation_most_frequent_objects(): def test_imputation_constant_integer(): - # Test imputation using the constant strategy - # on integers + # Test imputation using the constant strategy on integers X = np.array([ [-1, 2, 3, -1], [4, -1, 5, -1], @@ -259,8 +258,7 @@ def test_imputation_constant_integer(): def test_imputation_constant_float(): - # Test imputation using the constant strategy - # on floats + # Test imputation using the constant strategy on floats for format in ["csr", "array"]: X = np.array([ [np.nan, 1.1, 2.2, np.nan], @@ -290,8 +288,7 @@ def test_imputation_constant_float(): def test_imputation_constant_object(): - # Test imputation using the constant strategy - # on objects + # Test imputation using the constant strategy on objects for marker in (None, np.nan, "NAN", "", 0): X = np.array([ [marker, "a", "b", marker], @@ -315,8 +312,7 @@ def test_imputation_constant_object(): def test_imputation_constant_pandas(): - # Test imputation using the constant strategy - # on pandas df + # Test imputation using the constant strategy on pandas df pd = pytest.importorskip("pandas") for dtype in [object, "category"]: diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5e4c454f4b1ab..d57937cfe944b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1605,6 +1605,7 @@ def check_classifiers_predictions(X, y, name, classifier_orig): def choose_check_classifiers_labels(name, y, y_names): return y if name in ["LabelPropagation", "LabelSpreading"] else y_names + def check_classifiers_classes(name, classifier_orig): X_multiclass, y_multiclass = make_blobs(n_samples=30, random_state=0, cluster_std=0.1) From a6c33b1a69c3cafc091311776578588034f29784 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 11 Jun 2018 14:40:12 +0200 Subject: [PATCH 09/31] Changed the test in estimator_check to allow np.nan as default value in constructor ; + minor corrections --- sklearn/impute.py | 22 +++++++++++++++++----- sklearn/tests/test_impute.py | 18 ++++++++++++++++++ sklearn/utils/estimator_checks.py | 12 +++++++++++- 3 files changed, 46 insertions(+), 6 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index c2832a35c0c48..f87cdf2d67902 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -84,7 +84,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Parameters ---------- missing_values : real number, string, np.nan or None, \ - optional (default=np.nan). +optional (default=np.nan). The placeholder for the missing values. All occurrences of `missing_values` will be imputed. @@ -183,11 +183,23 @@ def fit(self, X, y=None): # fill_value should be numerical in case of numerical input if self.strategy == "constant": - if (X.dtype.kind in ("i", "f") - and not isinstance(fill_value, numbers.Real)): + if X.dtype.kind in ("i", "f"): + if not isinstance(fill_value, numbers.Real): + raise TypeError( + "fill_value={0} is invalid. Expected a numerical value" + " to numerical data".format(fill_value)) + + elif X.dtype.kind == "O": + if not isinstance(fill_value, six.string_types): + raise TypeError( + "fill_value={0} is invalid. Expected an str instance " + "when imputing categorical data.".format(fill_value)) + + else: raise TypeError( - "fill_value={0} is invalid. 
Expected a numerical value " - "to numerical data".format(fill_value)) + "SimpleImputer cannot work on data with dtype={0}: " + "expecting numerical or categorical data with " + "dtype=object.".format(X.dtype)) if sparse.issparse(X): self.statistics_ = self._sparse_fit(X, diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 8522f6f50b1cd..6c63b79de6bf2 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -75,6 +75,24 @@ def test_imputation_shape(): assert X_imputed.shape == (10, 2) +def test_imputation_valid_types(): + # Verify that exceptions are raised on invalid inputs + tests = [(1, 0, "fill_value", None), + (1., np.nan, "fill_value", None), + ("a", "", 0, object), + (True, "nan", "fill_value", "c")] + + for X_data, missing_value, fill_value, dtype in tests: + X = np.full((3, 5), X_data, dtype=dtype) + X[0, 0] = missing_value + + with pytest.raises(TypeError): + imputer = SimpleImputer(missing_values=missing_value, + strategy="constant", + fill_value=fill_value) + imputer.fit(X).transform(X) + + def safe_median(arr, *args, **kwargs): # np.median([]) raises a TypeError for numpy >= 1.10.1 length = arr.size if hasattr(arr, 'size') else len(arr) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d57937cfe944b..b8b81a67d4a95 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2001,11 +2001,17 @@ def param_filter(p): init_params = [p for p in signature(init).parameters.values() if param_filter(p)] + print("init_params:") + print(init_params) + print() except (TypeError, ValueError): # init is not a python function. # true for mixins return params = estimator.get_params() + print("params:") + print(params) + print() if name in META_ESTIMATORS: # they can need a non-default argument init_params = init_params[1:] @@ -2031,7 +2037,11 @@ def param_filter(p): if isinstance(param_value, np.ndarray): assert_array_equal(param_value, init_param.default) else: - assert_equal(param_value, init_param.default, init_param.name) + # Allows to set default parameters to np.nan + if (param_value is not np.nan or + init_param.default is not np.nan): + assert_equal(param_value, init_param.default, + init_param.name) def multioutput_estimator_convert_y_2d(estimator, y): From 9c2a407de7b7202c76af318824e12a8ce9b51dd5 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 11 Jun 2018 15:51:57 +0200 Subject: [PATCH 10/31] fix for older versions of numpy --- sklearn/impute.py | 4 +++- sklearn/tests/test_impute.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index f87cdf2d67902..2dcca8ec968d7 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -45,7 +45,9 @@ def _get_mask(X, value_to_mask): # np.isnan does not work on object dtypes. return X != X else: - return X == value_to_mask + # X == value_to_mask with object dytpes does not always perform + # element-wise for old versions of numpy + return np.equal(X, value_to_mask) def _most_frequent(array, extra_value, n_repeat): diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 6c63b79de6bf2..19d7e0a588c43 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -230,7 +230,7 @@ def test_imputation_most_frequent(): def test_imputation_most_frequent_objects(): # Test imputation using the most-frequent strategy. 
- for marker in (None, np.nan, "NAN", "", 0): + for marker in (np.nan, "NAN", "", 0): X = np.array([ [marker, marker, "a", "f"], [marker, "c", marker, "d"], From df8608ba545fb154b80a69144d6f06685d24942b Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 11 Jun 2018 15:59:33 +0200 Subject: [PATCH 11/31] . --- sklearn/tests/test_impute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 19d7e0a588c43..6c63b79de6bf2 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -230,7 +230,7 @@ def test_imputation_most_frequent(): def test_imputation_most_frequent_objects(): # Test imputation using the most-frequent strategy. - for marker in (np.nan, "NAN", "", 0): + for marker in (None, np.nan, "NAN", "", 0): X = np.array([ [marker, marker, "a", "f"], [marker, "c", marker, "d"], From 45176876ab7a18eb6cb1efac390c91568b0bf35b Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 12 Jun 2018 17:48:24 +0200 Subject: [PATCH 12/31] fix for old versions of numpy v2 --- sklearn/impute.py | 25 +++++++++++++++---------- sklearn/utils/estimator_checks.py | 8 ++------ sklearn/utils/fixes.py | 16 ++++++++++++++++ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 2dcca8ec968d7..9a145b63515ea 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -22,6 +22,7 @@ from .utils.sparsefuncs import _get_median from .utils.validation import check_is_fitted from .utils.validation import FLOAT_DTYPES +from .utils.fixes import custom_isnan from .externals import six @@ -38,12 +39,20 @@ ] +def _custom_isnan(x): + # np.nan is never equal to np.nan. Return true only if x is np.nan + return x != x + + def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" if value_to_mask is np.nan: - # nan values are never equal to themselves. We use this trick because - # np.isnan does not work on object dtypes. - return X != X + if X.dtype.kind in ("i", "u", "f"): + return np.isnan(X) + else: + # np.isnan does not work on object dtypes. + return custom_isnan(X) + else: # X == value_to_mask with object dytpes does not always perform # element-wise for old versions of numpy @@ -185,7 +194,7 @@ def fit(self, X, y=None): # fill_value should be numerical in case of numerical input if self.strategy == "constant": - if X.dtype.kind in ("i", "f"): + if X.dtype.kind in ("i", "u", "f"): if not isinstance(fill_value, numbers.Real): raise TypeError( "fill_value={0} is invalid. 
Expected a numerical value" @@ -341,11 +350,6 @@ def _dense_fit(self, X, strategy, missing_values, fill_value): # Constant elif strategy == "constant": - """if isinstance(fill_value, numbers.Real): - dtype = None - else: - dtype = object - """ return np.full(X.shape[1], fill_value, dtype=X.dtype) def transform(self, X): @@ -361,6 +365,7 @@ def transform(self, X): X = self._validate_input(X) statistics = self.statistics_ + if X.shape[1] != statistics.shape[0]: raise ValueError("X has %d features per sample, expected %d" % (X.shape[1], self.statistics_.shape[0])) @@ -370,7 +375,7 @@ def transform(self, X): valid_statistics = statistics else: # same as np.isnan but also works for object dtypes - invalid_mask = statistics != statistics + invalid_mask = _get_mask(statistics, np.nan) valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.flatnonzero(valid_mask) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index b8b81a67d4a95..8425b98980a84 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2001,17 +2001,13 @@ def param_filter(p): init_params = [p for p in signature(init).parameters.values() if param_filter(p)] - print("init_params:") - print(init_params) - print() + except (TypeError, ValueError): # init is not a python function. # true for mixins return params = estimator.get_params() - print("params:") - print(params) - print() + if name in META_ESTIMATORS: # they can need a non-default argument init_params = init_params[1:] diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index f7d9d6a29f9f6..588cf1de182e2 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -334,3 +334,19 @@ def nanpercentile(a, q): return np.array([np.nan] * size_q) else: from numpy import nanpercentile # noqa + + +# Fix for behavior inconsistency on numpy.equal for object dtypes. 
+# For numpy versions < 1.13, numpy.equal tests identity of objects instead of +# equality + +test_array = np.array([np.nan], dtype=object) +test_mask = test_array != test_array + +if np.array_equal(test_mask, np.array([True])): + def custom_isnan(X): + return X != X + +else: + def custom_isnan(X): + return np.frompyfunc(lambda x: x != x, 1, 1)(X).astype(bool) From 1f1c6a01537b39cea3fce55ac481d34e40de149b Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 13 Jun 2018 16:37:44 +0200 Subject: [PATCH 13/31] minor fixes and added doc example for categorical inputs --- doc/conftest.py | 9 ++++++ doc/modules/impute.rst | 30 ++++++++++++----- sklearn/impute.py | 11 ++----- sklearn/tests/test_impute.py | 53 ++++++++++++++++++++++++------- sklearn/utils/estimator_checks.py | 9 ++++++ sklearn/utils/fixes.py | 10 +++--- 6 files changed, 90 insertions(+), 32 deletions(-) diff --git a/doc/conftest.py b/doc/conftest.py index 158fff5830acf..f6fe644583730 100644 --- a/doc/conftest.py +++ b/doc/conftest.py @@ -62,6 +62,13 @@ def setup_compose(): raise SkipTest("Skipping compose.rst, pandas not installed") +def setup_impute(): + try: + import pandas # noqa + except ImportError: + raise SkipTest("Skipping impute.rst, pandas not installed") + + def pytest_runtest_setup(item): fname = item.fspath.strpath if fname.endswith('datasets/labeled_faces.rst'): @@ -76,6 +83,8 @@ def pytest_runtest_setup(item): setup_working_with_text_data() elif fname.endswith('modules/compose.rst'): setup_compose() + elif fname.endswith('modules/impute.rst'): + setup_impute() def pytest_runtest_teardown(item): diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index a28e0d4b47e38..2667b123e5fdc 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -20,9 +20,9 @@ Univariate feature imputation ============================= The :class:`SimpleImputer` class provides basic strategies for imputing missing -values. Missing values can be imputed with a provided value, or using the -statistics (mean, median or most frequent) of each column in which the missing -values are located. This class also allows for different missing values +values. Missing values can be imputed with a provided constant value, or using +the statistics (mean, median or most frequent) of each column in which the +missing values are located. This class also allows for different missing values encodings. The following snippet demonstrates how to replace missing values, @@ -33,7 +33,7 @@ that contain the missing values:: >>> from sklearn.impute import SimpleImputer >>> imp = SimpleImputer(missing_values=np.nan, strategy='mean') >>> imp.fit([[1, 2], [np.nan, 3], [7, 6]]) # doctest: +NORMALIZE_WHITESPACE - SimpleImputer(copy=True, missing_values=nan, strategy='mean', verbose=0) + SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean', verbose=0) >>> X = [[np.nan, 2], [6, np.nan], [7, 6]] >>> print(imp.transform(X)) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS [[4. 2. ] @@ -46,20 +46,34 @@ The :class:`SimpleImputer` class also supports sparse matrices:: >>> X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]]) >>> imp = SimpleImputer(missing_values=0, strategy='mean') >>> imp.fit(X) # doctest: +NORMALIZE_WHITESPACE - SimpleImputer(copy=True, missing_values=0, strategy='mean', verbose=0) + SimpleImputer(copy=True, fill_value=None, missing_values=0, strategy='mean', verbose=0) >>> X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]]) >>> print(imp.transform(X_test)) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS [[4. 2. ] [6. 3.666...] 
[7. 6. ]] -Object - - Note that, here, missing values are encoded by 0 and are thus implicitly stored in the matrix. This format is thus suitable when there are many more missing values than observed values. +The :class:`SimpleImputer` class also supports categorical datas represented as +string values or pandas categoricals when using the "most_frequent" or +"constant" strategy:: + + >>> import pandas as pd + >>> df = pd.DataFrame([["a", "x"], + ... ["", "y"], + ... ["a", ""], + ... ["b", "y"]], dtype="category") + ... + >>> imp = SimpleImputer(missing_values="", strategy="most_frequent") + >>> print(imp.fit_transform(df)) # doctest: +NORMALIZE_WHITESPACE + [['a' 'x'] + ['a' 'y'] + ['a' 'y'] + ['b' 'y']] + .. _mice: Multivariate feature imputation diff --git a/sklearn/impute.py b/sklearn/impute.py index 9a145b63515ea..f4cb476b547f9 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -22,7 +22,7 @@ from .utils.sparsefuncs import _get_median from .utils.validation import check_is_fitted from .utils.validation import FLOAT_DTYPES -from .utils.fixes import custom_isnan +from .utils.fixes import _compat_isnan from .externals import six @@ -39,11 +39,6 @@ ] -def _custom_isnan(x): - # np.nan is never equal to np.nan. Return true only if x is np.nan - return x != x - - def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" if value_to_mask is np.nan: @@ -51,7 +46,7 @@ def _get_mask(X, value_to_mask): return np.isnan(X) else: # np.isnan does not work on object dtypes. - return custom_isnan(X) + return _compat_isnan(X) else: # X == value_to_mask with object dytpes does not always perform @@ -198,7 +193,7 @@ def fit(self, X, y=None): if not isinstance(fill_value, numbers.Real): raise TypeError( "fill_value={0} is invalid. Expected a numerical value" - " to numerical data".format(fill_value)) + " when imputing numerical data".format(fill_value)) elif X.dtype.kind == "O": if not isinstance(fill_value, six.string_types): diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 6c63b79de6bf2..83d78793729dd 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -5,6 +5,8 @@ import numpy as np from scipy import sparse +import io + from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal @@ -252,6 +254,33 @@ def test_imputation_most_frequent_objects(): assert_array_equal(X_trans, X_true) +def test_imputation_most_frequent_pandas(): + # Test imputation using the most frequent strategy on pandas df + pd = pytest.importorskip("pandas") + + f = io.StringIO(u"Cat1,Cat2,Cat3,Cat4\n" + ",i,x,\n" + "a,,y,\n" + "a,j,,\n" + "b,j,x,") + + for dtype in (object, "category"): + df = pd.read_csv(f, dtype=dtype) + f.seek(0) + + X_true = np.array([ + ["a", "i", "x"], + ["a", "j", "y"], + ["a", "j", "x"], + ["b", "j", "x"] + ], dtype=object) + + imputer = SimpleImputer(strategy="most_frequent") + X_trans = imputer.fit(df).transform(df) + + assert_array_equal(X_trans, X_true) + + def test_imputation_constant_integer(): # Test imputation using the constant strategy on integers X = np.array([ @@ -333,19 +362,21 @@ def test_imputation_constant_pandas(): # Test imputation using the constant strategy on pandas df pd = pytest.importorskip("pandas") - for dtype in [object, "category"]: - df = pd.DataFrame([ - [np.nan, "a", "b", np.nan], - ["c", np.nan, "d", np.nan], - ["e", "f", np.nan, np.nan], - ["g", "h", "i", np.nan] - ], dtype=dtype) + f = 
io.StringIO(u"Cat1,Cat2,Cat3,Cat4\n" + ",i,x,\n" + "a,,y,\n" + "a,j,,\n" + "b,j,x,") + + for dtype in (object, "category"): + df = pd.read_csv(f, dtype=dtype) + f.seek(0) X_true = np.array([ - ["missing", "a", "b", "missing"], - ["c", "missing", "d", "missing"], - ["e", "f", "missing", "missing"], - ["g", "h", "i", "missing"] + ["missing", "i", "x", "missing"], + ["a", "missing", "y", "missing"], + ["a", "j", "missing", "missing"], + ["b", "j", "x", "missing"] ], dtype=object) imputer = SimpleImputer(strategy="constant", fill_value="missing") diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 8425b98980a84..6120d1cefd7ea 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -8,6 +8,7 @@ from copy import deepcopy import struct from functools import partial +import numbers import numpy as np from scipy import sparse @@ -2039,6 +2040,14 @@ def param_filter(p): assert_equal(param_value, init_param.default, init_param.name) + def _isscalarnan(x): + return isinstance(x, numbers.Real) and np.isnan(x) + + if _isscalarnan(param_value): + assert param_value is init_param.default, init_param.name + else: + assert param_value == init_param.default, init_param.name + def multioutput_estimator_convert_y_2d(estimator, y): # Estimators in mono_output_task_error raise ValueError if y is of 1-D diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 588cf1de182e2..ba61c54778948 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -340,13 +340,13 @@ def nanpercentile(a, q): # For numpy versions < 1.13, numpy.equal tests identity of objects instead of # equality -test_array = np.array([np.nan], dtype=object) -test_mask = test_array != test_array +_nan_object_array = np.array([np.nan], dtype=object) +_nan_object_mask = _nan_object_array != _nan_object_array -if np.array_equal(test_mask, np.array([True])): - def custom_isnan(X): +if np.array_equal(_nan_object_mask, np.array([True])): + def _compat_isnan(X): return X != X else: - def custom_isnan(X): + def _compat_isnan(X): return np.frompyfunc(lambda x: x != x, 1, 1)(X).astype(bool) From 2f6d0b1e857b84c2fece45477a7914f2d496abe3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 13 Jun 2018 18:06:20 +0200 Subject: [PATCH 14/31] DOCTEST fix printing estimator --- doc/modules/impute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 2667b123e5fdc..49e5fbef40fb6 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -94,7 +94,7 @@ Here is an example snippet:: >>> imp = MICEImputer(n_imputations=10, random_state=0) >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) MICEImputer(imputation_order='ascending', initial_strategy='mean', - max_value=None, min_value=None, missing_values='NaN', n_burn_in=10, + max_value=None, min_value=None, missing_values=nan, n_burn_in=10, n_imputations=10, n_nearest_features=None, predictor=None, random_state=0, verbose=False) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] From 3884d4e195e3c1552ee177e8f14e1177ff1f90cd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 13 Jun 2018 18:34:32 +0200 Subject: [PATCH 15/31] EXA fix example using constant strategy --- ...=> plot_column_transformer_mixed_types.py} | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) rename examples/compose/{column_transformer_mixed_types.py => plot_column_transformer_mixed_types.py} (74%) diff --git 
a/examples/compose/column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py similarity index 74% rename from examples/compose/column_transformer_mixed_types.py rename to examples/compose/plot_column_transformer_mixed_types.py index d5767ad231452..95fd7aa6bef34 100644 --- a/examples/compose/column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -27,14 +27,16 @@ from __future__ import print_function import pandas as pd +import numpy as np -from sklearn.compose import make_column_transformer -from sklearn.pipeline import make_pipeline +from sklearn.compose import ColumnTransformer +from sklearn.pipeline import Pipeline from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler, CategoricalEncoder from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split, GridSearchCV +np.random.seed(0) # Read data from Titanic dataset. titanic_url = ('https://raw.githubusercontent.com/amueller/' @@ -49,27 +51,27 @@ # - embarked: categories encoded as strings {'C', 'S', 'Q'}. # - sex: categories encoded as strings {'female', 'male'}. # - pclass: ordinal integers {1, 2, 3}. -numeric_features = ['age', 'fare'] -categorical_features = ['embarked', 'sex', 'pclass'] - -# Provisionally, use pd.fillna() to impute missing values for categorical -# features; SimpleImputer will eventually support strategy="constant". -data[categorical_features] = data[categorical_features].fillna(value='missing') # We create the preprocessing pipelines for both numeric and categorical data. -numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler()) -categorical_transformer = CategoricalEncoder('onehot-dense', - handle_unknown='ignore') +numeric_features = ['age', 'fare'] +numeric_transformer = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='median')), + ('scaler', StandardScaler())]) + +categorical_features = ['embarked', 'sex', 'pclass'] +categorical_transformer = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), + ('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))]) -preprocessing_pl = make_column_transformer( - (numeric_features, numeric_transformer), - (categorical_features, categorical_transformer), - remainder='drop' -) +preprocessor = ColumnTransformer(transformers=[ + ('num', numeric_transformer, numeric_features), + ('cat', categorical_transformer, categorical_features)], + remainder='drop') # Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. 
-clf = make_pipeline(preprocessing_pl, LogisticRegression()) +clf = Pipeline(steps=[('preprocessor', preprocessor), + ('classifier', LogisticRegression())]) X = data.drop('survived', axis=1) y = data.survived.values @@ -78,7 +80,7 @@ shuffle=True) clf.fit(X_train, y_train) -print("model score: %f" % clf.score(X_test, y_test)) +print("model score: %.3f" % clf.score(X_test, y_test)) ############################################################################### @@ -93,12 +95,12 @@ param_grid = { - 'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'], - 'logisticregression__C': [0.1, 1.0, 1.0], + 'preprocessor__num__imputer__strategy': ['mean', 'median'], + 'classifier__C': [0.1, 1.0, 10, 100], } grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False) grid_search.fit(X_train, y_train) -print(("best logistic regression from grid search: %f" +print(("best logistic regression from grid search: %.3f" % grid_search.score(X_test, y_test))) From 72eb6b5def911a85531147ebe8e19687c90d440e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 13 Jun 2018 18:37:11 +0200 Subject: [PATCH 16/31] COSMIT --- examples/compose/plot_column_transformer_mixed_types.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 95fd7aa6bef34..1847a40034fee 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -63,10 +63,11 @@ ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), ('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))]) -preprocessor = ColumnTransformer(transformers=[ - ('num', numeric_transformer, numeric_features), - ('cat', categorical_transformer, categorical_features)], - remainder='drop') +preprocessor = ColumnTransformer( + transformers=[ + ('num', numeric_transformer, numeric_features), + ('cat', categorical_transformer, categorical_features)], + remainder='drop') # Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. 
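
For reference, a minimal sketch of what the constant-fill step in the categorical pipeline above produces on its own, assuming a scikit-learn build that already includes the SimpleImputer changes from this patch series (the toy column values are illustrative only; the "missing" token matches the fill_value used in the example):

    import numpy as np
    from sklearn.impute import SimpleImputer

    # Categorical columns with np.nan as the missing-value marker.
    X = np.array([["S", "male"],
                  [np.nan, "female"],
                  ["C", np.nan]], dtype=object)

    # Every missing entry is replaced by the same constant token, which a
    # downstream one-hot encoder then treats as one more category.
    imp = SimpleImputer(strategy="constant", fill_value="missing")
    print(imp.fit_transform(X))
    # [['S' 'male']
    #  ['missing' 'female']
    #  ['C' 'missing']]

With the step names chosen in the example ('preprocessor', 'cat', 'imputer'), the same fill value could also be reached from the grid search through the nested key 'preprocessor__cat__imputer__fill_value', following the usual double-underscore parameter convention.
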
From cc9aa6f20b78b84f5ba27620f25ef02880a63624 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 13 Jun 2018 18:44:28 +0200 Subject: [PATCH 17/31] COSMIT --- examples/compose/plot_column_transformer_mixed_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 1847a40034fee..f1c7d146c9643 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -75,7 +75,7 @@ ('classifier', LogisticRegression())]) X = data.drop('survived', axis=1) -y = data.survived.values +y = data['survived'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True) From d4e52264fef23ae252f16df0e65152f5419346e7 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 14 Jun 2018 11:55:28 +0200 Subject: [PATCH 18/31] adressed @glemaitre remarks --- doc/modules/impute.rst | 4 +- sklearn/impute.py | 11 +- sklearn/tests/test_impute.py | 215 +++++++++++++++--------------- sklearn/utils/estimator_checks.py | 7 +- sklearn/utils/fixes.py | 5 +- 5 files changed, 122 insertions(+), 120 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 49e5fbef40fb6..45b866a7123ab 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -58,8 +58,8 @@ in the matrix. This format is thus suitable when there are many more missing values than observed values. The :class:`SimpleImputer` class also supports categorical datas represented as -string values or pandas categoricals when using the "most_frequent" or -"constant" strategy:: +string values or pandas categoricals when using the ``most_frequent`` or +``constant`` strategy:: >>> import pandas as pd >>> df = pd.DataFrame([["a", "x"], diff --git a/sklearn/impute.py b/sklearn/impute.py index f4cb476b547f9..8b038d1706795 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -131,7 +131,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Notes ----- Columns which only contained missing values at `fit` are discarded upon - `transform` if strategy is not "constant" + `transform` if strategy is not "constant". """ def __init__(self, missing_values=np.nan, strategy="mean", @@ -192,14 +192,15 @@ def fit(self, X, y=None): if X.dtype.kind in ("i", "u", "f"): if not isinstance(fill_value, numbers.Real): raise TypeError( - "fill_value={0} is invalid. Expected a numerical value" - " when imputing numerical data".format(fill_value)) + "'fill_value'={0} is invalid. Expected a numerical" + " value when imputing numerical" + " data".format(fill_value)) elif X.dtype.kind == "O": if not isinstance(fill_value, six.string_types): raise TypeError( - "fill_value={0} is invalid. Expected an str instance " - "when imputing categorical data.".format(fill_value)) + "'fill_value'={0} is invalid. 
Expected an str instance" + " when imputing categorical data.".format(fill_value)) else: raise TypeError( diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 83d78793729dd..28bf7df71836e 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -8,6 +8,7 @@ import io from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import assert_allclose_dense_sparse from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_raises @@ -77,22 +78,22 @@ def test_imputation_shape(): assert X_imputed.shape == (10, 2) -def test_imputation_valid_types(): +@pytest.mark.parametrize("X_data, missing_value, fill_value, dtype, match", + [(1, 0, "x", None, "imputing numerical"), + (1., np.nan, "x", None, "imputing numerical"), + ("a", "", 0, object, "imputing categorical"), + (True, "nan", "x", "c", "cannot work")]) +def test_imputation_error_invalid_types(X_data, missing_value, + fill_value, dtype, match): # Verify that exceptions are raised on invalid inputs - tests = [(1, 0, "fill_value", None), - (1., np.nan, "fill_value", None), - ("a", "", 0, object), - (True, "nan", "fill_value", "c")] + X = np.full((3, 5), X_data, dtype=dtype) + X[0, 0] = missing_value - for X_data, missing_value, fill_value, dtype in tests: - X = np.full((3, 5), X_data, dtype=dtype) - X[0, 0] = missing_value - - with pytest.raises(TypeError): - imputer = SimpleImputer(missing_values=missing_value, - strategy="constant", - fill_value=fill_value) - imputer.fit(X).transform(X) + with pytest.raises(TypeError, match=match): + imputer = SimpleImputer(missing_values=missing_value, + strategy="constant", + fill_value=fill_value) + imputer.fit_transform(X) def safe_median(arr, *args, **kwargs): @@ -230,31 +231,32 @@ def test_imputation_most_frequent(): _check_statistics(X, X_true, "most_frequent", [np.nan, 2, 3, 3], -1) -def test_imputation_most_frequent_objects(): +@pytest.mark.parametrize("marker", [None, np.nan, "NAN", "", 0]) +def test_imputation_most_frequent_objects(marker): # Test imputation using the most-frequent strategy. 
- for marker in (None, np.nan, "NAN", "", 0): - X = np.array([ - [marker, marker, "a", "f"], - [marker, "c", marker, "d"], - [marker, "b", "d", marker], - [marker, "c", "d", "h"], - ], dtype=object) - - X_true = np.array([ - ["c", "a", "f"], - ["c", "d", "d"], - ["b", "d", "d"], - ["c", "d", "h"], - ], dtype=object) + X = np.array([ + [marker, marker, "a", "f"], + [marker, "c", marker, "d"], + [marker, "b", "d", marker], + [marker, "c", "d", "h"], + ], dtype=object) - imputer = SimpleImputer(missing_values=marker, - strategy="most_frequent") - X_trans = imputer.fit(X).transform(X) + X_true = np.array([ + ["c", "a", "f"], + ["c", "d", "d"], + ["b", "d", "d"], + ["c", "d", "h"], + ], dtype=object) + + imputer = SimpleImputer(missing_values=marker, + strategy="most_frequent") + X_trans = imputer.fit(X).transform(X) - assert_array_equal(X_trans, X_true) + assert_array_equal(X_trans, X_true) -def test_imputation_most_frequent_pandas(): +@pytest.mark.parametrize("dtype", [object, "category"]) +def test_imputation_most_frequent_pandas(dtype): # Test imputation using the most frequent strategy on pandas df pd = pytest.importorskip("pandas") @@ -264,21 +266,19 @@ def test_imputation_most_frequent_pandas(): "a,j,,\n" "b,j,x,") - for dtype in (object, "category"): - df = pd.read_csv(f, dtype=dtype) - f.seek(0) + df = pd.read_csv(f, dtype=dtype) - X_true = np.array([ - ["a", "i", "x"], - ["a", "j", "y"], - ["a", "j", "x"], - ["b", "j", "x"] - ], dtype=object) + X_true = np.array([ + ["a", "i", "x"], + ["a", "j", "y"], + ["a", "j", "x"], + ["b", "j", "x"] + ], dtype=object) - imputer = SimpleImputer(strategy="most_frequent") - X_trans = imputer.fit(df).transform(df) + imputer = SimpleImputer(strategy="most_frequent") + X_trans = imputer.fit_transform(df) - assert_array_equal(X_trans, X_true) + assert_array_equal(X_trans, X_true) def test_imputation_constant_integer(): @@ -299,66 +299,73 @@ def test_imputation_constant_integer(): imputer = SimpleImputer(missing_values=-1, strategy="constant", fill_value=0) - X_trans = imputer.fit(X).transform(X) + X_trans = imputer.fit_transform(X) assert_array_equal(X_trans, X_true) -def test_imputation_constant_float(): +@pytest.mark.parametrize("format", ["csr", "array"]) +def test_imputation_constant_float(format): # Test imputation using the constant strategy on floats - for format in ["csr", "array"]: - X = np.array([ - [np.nan, 1.1, 2.2, np.nan], - [3.3, np.nan, 4.4, np.nan], - [5.5, 6.6, np.nan, np.nan], - [7.7, 8.8, 9.9, np.nan] - ]) - - X = sparse.csr_matrix(X) if format == "csr" else X - - X_true = np.array([ - [0, 1.1, 2.2, 0], - [3.3, 0, 4.4, 0], - [5.5, 6.6, 0, 0], - [7.7, 8.8, 9.9, 0] - ]) - - X_true = sparse.csr_matrix(X_true) if format == "csr" else X_true - - imputer = SimpleImputer(strategy="constant", fill_value=0) - X_trans = imputer.fit(X).transform(X) - - if format == "csr": - assert_allclose(X_trans.toarray(), X_true.toarray()) - else: - assert_allclose(X_trans, X_true) + X = np.array([ + [np.nan, 1.1, 2.2, np.nan], + [3.3, np.nan, 4.4, np.nan], + [5.5, 6.6, np.nan, np.nan], + [7.7, 8.8, 9.9, np.nan] + ]) + X = sparse.csr_matrix(X) if format == "csr" else X -def test_imputation_constant_object(): + X_true = np.array([ + [0, 1.1, 2.2, 0], + [3.3, 0, 4.4, 0], + [5.5, 6.6, 0, 0], + [7.7, 8.8, 9.9, 0] + ]) + + if format == "csr": + X_true = sparse.csr_matrix(X_true) + X_true[np.array([[True, False, False, True], + [False, True, False, True], + [False, False, True, True], + [False, False, False, True]])] = 0 + + imputer = 
SimpleImputer(strategy="constant", fill_value=0) + X_trans = imputer.fit_transform(X) + + print(X_trans) + print() + print(X_true) + + assert_allclose_dense_sparse(X_trans, X_true) + + +@pytest.mark.parametrize("marker", [None, np.nan, "NAN", "", 0]) +def test_imputation_constant_object(marker): # Test imputation using the constant strategy on objects - for marker in (None, np.nan, "NAN", "", 0): - X = np.array([ - [marker, "a", "b", marker], - ["c", marker, "d", marker], - ["e", "f", marker, marker], - ["g", "h", "i", marker] - ], dtype=object) + X = np.array([ + [marker, "a", "b", marker], + ["c", marker, "d", marker], + ["e", "f", marker, marker], + ["g", "h", "i", marker] + ], dtype=object) - X_true = np.array([ - ["missing", "a", "b", "missing"], - ["c", "missing", "d", "missing"], - ["e", "f", "missing", "missing"], - ["g", "h", "i", "missing"] - ], dtype=object) + X_true = np.array([ + ["missing", "a", "b", "missing"], + ["c", "missing", "d", "missing"], + ["e", "f", "missing", "missing"], + ["g", "h", "i", "missing"] + ], dtype=object) - imputer = SimpleImputer(missing_values=marker, strategy="constant", - fill_value="missing") - X_trans = imputer.fit(X).transform(X) + imputer = SimpleImputer(missing_values=marker, strategy="constant", + fill_value="missing") + X_trans = imputer.fit_transform(X) - assert_array_equal(X_trans, X_true) + assert_array_equal(X_trans, X_true) -def test_imputation_constant_pandas(): +@pytest.mark.parametrize("dtype", [object, "category"]) +def test_imputation_constant_pandas(dtype): # Test imputation using the constant strategy on pandas df pd = pytest.importorskip("pandas") @@ -368,21 +375,19 @@ def test_imputation_constant_pandas(): "a,j,,\n" "b,j,x,") - for dtype in (object, "category"): - df = pd.read_csv(f, dtype=dtype) - f.seek(0) + df = pd.read_csv(f, dtype=dtype) - X_true = np.array([ - ["missing", "i", "x", "missing"], - ["a", "missing", "y", "missing"], - ["a", "j", "missing", "missing"], - ["b", "j", "x", "missing"] - ], dtype=object) + X_true = np.array([ + ["missing", "i", "x", "missing"], + ["a", "missing", "y", "missing"], + ["a", "j", "missing", "missing"], + ["b", "j", "x", "missing"] + ], dtype=object) - imputer = SimpleImputer(strategy="constant", fill_value="missing") - X_trans = imputer.fit(df).transform(df) + imputer = SimpleImputer(strategy="constant", fill_value="missing") + X_trans = imputer.fit_transform(df) - assert_array_equal(X_trans, X_true) + assert_array_equal(X_trans, X_true) def test_imputation_pipeline_grid_search(): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 1ba8da83c56f8..18e0f9d3c3ed8 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2036,16 +2036,11 @@ def param_filter(p): if isinstance(param_value, np.ndarray): assert_array_equal(param_value, init_param.default) else: - # Allows to set default parameters to np.nan - if (param_value is not np.nan or - init_param.default is not np.nan): - assert_equal(param_value, init_param.default, - init_param.name) - def _isscalarnan(x): return isinstance(x, numbers.Real) and np.isnan(x) if _isscalarnan(param_value): + # Allows to set default parameters to np.nan assert param_value is init_param.default, init_param.name else: assert param_value == init_param.default, init_param.name diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 6b6e6ac1f6249..74ecf0db0960f 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -270,8 +270,9 @@ def nanpercentile(a, q): # Fix for 
behavior inconsistency on numpy.equal for object dtypes. -# For numpy versions < 1.13, numpy.equal tests identity of objects instead of -# equality +# For numpy versions < 1.13, numpy.equal tests element-wise identity of objects +# instead of equality. This fix returns the mask of NaNs in an array of +# numerical or object values for all nupy versions. _nan_object_array = np.array([np.nan], dtype=object) _nan_object_mask = _nan_object_array != _nan_object_array From 6efd1221b54eae5f030913ab8294ea1e106c0b72 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 14 Jun 2018 12:08:36 +0200 Subject: [PATCH 19/31] small corrections --- sklearn/impute.py | 8 ++++---- sklearn/tests/test_impute.py | 4 ---- sklearn/utils/fixes.py | 4 ++-- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 8b038d1706795..972891cbbbcd0 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -22,7 +22,7 @@ from .utils.sparsefuncs import _get_median from .utils.validation import check_is_fitted from .utils.validation import FLOAT_DTYPES -from .utils.fixes import _compat_isnan +from .utils.fixes import _object_dtype_isnan from .externals import six @@ -46,7 +46,7 @@ def _get_mask(X, value_to_mask): return np.isnan(X) else: # np.isnan does not work on object dtypes. - return _compat_isnan(X) + return _object_dtype_isnan(X) else: # X == value_to_mask with object dytpes does not always perform @@ -90,7 +90,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Parameters ---------- missing_values : real number, string, np.nan or None, \ -optional (default=np.nan). +optional (default=np.nan) The placeholder for the missing values. All occurrences of `missing_values` will be imputed. @@ -105,7 +105,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): value along each column. - If "constant", then replace missing values with fill_value. - fill_value : string or numerical value, optional (default=None) + fill_value : string or numerical value, optional When strategy == "constant", fill_value is used to replace all occurrences of missing_values. 
If left to the default, fill_value will be 0 when imputing numerical diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 28bf7df71836e..0fb4dc7ffbc2e 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -333,10 +333,6 @@ def test_imputation_constant_float(format): imputer = SimpleImputer(strategy="constant", fill_value=0) X_trans = imputer.fit_transform(X) - print(X_trans) - print() - print(X_true) - assert_allclose_dense_sparse(X_trans, X_true) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 74ecf0db0960f..6595e5ac0fa43 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -278,9 +278,9 @@ def nanpercentile(a, q): _nan_object_mask = _nan_object_array != _nan_object_array if np.array_equal(_nan_object_mask, np.array([True])): - def _compat_isnan(X): + def _object_dtype_isnan(X): return X != X else: - def _compat_isnan(X): + def _object_dtype_isnan(X): return np.frompyfunc(lambda x: x != x, 1, 1)(X).astype(bool) From fbaaa381083b091d10fdf35503d74b51d3732a17 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 14 Jun 2018 14:27:14 +0200 Subject: [PATCH 20/31] small corrections --- doc/modules/impute.rst | 8 +++---- sklearn/impute.py | 10 ++++++-- sklearn/tests/test_impute.py | 46 ++++++++++++++++++------------------ 3 files changed, 35 insertions(+), 29 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 45b866a7123ab..493ff6fb7439e 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -57,17 +57,17 @@ Note that, here, missing values are encoded by 0 and are thus implicitly stored in the matrix. This format is thus suitable when there are many more missing values than observed values. -The :class:`SimpleImputer` class also supports categorical datas represented as +The :class:`SimpleImputer` class also supports categorical data represented as string values or pandas categoricals when using the ``most_frequent`` or ``constant`` strategy:: >>> import pandas as pd >>> df = pd.DataFrame([["a", "x"], - ... ["", "y"], - ... ["a", ""], + ... [np.nan, "y"], + ... ["a", np.nan], ... ["b", "y"]], dtype="category") ... - >>> imp = SimpleImputer(missing_values="", strategy="most_frequent") + >>> imp = SimpleImputer(strategy="most_frequent") >>> print(imp.fit_transform(df)) # doctest: +NORMALIZE_WHITESPACE [['a' 'x'] ['a' 'y'] diff --git a/sklearn/impute.py b/sklearn/impute.py index 972891cbbbcd0..c5b67069dd41e 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -60,7 +60,13 @@ def _most_frequent(array, extra_value, n_repeat): of the array.""" # Compute the most frequent value in array only if array.size > 0: - mode = stats.mode(array) + with warnings.catch_warnings(): + # stats.mode raises a warning when input array contains objects due + # to incapacity to detect NaNs. Irrelevant here since input array + # has already been NaN-masked. 
+ warnings.simplefilter("ignore", RuntimeWarning) + mode = stats.mode(array) + most_frequent_value = mode[0][0] most_frequent_count = mode[1][0] else: @@ -177,7 +183,7 @@ def fit(self, X, y=None): """ X = self._validate_input(X) - # default missing_values is 0 for numerical input and "missing_value" + # default fill_value is 0 for numerical input and "missing_value" # otherwise if self.fill_value is None: if X.dtype.kind in ("i", "f"): diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 0fb4dc7ffbc2e..459b7c47c774d 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -78,24 +78,6 @@ def test_imputation_shape(): assert X_imputed.shape == (10, 2) -@pytest.mark.parametrize("X_data, missing_value, fill_value, dtype, match", - [(1, 0, "x", None, "imputing numerical"), - (1., np.nan, "x", None, "imputing numerical"), - ("a", "", 0, object, "imputing categorical"), - (True, "nan", "x", "c", "cannot work")]) -def test_imputation_error_invalid_types(X_data, missing_value, - fill_value, dtype, match): - # Verify that exceptions are raised on invalid inputs - X = np.full((3, 5), X_data, dtype=dtype) - X[0, 0] = missing_value - - with pytest.raises(TypeError, match=match): - imputer = SimpleImputer(missing_values=missing_value, - strategy="constant", - fill_value=fill_value) - imputer.fit_transform(X) - - def safe_median(arr, *args, **kwargs): # np.median([]) raises a TypeError for numpy >= 1.10.1 length = arr.size if hasattr(arr, 'size') else len(arr) @@ -281,6 +263,24 @@ def test_imputation_most_frequent_pandas(dtype): assert_array_equal(X_trans, X_true) +@pytest.mark.parametrize("X_data, missing_value, fill_value, dtype, match", + [(1, 0, "x", None, "imputing numerical"), + (1., np.nan, "x", None, "imputing numerical"), + ("a", "", 0, object, "imputing categorical"), + (True, "nan", "x", "c", "cannot work")]) +def test_imputation_constant_error_invalid_types(X_data, missing_value, + fill_value, dtype, match): + # Verify that exceptions are raised on invalid inputs + X = np.full((3, 5), X_data, dtype=dtype) + X[0, 0] = missing_value + + with pytest.raises(TypeError, match=match): + imputer = SimpleImputer(missing_values=missing_value, + strategy="constant", + fill_value=fill_value) + imputer.fit_transform(X) + + def test_imputation_constant_integer(): # Test imputation using the constant strategy on integers X = np.array([ @@ -374,13 +374,13 @@ def test_imputation_constant_pandas(dtype): df = pd.read_csv(f, dtype=dtype) X_true = np.array([ - ["missing", "i", "x", "missing"], - ["a", "missing", "y", "missing"], - ["a", "j", "missing", "missing"], - ["b", "j", "x", "missing"] + ["missing_value", "i", "x", "missing_value"], + ["a", "missing_value", "y", "missing_value"], + ["a", "j", "missing_value", "missing_value"], + ["b", "j", "x", "missing_value"] ], dtype=object) - imputer = SimpleImputer(strategy="constant", fill_value="missing") + imputer = SimpleImputer(strategy="constant") X_trans = imputer.fit_transform(df) assert_array_equal(X_trans, X_true) From 724a4a135e6a4c09f91a39ba9d6ec02af7e54b2c Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 15 Jun 2018 09:27:58 +0200 Subject: [PATCH 21/31] fixed np.nan is not np.float('nan') issue --- doc/modules/impute.rst | 4 ++-- sklearn/impute.py | 11 +++++++---- sklearn/utils/__init__.py | 19 +++++++++++++++++++ sklearn/utils/estimator_checks.py | 7 ++----- 4 files changed, 30 insertions(+), 11 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 
493ff6fb7439e..6356bc4ecf81c 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -58,8 +58,8 @@ in the matrix. This format is thus suitable when there are many more missing values than observed values. The :class:`SimpleImputer` class also supports categorical data represented as -string values or pandas categoricals when using the ``most_frequent`` or -``constant`` strategy:: +string values or pandas categoricals when using the `most_frequent` or +`constant` strategy:: >>> import pandas as pd >>> df = pd.DataFrame([["a", "x"], diff --git a/sklearn/impute.py b/sklearn/impute.py index c5b67069dd41e..4aeff59b4b810 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -23,6 +23,7 @@ from .utils.validation import check_is_fitted from .utils.validation import FLOAT_DTYPES from .utils.fixes import _object_dtype_isnan +from .utils import is_scalar_nan from .externals import six @@ -160,7 +161,7 @@ def _validate_input(self, X): else: dtype = FLOAT_DTYPES - if self.missing_values is not np.nan: + if not is_scalar_nan(self.missing_values): force_all_finite = True else: force_all_finite = "allow-nan" @@ -186,7 +187,7 @@ def fit(self, X, y=None): # default fill_value is 0 for numerical input and "missing_value" # otherwise if self.fill_value is None: - if X.dtype.kind in ("i", "f"): + if X.dtype.kind in ("i", "u", "f"): fill_value = 0 else: fill_value = "missing_value" @@ -769,8 +770,10 @@ def _initial_imputation(self, X): Input data's missing indicator matrix, where "n_samples" is the number of samples and "n_features" is the number of features. """ - force_all_finite = "allow-nan" if self.missing_values is np.nan \ - else True + if is_scalar_nan(self.missing_values): + force_all_finite = "allow-nan" + else: + force_all_finite = True X = check_array(X, dtype=FLOAT_DTYPES, order="F", force_all_finite=force_all_finite) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index e3d1e7faaabd1..385e170ea9bb6 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -2,6 +2,7 @@ The :mod:`sklearn.utils` module includes various utilities. """ from collections import Sequence +import numbers import numpy as np from scipy.sparse import issparse @@ -553,3 +554,21 @@ def get_chunk_n_rows(row_bytes, max_n_rows=None, (working_memory, np.ceil(row_bytes * 2 ** -20))) chunk_n_rows = 1 return chunk_n_rows + + +def is_scalar_nan(x): + """Tests if x is NaN + + This function is meant to overcome the issue that np.isnan does not allow + non-numerical types as input, and that np.nan is not np.float('nan'). 
+ + Parameters + ---------- + x : any type + + Returns + ------- + boolean + """ + + return isinstance(x, numbers.Real) and np.isnan(x) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 18e0f9d3c3ed8..cc41e7a2fad73 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -8,7 +8,6 @@ from copy import deepcopy import struct from functools import partial -import numbers import numpy as np from scipy import sparse @@ -37,6 +36,7 @@ from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_dict_equal from sklearn.utils.testing import create_memmap_backed_data +from sklearn.utils import is_scalar_nan from sklearn.discriminant_analysis import LinearDiscriminantAnalysis @@ -2036,10 +2036,7 @@ def param_filter(p): if isinstance(param_value, np.ndarray): assert_array_equal(param_value, init_param.default) else: - def _isscalarnan(x): - return isinstance(x, numbers.Real) and np.isnan(x) - - if _isscalarnan(param_value): + if is_scalar_nan(param_value): # Allows to set default parameters to np.nan assert param_value is init_param.default, init_param.name else: From e5f4a1bc12b2f66401c583e5cef76917672599c1 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 15 Jun 2018 11:28:28 +0200 Subject: [PATCH 22/31] add tests for is_scalar_nan --- doc/modules/impute.rst | 4 ++-- sklearn/utils/__init__.py | 13 +++++++++++++ sklearn/utils/tests/test_utils.py | 16 ++++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 6356bc4ecf81c..fd01321599dbe 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -58,8 +58,8 @@ in the matrix. This format is thus suitable when there are many more missing values than observed values. 
The :class:`SimpleImputer` class also supports categorical data represented as -string values or pandas categoricals when using the `most_frequent` or -`constant` strategy:: +string values or pandas categoricals when using the ``'most_frequent'`` or +``'constant'`` strategy:: >>> import pandas as pd >>> df = pd.DataFrame([["a", "x"], diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 385e170ea9bb6..7821d92f97d78 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -569,6 +569,19 @@ def is_scalar_nan(x): Returns ------- boolean + + Examples + -------- + >>> is_scalar_nan(np.nan) + True + >>> is_scalar_nan(float("nan")) + True + >>> is_scalar_nan(None) + False + >>> is_scalar_nan("") + False + >>> is_scalar_nan([np.nan]) + False """ return isinstance(x, numbers.Real) and np.isnan(x) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 1f1efed825c80..9d63bee96bf15 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -21,6 +21,7 @@ from sklearn.utils import shuffle from sklearn.utils import gen_even_slices from sklearn.utils import get_chunk_n_rows +from sklearn.utils import is_scalar_nan from sklearn.utils.extmath import pinvh from sklearn.utils.arpack import eigsh from sklearn.utils.mocking import MockDataFrame @@ -314,3 +315,18 @@ def check_warning(*args, **kw): max_n_rows=max_n_rows) assert actual == expected assert type(actual) is type(expected) + + +@pytest.mark.parametrize("input, result", [(float("nan"), True), + (np.nan, True), + (np.float("nan"), True), + (np.float32("nan"), True), + (np.float64("nan"), True), + (0, False), + (0., False), + (None, False), + ("", False), + ("nan", False), + ([np.nan], False)]) +def test_is_scalar_nan(input, result): + assert is_scalar_nan(input) is result From d69f85588c3c64b9b26f1f538ee1066d3349f4b4 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 15 Jun 2018 11:56:10 +0200 Subject: [PATCH 23/31] fixed --- sklearn/impute.py | 4 ++-- sklearn/utils/__init__.py | 4 +++- sklearn/utils/tests/test_utils.py | 6 +++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 4aeff59b4b810..84728cc44f7cb 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -462,8 +462,8 @@ class MICEImputer(BaseEstimator, TransformerMixin): initial_strategy : str, optional (default="mean") Which strategy to use to initialize the missing values. Same as the - ``strategy`` parameter in :class:`sklearn.preprocessing.Imputer` - Valid values: {"mean", "median", or "most_frequent"}. + ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` + Valid values: {"mean", "median", "most_frequent", or "constant"}. min_value : float, optional (default=None) Minimum possible imputed value. Default of ``None`` will set minimum diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 7821d92f97d78..312efaa8a533a 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -584,4 +584,6 @@ def is_scalar_nan(x): False """ - return isinstance(x, numbers.Real) and np.isnan(x) + # convert from numpy.bool_ to python bool to ensure that testing + # is_scalar_nan(x) is True does not fail. 
+ return bool(isinstance(x, numbers.Real) and np.isnan(x)) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 9d63bee96bf15..c2474c58c13f7 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -317,7 +317,7 @@ def check_warning(*args, **kw): assert type(actual) is type(expected) -@pytest.mark.parametrize("input, result", [(float("nan"), True), +@pytest.mark.parametrize("value, result", [(float("nan"), True), (np.nan, True), (np.float("nan"), True), (np.float32("nan"), True), @@ -328,5 +328,5 @@ def check_warning(*args, **kw): ("", False), ("nan", False), ([np.nan], False)]) -def test_is_scalar_nan(input, result): - assert is_scalar_nan(input) is result +def test_is_scalar_nan(value, result): + assert is_scalar_nan(value) is result From c3a730d214d85b38f2536dd8b1c72b4bfd5fc318 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 15 Jun 2018 16:22:42 +0200 Subject: [PATCH 24/31] fixed v2 --- sklearn/tests/test_impute.py | 33 ++++++++++++++------------------- sklearn/utils/__init__.py | 4 +++- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 459b7c47c774d..d728a5a777cca 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -304,33 +304,28 @@ def test_imputation_constant_integer(): assert_array_equal(X_trans, X_true) -@pytest.mark.parametrize("format", ["csr", "array"]) -def test_imputation_constant_float(format): +@pytest.mark.parametrize("array_constructor", [sparse.csr_matrix, np.asarray]) +def test_imputation_constant_float(array_constructor): # Test imputation using the constant strategy on floats X = np.array([ - [np.nan, 1.1, 2.2, np.nan], - [3.3, np.nan, 4.4, np.nan], - [5.5, 6.6, np.nan, np.nan], - [7.7, 8.8, 9.9, np.nan] + [np.nan, 1.1, 0, np.nan], + [1.2, np.nan, 1.3, np.nan], + [0, 0, np.nan, np.nan], + [1.4, 1.5, 0, np.nan] ]) - X = sparse.csr_matrix(X) if format == "csr" else X - X_true = np.array([ - [0, 1.1, 2.2, 0], - [3.3, 0, 4.4, 0], - [5.5, 6.6, 0, 0], - [7.7, 8.8, 9.9, 0] + [-1, 1.1, 0, -1], + [1.2, -1, 1.3, -1], + [0, 0, -1, -1], + [1.4, 1.5, 0, -1] ]) - if format == "csr": - X_true = sparse.csr_matrix(X_true) - X_true[np.array([[True, False, False, True], - [False, True, False, True], - [False, False, True, True], - [False, False, False, True]])] = 0 + X = array_constructor(X) + + X_true = array_constructor(X_true) - imputer = SimpleImputer(strategy="constant", fill_value=0) + imputer = SimpleImputer(strategy="constant", fill_value=-1) X_trans = imputer.fit_transform(X) assert_allclose_dense_sparse(X_trans, X_true) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 312efaa8a533a..bb1f383505fe9 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -586,4 +586,6 @@ def is_scalar_nan(x): # convert from numpy.bool_ to python bool to ensure that testing # is_scalar_nan(x) is True does not fail. - return bool(isinstance(x, numbers.Real) and np.isnan(x)) + # Redondant np.floating is needed because numbers can't match np.float32 + # in python 2. 
+ return bool(isinstance(x, (numbers.Real, np.floating)) and np.isnan(x)) From 94d7964cf494f2ceb01521fa8084a1fd1d310a91 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 15 Jun 2018 20:59:01 +0200 Subject: [PATCH 25/31] adressed @jnothman remark --- sklearn/impute.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 84728cc44f7cb..3adc0a58ef70b 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -43,8 +43,11 @@ def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" if value_to_mask is np.nan: - if X.dtype.kind in ("i", "u", "f"): + if X.dtype.kind == "f": return np.isnan(X) + elif X.dtype.kind in ("i", "u"): + # can't have NaNs in integer array. + return np.zeros(X.shape, dtype=bool) else: # np.isnan does not work on object dtypes. return _object_dtype_isnan(X) From 20456f41ebbabad7197d5b6a16c38f5a2e1380b5 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Sun, 17 Jun 2018 17:35:12 +0200 Subject: [PATCH 26/31] add tests for warnings and errors catch --- sklearn/tests/test_impute.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index d728a5a777cca..b052bf1904cb3 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -78,6 +78,26 @@ def test_imputation_shape(): assert X_imputed.shape == (10, 2) +@pytest.mark.parametrize("strategy", ["const", 101, None]) +def test_imputation_error_invalid_strategy(strategy): + X = np.ones((3, 5)) + X[0, 0] = np.nan + + with pytest.raises(ValueError, match=str(strategy)): + imputer = SimpleImputer(strategy=strategy) + imputer.fit_transform(X) + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) +def test_imputation_deletion_warning(strategy): + X = np.ones((3, 5)) + X[:, 0] = np.nan + + with pytest.warns(UserWarning, match="Deleting"): + imputer = SimpleImputer(strategy=strategy, verbose=True) + imputer.fit_transform(X) + + def safe_median(arr, *args, **kwargs): # np.median([]) raises a TypeError for numpy >= 1.10.1 length = arr.size if hasattr(arr, 'size') else len(arr) From 7d3d1b54daced3d9c1e1eef5e31306e75346d128 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 18 Jun 2018 14:30:25 +0200 Subject: [PATCH 27/31] dtype checks modifications + more tests --- doc/whats_new/v0.20.rst | 11 ++++++++ sklearn/impute.py | 50 +++++++++++++++----------------- sklearn/tests/test_impute.py | 55 ++++++++++++++++++++++++++++-------- sklearn/utils/validation.py | 4 +++ 4 files changed, 82 insertions(+), 38 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 87569d8649d86..2df84310eb6bf 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -613,6 +613,17 @@ Imputer SimpleImputer().fit_transform(X.T).T)``). :issue:`10829` by :user:`Guillaume Lemaitre ` and :user:`Gilberto Olimpio `. +- The :class:`impute.SimpleImputer` has a new strategy, ``'constant'``, to + complete missing values with a fixed one, given by the ``fill_value`` + parameter. This strategy supports numeric and non-numeric data, and so does + the ``'most_frequent'`` strategy now. :issue:`11211` by :user:`Jeremie du + Boisberranger `. + +- The NaN marker for the missing values has been changed between the + :class:`preprocessing.Imputer` and the :class:`impute.SimpleImputer`. + ``missing_values='NaN'`` should now be ``missing_values=np.nan``. 
+ :issue:`11211` by :user:`Jeremie du Boisberranger `. + Outlier Detection models - More consistent outlier detection API: diff --git a/sklearn/impute.py b/sklearn/impute.py index 3adc0a58ef70b..cd70d0f58b894 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -99,8 +99,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : real number, string, np.nan or None, \ -optional (default=np.nan) + missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of `missing_values` will be imputed. @@ -108,12 +107,13 @@ class SimpleImputer(BaseEstimator, TransformerMixin): The imputation strategy. - If "mean", then replace missing values using the mean along - each column. + each column. Can only be used with numeric data. - If "median", then replace missing values using the median along - each column. + each column. Can only be used with numeric data. - If "most_frequent", then replace missing using the most frequent - value along each column. - - If "constant", then replace missing values with fill_value. + value along each column. Can be used with strings or numeric data. + - If "constant", then replace missing values with fill_value. Can be + used with strings or numeric data. fill_value : string or numerical value, optional When strategy == "constant", fill_value is used to replace all @@ -169,8 +169,17 @@ def _validate_input(self, X): else: force_all_finite = "allow-nan" - return check_array(X, accept_sparse='csc', dtype=dtype, - force_all_finite=force_all_finite, copy=self.copy) + try: + X = check_array(X, accept_sparse='csc', dtype=dtype, + force_all_finite=force_all_finite, copy=self.copy) + except TypeError: + raise TypeError("Cannot use {0} strategy with non-numeric " + "data.".format(self.strategy)) + + if X.dtype.kind not in ("i", "u", "f", "O"): + X = X.astype(object) + + return X def fit(self, X, y=None): """Fit the imputer on X. @@ -198,25 +207,12 @@ def fit(self, X, y=None): fill_value = self.fill_value # fill_value should be numerical in case of numerical input - if self.strategy == "constant": - if X.dtype.kind in ("i", "u", "f"): - if not isinstance(fill_value, numbers.Real): - raise TypeError( - "'fill_value'={0} is invalid. Expected a numerical" - " value when imputing numerical" - " data".format(fill_value)) - - elif X.dtype.kind == "O": - if not isinstance(fill_value, six.string_types): - raise TypeError( - "'fill_value'={0} is invalid. Expected an str instance" - " when imputing categorical data.".format(fill_value)) - - else: - raise TypeError( - "SimpleImputer cannot work on data with dtype={0}: " - "expecting numerical or categorical data with " - "dtype=object.".format(X.dtype)) + if (self.strategy == "constant" and + X.dtype.kind in ("i", "u", "f") and + not isinstance(fill_value, numbers.Real)): + raise TypeError("'fill_value'={0} is invalid. 
Expected a numerical" + " value when imputing numerical" + " data".format(fill_value)) if sparse.issparse(X): self.statistics_ = self._sparse_fit(X, diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index b052bf1904cb3..cab1996c0c14a 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -210,6 +210,44 @@ def test_imputation_median_special_cases(): statistics_median, np.nan) +@pytest.mark.parametrize("strategy", ["mean", "median"]) +@pytest.mark.parametrize("dtype", [None, object, str]) +def test_imputation_mean_median_error_invalid_type(strategy, dtype): + X = np.array([["a", "b", 3], + [4, "e", 6], + ["g", "h", 9]], dtype=dtype) + + with pytest.raises(TypeError, match="non-numeric data"): + imputer = SimpleImputer(strategy=strategy) + imputer.fit_transform(X) + + +@pytest.mark.parametrize("strategy", ["constant", "most_frequent"]) +@pytest.mark.parametrize("dtype", [None, object, str]) +def test_imputation_non_numeric(strategy, dtype): + # Test imputation on non-numeric data using "most_frequent" and "constant" + # strategy + X = np.array([ + ["", "a", "f"], + ["c", "d", "d"], + ["b", "d", "d"], + ["c", "d", "h"], + ], dtype=dtype) + + X_true = np.array([ + ["c", "a", "f"], + ["c", "d", "d"], + ["b", "d", "d"], + ["c", "d", "h"], + ], dtype=dtype) + + imputer = SimpleImputer(missing_values="", strategy=strategy, + fill_value="c") + X_trans = imputer.fit(X).transform(X) + + assert_array_equal(X_trans, X_true) + + def test_imputation_most_frequent(): # Test imputation using the most-frequent strategy. X = np.array([ @@ -283,21 +321,16 @@ def test_imputation_most_frequent_pandas(dtype): assert_array_equal(X_trans, X_true) -@pytest.mark.parametrize("X_data, missing_value, fill_value, dtype, match", - [(1, 0, "x", None, "imputing numerical"), - (1., np.nan, "x", None, "imputing numerical"), - ("a", "", 0, object, "imputing categorical"), - (True, "nan", "x", "c", "cannot work")]) -def test_imputation_constant_error_invalid_types(X_data, missing_value, - fill_value, dtype, match): - # Verify that exceptions are raised on invalid inputs - X = np.full((3, 5), X_data, dtype=dtype) +@pytest.mark.parametrize("X_data, missing_value", [(1, 0), (1., np.nan)]) +def test_imputation_constant_error_invalid_type(X_data, missing_value): + # Verify that exceptions are raised on invalid fill_value type + X = np.full((3, 5), X_data) X[0, 0] = missing_value - with pytest.raises(TypeError, match=match): + with pytest.raises(TypeError, match="imputing numerical"): imputer = SimpleImputer(missing_values=missing_value, strategy="constant", - fill_value=fill_value) + fill_value="x") imputer.fit_transform(X) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 5fd54dc49b078..2f62299a206b0 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -492,6 +492,10 @@ def check_array(array, accept_sparse=False, dtype="numeric", order=None, try: warnings.simplefilter('error', ComplexWarning) array = np.asarray(array, dtype=dtype, order=order) + except ValueError as ve: + if "convert" in ve: + raise TypeError("Invalid dtype conversion from {0} to " + "{1}".format(dtype_orig, dtype)) except ComplexWarning: raise ValueError("Complex data not supported\n" "{}\n".format(array)) From 972668b750a4378cbc771fdc7c18d95f21397481 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 18 Jun 2018 16:53:24 +0200 Subject: [PATCH 28/31] fixed exception catching + go back to not allow any but object dtype --- sklearn/impute.py | 15 
+++++++++++---- sklearn/tests/test_impute.py | 28 ++++++++++------------------ sklearn/utils/validation.py | 4 ---- 3 files changed, 21 insertions(+), 26 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index cd70d0f58b894..e6a4614da5f62 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -172,12 +172,19 @@ def _validate_input(self, X): try: X = check_array(X, accept_sparse='csc', dtype=dtype, force_all_finite=force_all_finite, copy=self.copy) - except TypeError: - raise TypeError("Cannot use {0} strategy with non-numeric " - "data.".format(self.strategy)) + except ValueError as ve: + if "could not convert" in str(ve): + raise TypeError("Cannot use {0} strategy with non-numeric " + "data. Received datatype :{1}." + "".format(self.strategy, X.dtype.kind)) + else: + raise ve if X.dtype.kind not in ("i", "u", "f", "O"): - X = X.astype(object) + raise TypeError("The SimpleImputer does not support this datatype" + " ({0}). Please provide either numeric data or" + " categorical data represented by integer or " + "object datatypes.".format(X.dtype)) return X diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index cab1996c0c14a..7dbbe6bd378f1 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -223,29 +223,21 @@ def test_imputation_mean_median_error_invalid_type(strategy, dtype): @pytest.mark.parametrize("strategy", ["constant", "most_frequent"]) -@pytest.mark.parametrize("dtype", [None, object, str]) -def test_imputation_non_numeric(strategy, dtype): +@pytest.mark.parametrize("dtype", [str, np.dtype('U'), np.dtype('S')]) +def test_imputation_const_mostf_error_invalid_types(strategy, dtype): # Test imputation on non-numeric data using "most_frequent" and "constant" # strategy X = np.array([ - ["", "a", "f"], - ["c", "d", "d"], - ["b", "d", "d"], - ["c", "d", "h"], + [np.nan, np.nan, "a", "f"], + [np.nan, "c", np.nan, "d"], + [np.nan, "b", "d", np.nan], + [np.nan, "c", "d", "h"], ], dtype=dtype) - X_true = np.array([ - ["c", "a", "f"], - ["c", "d", "d"], - ["b", "d", "d"], - ["c", "d", "h"], - ], dtype=dtype) - - imputer = SimpleImputer(missing_values="", strategy=strategy, - fill_value="c") - X_trans = imputer.fit(X).transform(X) - - assert_array_equal(X_trans, X_true) + err_msg = "SimpleImputer does not support this datatype" + with pytest.raises(TypeError, match=err_msg): + imputer = SimpleImputer(strategy=strategy) + imputer.fit(X).transform(X) def test_imputation_most_frequent(): diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 2f62299a206b0..5fd54dc49b078 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -492,10 +492,6 @@ def check_array(array, accept_sparse=False, dtype="numeric", order=None, try: warnings.simplefilter('error', ComplexWarning) array = np.asarray(array, dtype=dtype, order=order) - except ValueError as ve: - if "convert" in ve: - raise TypeError("Invalid dtype conversion from {0} to " - "{1}".format(dtype_orig, dtype)) except ComplexWarning: raise ValueError("Complex data not supported\n" "{}\n".format(array)) From f1da7b8a218c99e78800fafa68b92f754071f5ae Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 20 Jun 2018 10:31:04 +0200 Subject: [PATCH 29/31] error message update --- sklearn/impute.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index e6a4614da5f62..a17e653117ad1 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -174,17 +174,19 @@ def 
_validate_input(self, X): force_all_finite=force_all_finite, copy=self.copy) except ValueError as ve: if "could not convert" in str(ve): - raise TypeError("Cannot use {0} strategy with non-numeric " - "data. Received datatype :{1}." - "".format(self.strategy, X.dtype.kind)) + raise ValueError("Cannot use {0} strategy with non-numeric " + "data. Received datatype :{1}." + "".format(self.strategy, X.dtype.kind)) else: raise ve if X.dtype.kind not in ("i", "u", "f", "O"): - raise TypeError("The SimpleImputer does not support this datatype" - " ({0}). Please provide either numeric data or" - " categorical data represented by integer or " - "object datatypes.".format(X.dtype)) + raise ValueError("SimpleImputer does not work on data with dtype " + "{0}. Please provide either a numeric array (with" + " a floating point or integer dtype) or " + "categorical data represented either as an array " + "with integer dtype or an array of string values " + "with an object dtype.".format(X.dtype)) return X From fb1a4e9f4adde5ea5221ef3519034107ca387548 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 20 Jun 2018 10:40:15 +0200 Subject: [PATCH 30/31] with tests update is better --- sklearn/impute.py | 2 +- sklearn/tests/test_impute.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index a17e653117ad1..eace23aebf1d8 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -181,7 +181,7 @@ def _validate_input(self, X): raise ve if X.dtype.kind not in ("i", "u", "f", "O"): - raise ValueError("SimpleImputer does not work on data with dtype " + raise ValueError("SimpleImputer does not support data with dtype " "{0}. Please provide either a numeric array (with" " a floating point or integer dtype) or " "categorical data represented either as an array " diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 7dbbe6bd378f1..211d7122a40ee 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -217,7 +217,7 @@ def test_imputation_mean_median_error_invalid_type(strategy, dtype): [4, "e", 6], ["g", "h", 9]], dtype=dtype) - with pytest.raises(TypeError, match="non-numeric data"): + with pytest.raises(ValueError, match="non-numeric data"): imputer = SimpleImputer(strategy=strategy) imputer.fit_transform(X) @@ -234,8 +234,8 @@ def test_imputation_const_mostf_error_invalid_types(strategy, dtype): [np.nan, "c", "d", "h"], ], dtype=dtype) - err_msg = "SimpleImputer does not support this datatype" - with pytest.raises(TypeError, match=err_msg): + err_msg = "SimpleImputer does not support data" + with pytest.raises(ValueError, match=err_msg): imputer = SimpleImputer(strategy=strategy) imputer.fit(X).transform(X) From c8246f2549ddbd2680fb3996dfd39689a2665bd0 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 20 Jun 2018 15:23:36 +0200 Subject: [PATCH 31/31] TypeError -> ValueError --- sklearn/impute.py | 6 +++--- sklearn/tests/test_impute.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index eace23aebf1d8..15e719b7f1d13 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -219,9 +219,9 @@ def fit(self, X, y=None): if (self.strategy == "constant" and X.dtype.kind in ("i", "u", "f") and not isinstance(fill_value, numbers.Real)): - raise TypeError("'fill_value'={0} is invalid. Expected a numerical" - " value when imputing numerical" - " data".format(fill_value)) + raise ValueError("'fill_value'={0} is invalid. 
Expected a " + "numerical value when imputing numerical " + "data".format(fill_value)) if sparse.issparse(X): self.statistics_ = self._sparse_fit(X, diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 211d7122a40ee..170d94333bf44 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -319,7 +319,7 @@ def test_imputation_constant_error_invalid_type(X_data, missing_value): X = np.full((3, 5), X_data) X[0, 0] = missing_value - with pytest.raises(TypeError, match="imputing numerical"): + with pytest.raises(ValueError, match="imputing numerical"): imputer = SimpleImputer(missing_values=missing_value, strategy="constant", fill_value="x")