FIX: Ensure expected behaviour when copy=True|False in Imputer

glouppe · glouppe · commit 08debe2cca9e · 2014-01-01T17:52:05.000+01:00
diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
@@ -10,9 +10,11 @@
 from scipy import stats
 
 from ..base import BaseEstimator, TransformerMixin
+from ..utils import check_arrays
 from ..utils import array2d
 from ..utils import atleast2d_or_csr
 from ..utils import atleast2d_or_csc
+from ..utils import as_float_array
 
 from ..externals import six
 
@@ -127,7 +129,8 @@ class Imputer(BaseEstimator, TransformerMixin):
 
     copy : boolean, optional (default=True)
         If True, a copy of X will be created. If False, imputation will
-        be done in-place.
+        be done in-place. Note that if X is sparse and missing_values=0, then
+        a new (dense) copy is made, even if copy=False.
 
     Attributes
     ----------
@@ -337,14 +340,13 @@ def transform(self, X):
         X : {array-like, sparse matrix}, shape = [n_samples, n_features]
             The input data to complete.
         """
-        if self.copy and not isinstance(X, list):
-            X = X.copy()
+        X = as_float_array(X, copy=self.copy) # Copy just once
 
         # Since two different arrays can be provided in fit(X) and
         # transform(X), the imputation data need to be recomputed
         # when the imputation is done per sample
         if self.axis == 1:
-            X = atleast2d_or_csr(X, force_all_finite=False).astype(np.float)
+            X = atleast2d_or_csr(X, force_all_finite=False, copy=False)
 
             if sparse.issparse(X):
                 statistics = self._sparse_fit(X,
@@ -358,7 +360,7 @@ def transform(self, X):
                                              self.missing_values,
                                              self.axis)
         else:
-            X = atleast2d_or_csc(X, force_all_finite=False).astype(np.float)
+            X = atleast2d_or_csc(X, force_all_finite=False, copy=False)
             statistics = self.statistics_
 
         # Delete the invalid rows/columns
diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py
@@ -2,9 +2,11 @@
 from scipy import sparse
 
 from sklearn.utils.testing import assert_equal
+from sklearn.utils.testing import assert_not_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_false
+from sklearn.utils.testing import assert_true
 
 from sklearn.preprocessing.imputation import Imputer
 from sklearn.pipeline import Pipeline
@@ -250,26 +252,39 @@ def test_imputation_pickle():
 
 
 def test_imputation_copy():
-    """Test imputation with copy=True."""
-    l = 5
-
-    # Test default behaviour and with copy=True
-    for params in [{}, {'copy': True}]:
-        X = sparse_random_matrix(l, l, density=0.75, random_state=0)
-
-        # Dense
-        imputer = Imputer(missing_values=0, strategy="mean", **params)
-        Xt = imputer.fit(X).transform(X)
-        Xt[0, 0] = np.nan
-        # Check that the objects are different and that they don't use
-        # the same buffer
-        assert_false(np.all(X.todense() == Xt))
-
-        # Sparse
-        imputer = Imputer(missing_values=0, strategy="mean", **params)
-        X = X.todense()
-        Xt = imputer.fit(X).transform(X)
-        Xt[0, 0] = np.nan
-        # Check that the objects are different and that they don't use
-        # the same buffer
-        assert_false(np.all(X == Xt))
+    """Test imputation with copy=True and copy=False."""
+    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)
+
+    # copy=True, dense
+    X = X_orig.copy().todense()
+    imputer = Imputer(missing_values=0, strategy="mean", copy=True)
+    Xt = imputer.fit(X).transform(X)
+    Xt[0, 0] = -1
+    assert_false(np.all(X == Xt))
+    assert_not_equal(X.ctypes.data, Xt.ctypes.data)
+
+    # copy=True, sparse (NOTE: X is densified below, so this re-tests dense)
+    X = X_orig.copy()
+    imputer = Imputer(missing_values=0, strategy="mean", copy=True)
+    X = X.todense()
+    Xt = imputer.fit(X).transform(X)
+    Xt[0, 0] = -1
+    assert_false(np.all(X == Xt))
+    assert_not_equal(X.ctypes.data, Xt.ctypes.data)
+
+    # copy=False, dense
+    X = X_orig.copy().todense()
+    imputer = Imputer(missing_values=0, strategy="mean", copy=False)
+    Xt = imputer.fit(X).transform(X)
+    Xt[0, 0] = -1
+    assert_true(np.all(X == Xt))
+    assert_equal(X.ctypes.data, Xt.ctypes.data)
+
+    # copy=False, sparse
+    X = X_orig.copy()
+    imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=1)
+    Xt = imputer.fit(X).transform(X)
+    assert_true(np.all(X == Xt)) # Fail...
+
+    # Note: if X is sparse and missing_values=0, then a (dense) copy of X is
+    # made, even if copy=False.