Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 08debe2

Browse files
committed
FIX: Ensure expected behaviour when copy=True|False in Imputer
1 parent 224a4d5 commit 08debe2

File tree

2 files changed

+45
-28
lines changed

2 files changed

+45
-28
lines changed

sklearn/preprocessing/imputation.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@
1010
from scipy import stats
1111

1212
from ..base import BaseEstimator, TransformerMixin
13+
from ..utils import check_arrays
1314
from ..utils import array2d
1415
from ..utils import atleast2d_or_csr
1516
from ..utils import atleast2d_or_csc
17+
from ..utils import as_float_array
1618

1719
from ..externals import six
1820

@@ -127,7 +129,8 @@ class Imputer(BaseEstimator, TransformerMixin):
127129
128130
copy : boolean, optional (default=True)
129131
If True, a copy of X will be created. If False, imputation will
130-
be done in-place.
132+
be done in-place. Note that if X is sparse and missing_values=0, then
133+
a new copy is made, even if copy=False.
131134
132135
Attributes
133136
----------
@@ -337,14 +340,13 @@ def transform(self, X):
337340
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
338341
The input data to complete.
339342
"""
340-
if self.copy and not isinstance(X, list):
341-
X = X.copy()
343+
X = as_float_array(X, copy=self.copy) # Copy just once
342344

343345
# Since two different arrays can be provided in fit(X) and
344346
# transform(X), the imputation data need to be recomputed
345347
# when the imputation is done per sample
346348
if self.axis == 1:
347-
X = atleast2d_or_csr(X, force_all_finite=False).astype(np.float)
349+
X = atleast2d_or_csr(X, force_all_finite=False, copy=False)
348350

349351
if sparse.issparse(X):
350352
statistics = self._sparse_fit(X,
@@ -358,7 +360,7 @@ def transform(self, X):
358360
self.missing_values,
359361
self.axis)
360362
else:
361-
X = atleast2d_or_csc(X, force_all_finite=False).astype(np.float)
363+
X = atleast2d_or_csc(X, force_all_finite=False, copy=False)
362364
statistics = self.statistics_
363365

364366
# Delete the invalid rows/columns

sklearn/preprocessing/tests/test_imputation.py

Lines changed: 38 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
from scipy import sparse
33

44
from sklearn.utils.testing import assert_equal
5+
from sklearn.utils.testing import assert_not_equal
56
from sklearn.utils.testing import assert_array_equal
67
from sklearn.utils.testing import assert_raises
78
from sklearn.utils.testing import assert_false
9+
from sklearn.utils.testing import assert_true
810

911
from sklearn.preprocessing.imputation import Imputer
1012
from sklearn.pipeline import Pipeline
@@ -250,26 +252,39 @@ def test_imputation_pickle():
250252

251253

252254
def test_imputation_copy():
253-
"""Test imputation with copy=True."""
254-
l = 5
255-
256-
# Test default behaviour and with copy=True
257-
for params in [{}, {'copy': True}]:
258-
X = sparse_random_matrix(l, l, density=0.75, random_state=0)
259-
260-
# Dense
261-
imputer = Imputer(missing_values=0, strategy="mean", **params)
262-
Xt = imputer.fit(X).transform(X)
263-
Xt[0, 0] = np.nan
264-
# Check that the objects are different and that they don't use
265-
# the same buffer
266-
assert_false(np.all(X.todense() == Xt))
267-
268-
# Sparse
269-
imputer = Imputer(missing_values=0, strategy="mean", **params)
270-
X = X.todense()
271-
Xt = imputer.fit(X).transform(X)
272-
Xt[0, 0] = np.nan
273-
# Check that the objects are different and that they don't use
274-
# the same buffer
275-
assert_false(np.all(X == Xt))
255+
"""Test imputation with copy"""
256+
X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)
257+
258+
# copy=True, dense
259+
X = X_orig.copy().todense()
260+
imputer = Imputer(missing_values=0, strategy="mean", copy=True)
261+
Xt = imputer.fit(X).transform(X)
262+
Xt[0, 0] = -1
263+
assert_false(np.all(X == Xt))
264+
assert_not_equal(X.ctypes.data, Xt.ctypes.data)
265+
266+
# copy=True, sparse
267+
X = X_orig.copy()
268+
imputer = Imputer(missing_values=0, strategy="mean", copy=True)
269+
X = X.todense()
270+
Xt = imputer.fit(X).transform(X)
271+
Xt[0, 0] = -1
272+
assert_false(np.all(X == Xt))
273+
assert_not_equal(X.ctypes.data, Xt.ctypes.data)
274+
275+
# copy=False, dense
276+
X = X_orig.copy().todense()
277+
imputer = Imputer(missing_values=0, strategy="mean", copy=False)
278+
Xt = imputer.fit(X).transform(X)
279+
Xt[0, 0] = -1
280+
assert_true(np.all(X == Xt))
281+
assert_equal(X.ctypes.data, Xt.ctypes.data)
282+
283+
# copy=False, sparse
284+
X = X_orig.copy()
285+
imputer = Imputer(missing_values=X.data[0], strategy="mean", copy=False, axis=1)
286+
Xt = imputer.fit(X).transform(X)
287+
assert_true(np.all(X == Xt)) # Fail...
288+
289+
# Note: If X is sparse and if missing_values=0, then a (dense) copy of X is
290+
# made, even if copy=False.

0 commit comments

Comments
 (0)