Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

[MRG+1] Deprecate Imputer.axis argument #9672

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/whats_new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ Model evaluation and meta-estimators
- A scorer based on :func:`metrics.brier_score_loss` is also available.
:issue:`9521` by :user:`Hanmin Qin <qinhanmin2014>`.

- The ``axis`` parameter in
:class:`preprocessing.Imputer <preprocessing.Imputer>` is deprecated. Its
removal is planned for the 0.22 release. :issue:`9672` by
:user:`Baze Petrushev <petrushev>`.

Bug fixes
.........

Expand Down
42 changes: 27 additions & 15 deletions sklearn/preprocessing/imputation.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,16 @@ class Imputer(BaseEstimator, TransformerMixin):
- If "most_frequent", then replace missing using the most frequent
value along the axis.

axis : integer, optional (default=0)
axis : integer, optional (default=None)
The axis along which to impute.

- If `axis=0`, then impute along columns.
- If `axis=1`, then impute along rows.

.. deprecated:: 0.20
``axis`` will be removed from ``Imputer``, and it will only impute
along columns (i.e., ``axis=0``) in 0.22.

verbose : integer, optional (default=0)
Controls the verbosity of the imputer.

Expand Down Expand Up @@ -115,7 +119,7 @@ class Imputer(BaseEstimator, TransformerMixin):
contain missing values).
"""
def __init__(self, missing_values="NaN", strategy="mean",
axis=0, verbose=0, copy=True):
axis=None, verbose=0, copy=True):
self.missing_values = missing_values
self.strategy = strategy
self.axis = axis
Expand Down Expand Up @@ -143,27 +147,35 @@ def fit(self, X, y=None):
" got strategy={1}".format(allowed_strategies,
self.strategy))

if self.axis not in [0, 1]:
if self.axis is None:
self._axis = 0
else:
warnings.warn("'axis' will be removed from Imputer, and it will "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The convention is to state the version in which the deprecation happened as well as the one in which it will be removed.

From http://scikit-learn.org/stable/developers/contributing.html#deprecation:

As in these examples, the warning message should always give both the version in which the deprecation happened and the version in which the old behavior will be removed.

"only impute along columns (axis=0) in 0.22",
DeprecationWarning)
self._axis = self.axis

if self._axis not in [0, 1]:
raise ValueError("Can only impute missing values on axis 0 and 1, "
" got axis={0}".format(self.axis))
" got axis={0}".format(self._axis))

# Since two different arrays can be provided in fit(X) and
# transform(X), the imputation data will be computed in transform()
# when the imputation is done per sample (i.e., when axis=1).
if self.axis == 0:
if self._axis == 0:
X = check_array(X, accept_sparse='csc', dtype=np.float64,
force_all_finite=False)

if sparse.issparse(X):
self.statistics_ = self._sparse_fit(X,
self.strategy,
self.missing_values,
self.axis)
self._axis)
else:
self.statistics_ = self._dense_fit(X,
self.strategy,
self.missing_values,
self.axis)
self._axis)

return self

Expand Down Expand Up @@ -306,7 +318,7 @@ def transform(self, X):
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
The input data to complete.
"""
if self.axis == 0:
if self._axis == 0:
check_is_fitted(self, 'statistics_')
X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
force_all_finite=False, copy=self.copy)
Expand All @@ -326,27 +338,27 @@ def transform(self, X):
statistics = self._sparse_fit(X,
self.strategy,
self.missing_values,
self.axis)
self._axis)

else:
statistics = self._dense_fit(X,
self.strategy,
self.missing_values,
self.axis)
self._axis)

# Delete the invalid rows/columns
invalid_mask = np.isnan(statistics)
valid_mask = np.logical_not(invalid_mask)
valid_statistics = statistics[valid_mask]
valid_statistics_indexes = np.where(valid_mask)[0]
missing = np.arange(X.shape[not self.axis])[invalid_mask]
missing = np.arange(X.shape[not self._axis])[invalid_mask]

if self.axis == 0 and invalid_mask.any():
if self._axis == 0 and invalid_mask.any():
if self.verbose:
warnings.warn("Deleting features without "
"observed values: %s" % missing)
X = X[:, valid_statistics_indexes]
elif self.axis == 1 and invalid_mask.any():
elif self._axis == 1 and invalid_mask.any():
raise ValueError("Some rows only contain "
"missing values: %s" % missing)

Expand All @@ -363,10 +375,10 @@ def transform(self, X):
X = X.toarray()

mask = _get_mask(X, self.missing_values)
n_missing = np.sum(mask, axis=self.axis)
n_missing = np.sum(mask, axis=self._axis)
values = np.repeat(valid_statistics, n_missing)

if self.axis == 0:
if self._axis == 0:
coordinates = np.where(mask.transpose())[::-1]
else:
coordinates = mask
Expand Down