diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 0ca707ce2cbbf..120573c1d09c5 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -48,6 +48,11 @@ Model evaluation and meta-estimators - A scorer based on :func:`metrics.brier_score_loss` is also available. :issue:`9521` by :user:`Hanmin Qin <qinhanmin2014>`. +- The ``axis`` parameter in + :class:`preprocessing.Imputer` is deprecated. Its + removal is planned for the 0.22 release. :issue:`9672` by + :user:`Baze Petrushev <petrushev>`. + Bug fixes ......... diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 12d5425fbf604..881ccc31aed97 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -82,12 +82,16 @@ class Imputer(BaseEstimator, TransformerMixin): - If "most_frequent", then replace missing using the most frequent value along the axis. - axis : integer, optional (default=0) + axis : integer, optional (default=None) The axis along which to impute. - If `axis=0`, then impute along columns. - If `axis=1`, then impute along rows. + .. deprecated:: 0.20 + ``axis`` will be removed from ``Imputer``, and it will only impute + along columns (i.e., ``axis=0``) in 0.22. + verbose : integer, optional (default=0) Controls the verbosity of the imputer. @@ -115,7 +119,7 @@ class Imputer(BaseEstimator, TransformerMixin): contain missing values). 
""" def __init__(self, missing_values="NaN", strategy="mean", - axis=0, verbose=0, copy=True): + axis=None, verbose=0, copy=True): self.missing_values = missing_values self.strategy = strategy self.axis = axis @@ -143,14 +147,22 @@ def fit(self, X, y=None): " got strategy={1}".format(allowed_strategies, self.strategy)) - if self.axis not in [0, 1]: + if self.axis is None: + self._axis = 0 + else: + warnings.warn("'axis' will be removed from Imputer, and it will " + "only impute along columns (axis=0) in 0.22", + DeprecationWarning) + self._axis = self.axis + + if self._axis not in [0, 1]: raise ValueError("Can only impute missing values on axis 0 and 1, " - " got axis={0}".format(self.axis)) + " got axis={0}".format(self._axis)) # Since two different arrays can be provided in fit(X) and # transform(X), the imputation data will be computed in transform() # when the imputation is done per sample (i.e., when axis=1). - if self.axis == 0: + if self._axis == 0: X = check_array(X, accept_sparse='csc', dtype=np.float64, force_all_finite=False) @@ -158,12 +170,12 @@ def fit(self, X, y=None): self.statistics_ = self._sparse_fit(X, self.strategy, self.missing_values, - self.axis) + self._axis) else: self.statistics_ = self._dense_fit(X, self.strategy, self.missing_values, - self.axis) + self._axis) return self @@ -306,7 +318,7 @@ def transform(self, X): X : {array-like, sparse matrix}, shape = [n_samples, n_features] The input data to complete. 
""" - if self.axis == 0: + if self._axis == 0: check_is_fitted(self, 'statistics_') X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, force_all_finite=False, copy=self.copy) @@ -326,27 +338,27 @@ def transform(self, X): statistics = self._sparse_fit(X, self.strategy, self.missing_values, - self.axis) + self._axis) else: statistics = self._dense_fit(X, self.strategy, self.missing_values, - self.axis) + self._axis) # Delete the invalid rows/columns invalid_mask = np.isnan(statistics) valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.where(valid_mask)[0] - missing = np.arange(X.shape[not self.axis])[invalid_mask] + missing = np.arange(X.shape[not self._axis])[invalid_mask] - if self.axis == 0 and invalid_mask.any(): + if self._axis == 0 and invalid_mask.any(): if self.verbose: warnings.warn("Deleting features without " "observed values: %s" % missing) X = X[:, valid_statistics_indexes] - elif self.axis == 1 and invalid_mask.any(): + elif self._axis == 1 and invalid_mask.any(): raise ValueError("Some rows only contain " "missing values: %s" % missing) @@ -363,10 +375,10 @@ def transform(self, X): X = X.toarray() mask = _get_mask(X, self.missing_values) - n_missing = np.sum(mask, axis=self.axis) + n_missing = np.sum(mask, axis=self._axis) values = np.repeat(valid_statistics, n_missing) - if self.axis == 0: + if self._axis == 0: coordinates = np.where(mask.transpose())[::-1] else: coordinates = mask