From c900dd564bb2e178f8fdd36acbf91f9ea7a8e90d Mon Sep 17 00:00:00 2001 From: petrushev Date: Fri, 1 Sep 2017 15:30:17 +0200 Subject: [PATCH 1/3] Deprecate ``Imputer.axis`` argument --- doc/whats_new.rst | 5 +++++ sklearn/preprocessing/imputation.py | 32 +++++++++++++++++++++-------- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 0ca707ce2cbbf..4e728afcb29e3 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -47,6 +47,11 @@ Model evaluation and meta-estimators - A scorer based on :func:`metrics.brier_score_loss` is also available. :issue:`9521` by :user:`Hanmin Qin <qinhanmin2014>`. +- The ``axis`` parameter in + :class:`preprocessing.Imputer <sklearn.preprocessing.Imputer>` is deprecated. Its + removal is planned for 0.22 release. :issue:`9672` by + :user:`Baze Petrushev <petrushev>`. + Bug fixes ......... diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 12d5425fbf604..048404c11edbb 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -82,12 +82,16 @@ class Imputer(BaseEstimator, TransformerMixin): - If "most_frequent", then replace missing using the most frequent value along the axis. - axis : integer, optional (default=0) + axis : integer, optional (default=None) The axis along which to impute. - If `axis=0`, then impute along columns. - If `axis=1`, then impute along rows. + .. deprecated:: 0.20 + ``axis`` will be removed from ``Imputer``, and it will only impute + along columns (axis=0) in 0.22. + verbose : integer, optional (default=0) Controls the verbosity of the imputer. @@ -115,13 +119,18 @@ class Imputer(BaseEstimator, TransformerMixin): contain missing values). 
""" def __init__(self, missing_values="NaN", strategy="mean", - axis=0, verbose=0, copy=True): + axis=None, verbose=0, copy=True): self.missing_values = missing_values self.strategy = strategy - self.axis = axis self.verbose = verbose self.copy = copy + self.axis = axis + if axis is not None: + warnings.warn("'axis' will be removed from Imputer, and it will " + "only impute along columns (axis=0) in 0.22", + DeprecationWarning) + def fit(self, X, y=None): """Fit the imputer on X. @@ -143,14 +152,14 @@ def fit(self, X, y=None): " got strategy={1}".format(allowed_strategies, self.strategy)) - if self.axis not in [0, 1]: + if self.axis not in [None, 0, 1]: raise ValueError("Can only impute missing values on axis 0 and 1, " " got axis={0}".format(self.axis)) # Since two different arrays can be provided in fit(X) and # transform(X), the imputation data will be computed in transform() # when the imputation is done per sample (i.e., when axis=1). - if self.axis == 0: + if self.axis == 0 or self.axis is None: X = check_array(X, accept_sparse='csc', dtype=np.float64, force_all_finite=False) @@ -169,8 +178,12 @@ def fit(self, X, y=None): def _sparse_fit(self, X, strategy, missing_values, axis): """Fit the transformer on sparse data.""" + if axis is None: + axis = 0 + # Imputation is done "by column", so if we want to do it # by row we only need to convert the matrix to csr format. + if axis == 1: X = X.tocsr() else: @@ -249,6 +262,9 @@ def _sparse_fit(self, X, strategy, missing_values, axis): def _dense_fit(self, X, strategy, missing_values, axis): """Fit the transformer on dense data.""" + if axis is None: + axis = 0 + X = check_array(X, force_all_finite=False) mask = _get_mask(X, missing_values) masked_X = ma.masked_array(X, mask=mask) @@ -306,7 +322,7 @@ def transform(self, X): X : {array-like, sparse matrix}, shape = [n_samples, n_features] The input data to complete. 
""" - if self.axis == 0: + if self.axis is None or self.axis == 0: check_is_fitted(self, 'statistics_') X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, force_all_finite=False, copy=self.copy) @@ -341,7 +357,7 @@ def transform(self, X): valid_statistics_indexes = np.where(valid_mask)[0] missing = np.arange(X.shape[not self.axis])[invalid_mask] - if self.axis == 0 and invalid_mask.any(): + if (self.axis is None or self.axis == 0) and invalid_mask.any(): if self.verbose: warnings.warn("Deleting features without " "observed values: %s" % missing) @@ -366,7 +382,7 @@ def transform(self, X): n_missing = np.sum(mask, axis=self.axis) values = np.repeat(valid_statistics, n_missing) - if self.axis == 0: + if self.axis is None or self.axis == 0: coordinates = np.where(mask.transpose())[::-1] else: coordinates = mask From 7d40c5f94383e7a5ee6cd18bf2c156163572d1ca Mon Sep 17 00:00:00 2001 From: petrushev Date: Sun, 3 Sep 2017 21:39:09 +0200 Subject: [PATCH 2/3] Add proxy property `Imputer.axis_` that is used in the fit/transform and equals to 0 when axis is None. --- doc/whats_new.rst | 2 +- sklearn/preprocessing/imputation.py | 50 +++++++++++++---------------- 2 files changed, 24 insertions(+), 28 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 4e728afcb29e3..120573c1d09c5 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -47,12 +47,12 @@ Model evaluation and meta-estimators - A scorer based on :func:`metrics.brier_score_loss` is also available. :issue:`9521` by :user:`Hanmin Qin <qinhanmin2014>`. + - The ``axis`` parameter in :class:`preprocessing.Imputer <sklearn.preprocessing.Imputer>` is deprecated. Its removal is planned for 0.22 release. :issue:`9672` by :user:`Baze Petrushev <petrushev>`. - Bug fixes ......... 
diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 048404c11edbb..da776067a929b 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -90,7 +90,7 @@ class Imputer(BaseEstimator, TransformerMixin): .. deprecated:: 0.20 ``axis`` will be removed from ``Imputer``, and it will only impute - along columns (axis=0) in 0.22. + along columns (i.e., ``axis=0``) in 0.22. verbose : integer, optional (default=0) Controls the verbosity of the imputer. @@ -122,15 +122,10 @@ def __init__(self, missing_values="NaN", strategy="mean", axis=None, verbose=0, copy=True): self.missing_values = missing_values self.strategy = strategy + self.axis = axis self.verbose = verbose self.copy = copy - self.axis = axis - if axis is not None: - warnings.warn("'axis' will be removed from Imputer, and it will " - "only impute along columns (axis=0) in 0.22", - DeprecationWarning) - def fit(self, X, y=None): """Fit the imputer on X. @@ -152,14 +147,22 @@ def fit(self, X, y=None): " got strategy={1}".format(allowed_strategies, self.strategy)) - if self.axis not in [None, 0, 1]: + if self.axis is None: + self.axis_ = 0 + else: + warnings.warn("'axis' will be removed from Imputer, and it will " + "only impute along columns (axis=0) in 0.22", + DeprecationWarning) + self.axis_ = self.axis + + if self.axis_ not in [0, 1]: raise ValueError("Can only impute missing values on axis 0 and 1, " - " got axis={0}".format(self.axis)) + " got axis={0}".format(self.axis_)) # Since two different arrays can be provided in fit(X) and # transform(X), the imputation data will be computed in transform() # when the imputation is done per sample (i.e., when axis=1). 
- if self.axis == 0 or self.axis is None: + if self.axis_ == 0: X = check_array(X, accept_sparse='csc', dtype=np.float64, force_all_finite=False) @@ -167,23 +170,19 @@ def fit(self, X, y=None): self.statistics_ = self._sparse_fit(X, self.strategy, self.missing_values, - self.axis) + self.axis_) else: self.statistics_ = self._dense_fit(X, self.strategy, self.missing_values, - self.axis) + self.axis_) return self def _sparse_fit(self, X, strategy, missing_values, axis): """Fit the transformer on sparse data.""" - if axis is None: - axis = 0 - # Imputation is done "by column", so if we want to do it # by row we only need to convert the matrix to csr format. - if axis == 1: X = X.tocsr() else: @@ -262,9 +261,6 @@ def _sparse_fit(self, X, strategy, missing_values, axis): def _dense_fit(self, X, strategy, missing_values, axis): """Fit the transformer on dense data.""" - if axis is None: - axis = 0 - X = check_array(X, force_all_finite=False) mask = _get_mask(X, missing_values) masked_X = ma.masked_array(X, mask=mask) @@ -322,7 +318,7 @@ def transform(self, X): X : {array-like, sparse matrix}, shape = [n_samples, n_features] The input data to complete. 
""" - if self.axis is None or self.axis == 0: + if self.axis_ == 0: check_is_fitted(self, 'statistics_') X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, force_all_finite=False, copy=self.copy) @@ -342,27 +338,27 @@ def transform(self, X): statistics = self._sparse_fit(X, self.strategy, self.missing_values, - self.axis) + self.axis_) else: statistics = self._dense_fit(X, self.strategy, self.missing_values, - self.axis) + self.axis_) # Delete the invalid rows/columns invalid_mask = np.isnan(statistics) valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.where(valid_mask)[0] - missing = np.arange(X.shape[not self.axis])[invalid_mask] + missing = np.arange(X.shape[not self.axis_])[invalid_mask] - if (self.axis is None or self.axis == 0) and invalid_mask.any(): + if self.axis_ == 0 and invalid_mask.any(): if self.verbose: warnings.warn("Deleting features without " "observed values: %s" % missing) X = X[:, valid_statistics_indexes] - elif self.axis == 1 and invalid_mask.any(): + elif self.axis_ == 1 and invalid_mask.any(): raise ValueError("Some rows only contain " "missing values: %s" % missing) @@ -379,10 +375,10 @@ def transform(self, X): X = X.toarray() mask = _get_mask(X, self.missing_values) - n_missing = np.sum(mask, axis=self.axis) + n_missing = np.sum(mask, axis=self.axis_) values = np.repeat(valid_statistics, n_missing) - if self.axis is None or self.axis == 0: + if self.axis_ == 0: coordinates = np.where(mask.transpose())[::-1] else: coordinates = mask From 7f88b5fe3c4d969d329111d1f50ac344f5148606 Mon Sep 17 00:00:00 2001 From: petrushev Date: Tue, 5 Sep 2017 22:40:26 +0200 Subject: [PATCH 3/3] Switch to private proxy property `Imputer._axis` --- sklearn/preprocessing/imputation.py | 30 ++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index da776067a929b..881ccc31aed97 
100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -148,21 +148,21 @@ def fit(self, X, y=None): self.strategy)) if self.axis is None: - self.axis_ = 0 + self._axis = 0 else: warnings.warn("'axis' will be removed from Imputer, and it will " "only impute along columns (axis=0) in 0.22", DeprecationWarning) - self.axis_ = self.axis + self._axis = self.axis - if self.axis_ not in [0, 1]: + if self._axis not in [0, 1]: raise ValueError("Can only impute missing values on axis 0 and 1, " - " got axis={0}".format(self.axis_)) + " got axis={0}".format(self._axis)) # Since two different arrays can be provided in fit(X) and # transform(X), the imputation data will be computed in transform() # when the imputation is done per sample (i.e., when axis=1). - if self.axis_ == 0: + if self._axis == 0: X = check_array(X, accept_sparse='csc', dtype=np.float64, force_all_finite=False) @@ -170,12 +170,12 @@ def fit(self, X, y=None): self.statistics_ = self._sparse_fit(X, self.strategy, self.missing_values, - self.axis_) + self._axis) else: self.statistics_ = self._dense_fit(X, self.strategy, self.missing_values, - self.axis_) + self._axis) return self @@ -318,7 +318,7 @@ def transform(self, X): X : {array-like, sparse matrix}, shape = [n_samples, n_features] The input data to complete. 
""" - if self.axis_ == 0: + if self._axis == 0: check_is_fitted(self, 'statistics_') X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, force_all_finite=False, copy=self.copy) @@ -338,27 +338,27 @@ def transform(self, X): statistics = self._sparse_fit(X, self.strategy, self.missing_values, - self.axis_) + self._axis) else: statistics = self._dense_fit(X, self.strategy, self.missing_values, - self.axis_) + self._axis) # Delete the invalid rows/columns invalid_mask = np.isnan(statistics) valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.where(valid_mask)[0] - missing = np.arange(X.shape[not self.axis_])[invalid_mask] + missing = np.arange(X.shape[not self._axis])[invalid_mask] - if self.axis_ == 0 and invalid_mask.any(): + if self._axis == 0 and invalid_mask.any(): if self.verbose: warnings.warn("Deleting features without " "observed values: %s" % missing) X = X[:, valid_statistics_indexes] - elif self.axis_ == 1 and invalid_mask.any(): + elif self._axis == 1 and invalid_mask.any(): raise ValueError("Some rows only contain " "missing values: %s" % missing) @@ -375,10 +375,10 @@ def transform(self, X): X = X.toarray() mask = _get_mask(X, self.missing_values) - n_missing = np.sum(mask, axis=self.axis_) + n_missing = np.sum(mask, axis=self._axis) values = np.repeat(valid_statistics, n_missing) - if self.axis_ == 0: + if self._axis == 0: coordinates = np.where(mask.transpose())[::-1] else: coordinates = mask