diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 8de10a11ca351..fdaf50364671a 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -550,6 +550,12 @@ Changelog
   `fit` instead of `__init__`. :pr:`21434` by :user:`Krum Arnaudov `.
 
+- |API| Adds :meth:`get_feature_names_out` to
+  :class:`preprocessing.Normalizer`,
+  :class:`preprocessing.KernelCenterer`,
+  :class:`preprocessing.OrdinalEncoder`, and
+  :class:`preprocessing.Binarizer`. :pr:`21079` by `Thomas Fan`_.
+
 :mod:`sklearn.random_projection`
 ................................
 
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
index 835694e11512c..ea38106837642 100644
--- a/sklearn/preprocessing/_data.py
+++ b/sklearn/preprocessing/_data.py
@@ -16,7 +16,12 @@
 from scipy import optimize
 from scipy.special import boxcox
 
-from ..base import BaseEstimator, TransformerMixin, _OneToOneFeatureMixin
+from ..base import (
+    BaseEstimator,
+    TransformerMixin,
+    _OneToOneFeatureMixin,
+    _ClassNamePrefixFeaturesOutMixin,
+)
 from ..utils import check_array
 from ..utils.deprecation import deprecated
 from ..utils.extmath import _incremental_mean_and_var, row_norms
@@ -1825,7 +1830,7 @@ def normalize(X, norm="l2", *, axis=1, copy=True, return_norm=False):
     return X
 
 
-class Normalizer(TransformerMixin, BaseEstimator):
+class Normalizer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
     """Normalize samples individually to unit norm.
 
     Each sample (i.e. each row of the data matrix) with at least one
@@ -1996,7 +2001,7 @@ def binarize(X, *, threshold=0.0, copy=True):
     return X
 
 
-class Binarizer(TransformerMixin, BaseEstimator):
+class Binarizer(_OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
     """Binarize data (set feature values to 0 or 1) according to a threshold.
 
     Values greater than the threshold map to 1, while values less than
@@ -2119,7 +2124,7 @@ def _more_tags(self):
         return {"stateless": True}
 
 
-class KernelCenterer(TransformerMixin, BaseEstimator):
+class KernelCenterer(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
     r"""Center an arbitrary kernel matrix :math:`K`.
 
     Let define a kernel :math:`K` such that:
@@ -2258,6 +2263,15 @@ def transform(self, K, copy=True):
 
         return K
 
+    @property
+    def _n_features_out(self):
+        """Number of transformed output features."""
+        # Used by _ClassNamePrefixFeaturesOutMixin. This model preserves the
+        # number of input features but this is not a one-to-one mapping in the
+        # usual sense. Hence the choice not to use _OneToOneFeatureMixin to
+        # implement get_feature_names_out for this class.
+        return self.n_features_in_
+
     def _more_tags(self):
         return {"pairwise": True}
 
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 4c59cb691527f..b7fcdf616760a 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -7,7 +7,7 @@
 from scipy import sparse
 import numbers
 
-from ..base import BaseEstimator, TransformerMixin
+from ..base import BaseEstimator, TransformerMixin, _OneToOneFeatureMixin
 from ..utils import check_array, is_scalar_nan
 from ..utils.deprecation import deprecated
 from ..utils.validation import check_is_fitted
@@ -731,7 +731,7 @@ def get_feature_names_out(self, input_features=None):
         return np.asarray(feature_names, dtype=object)
 
 
-class OrdinalEncoder(_BaseEncoder):
+class OrdinalEncoder(_OneToOneFeatureMixin, _BaseEncoder):
     """
     Encode categorical features as an integer array.
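The four estimators above gain `get_feature_names_out` through two different mixins: `_OneToOneFeatureMixin` passes input feature names through unchanged, while `_ClassNamePrefixFeaturesOutMixin` generates names from the lowercased class name. A minimal sketch of the resulting behavior (illustrative only, not part of the patch; it assumes a scikit-learn build with this diff applied):

    import numpy as np
    from sklearn.preprocessing import KernelCenterer, Normalizer
    from sklearn.metrics.pairwise import linear_kernel

    X = np.arange(12, dtype=float).reshape(4, 3)

    # One-to-one transformers echo DataFrame column names, or generate
    # x0..x(n_features - 1) for unnamed ndarray input.
    print(Normalizer().fit(X).get_feature_names_out())
    # ['x0' 'x1' 'x2']

    # KernelCenterer consumes an (n_samples, n_samples) kernel matrix, so its
    # outputs are not a one-to-one mapping of the original features; names are
    # prefixed with the lowercased class name instead.
    K = linear_kernel(X)
    print(KernelCenterer().fit(K).get_feature_names_out())
    # ['kernelcenterer0' 'kernelcenterer1' 'kernelcenterer2' 'kernelcenterer3']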
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index 3476e40dd9bbc..ee326aba1b3de 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -45,6 +45,7 @@
 from sklearn.preprocessing import power_transform
 from sklearn.preprocessing._data import _handle_zeros_in_scale
 from sklearn.preprocessing._data import BOUNDS_THRESHOLD
+from sklearn.metrics.pairwise import linear_kernel
 from sklearn.exceptions import NotFittedError
 
 
@@ -2672,6 +2673,8 @@ def test_one_to_one_features(Transformer):
         StandardScaler,
         QuantileTransformer,
         PowerTransformer,
+        Normalizer,
+        Binarizer,
     ],
 )
 def test_one_to_one_features_pandas(Transformer):
@@ -2691,3 +2694,16 @@ def test_one_to_one_features_pandas(Transformer):
     with pytest.raises(ValueError, match=msg):
         invalid_names = list("abcd")
         tr.get_feature_names_out(invalid_names)
+
+
+def test_kernel_centerer_feature_names_out():
+    """Check `KernelCenterer.get_feature_names_out`."""
+
+    rng = np.random.RandomState(0)
+    X = rng.random_sample((6, 4))
+    X_pairwise = linear_kernel(X)
+    centerer = KernelCenterer().fit(X_pairwise)
+
+    names_out = centerer.get_feature_names_out()
+    n_samples_out = X_pairwise.shape[1]
+    assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(n_samples_out)])
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index dcc07d25af5fd..27c52088f80d9 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -1387,3 +1387,15 @@ def test_ordinal_encoder_python_integer():
     assert_array_equal(encoder.categories_, np.sort(X, axis=0).T)
     X_trans = encoder.transform(X)
     assert_array_equal(X_trans, [[0], [3], [2], [1]])
+
+
+def test_ordinal_encoder_features_names_out_pandas():
+    """Check that feature names out are the same as the input column names."""
+    pd = pytest.importorskip("pandas")
+
+    names = ["b", "c", "a"]
+    X = pd.DataFrame([[1, 2, 3]], columns=names)
+    enc = OrdinalEncoder().fit(X)
+
+    feature_names_out = enc.get_feature_names_out()
+    assert_array_equal(names, feature_names_out)
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index be26202d458d1..350e1e95d9882 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -382,7 +382,6 @@ def test_pandas_column_name_consistency(estimator):
 
 
 GET_FEATURES_OUT_MODULES_TO_IGNORE = [
     "ensemble",
     "kernel_approximation",
-    "preprocessing",
 ]
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index a4fa30ce55035..a6459059ba2f6 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -1828,7 +1828,9 @@ def _get_feature_names(X):
 
 
 def _check_feature_names_in(estimator, input_features=None, *, generate_names=True):
-    """Get output feature names for transformation.
+    """Check `input_features` and generate names if needed.
+
+    Commonly used in :term:`get_feature_names_out`.
 
     Parameters
     ----------
@@ -1842,8 +1844,10 @@ def _check_feature_names_in(estimator, input_features=None, *, generate_names=Tr
         match `feature_names_in_` if `feature_names_in_` is defined.
 
     generate_names : bool, default=True
-        Wether to generate names when `input_features` is `None` and
-        `estimator.feature_names_in_` is not defined.
+        Whether to generate names when `input_features` is `None` and
+        `estimator.feature_names_in_` is not defined. This is useful for
+        transformers that validate `input_features` but do not require them in
+        :term:`get_feature_names_out`, e.g. `PCA`.
 
     Returns
    -------
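For context, here is a hedged sketch of the call pattern the updated docstring describes: a transformer's `get_feature_names_out` delegates validation and name generation to this helper. The `DoubleIt` class below is hypothetical, written only to illustrate the contract, and is not part of the patch:

    import numpy as np
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.utils.validation import _check_feature_names_in

    class DoubleIt(TransformerMixin, BaseEstimator):
        """Toy one-to-one transformer (hypothetical example)."""

        def fit(self, X, y=None):
            # Records n_features_in_ (and feature_names_in_ for DataFrames).
            self._validate_data(X)
            return self

        def transform(self, X):
            return np.asarray(X) * 2

        def get_feature_names_out(self, input_features=None):
            # Checks input_features against feature_names_in_ when defined;
            # otherwise generates ["x0", ..., "x(n_features_in_ - 1)"].
            return _check_feature_names_in(self, input_features)

    print(DoubleIt().fit(np.ones((2, 3))).get_feature_names_out())
    # ['x0' 'x1' 'x2']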