diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index a473908d8f1e7..cc1af2f237bf9 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -61,6 +61,13 @@ Changelog error when 'min_idf' or 'max_idf' are floating-point numbers greater than 1. :pr:`20752` by :user:`Alek Lefebvre `. +:mod:`sklearn.impute` +..................... + +- |API| Adds :meth:`get_feature_names_out` to :class:`impute.SimpleImputer`, + :class:`impute.KNNImputer`, :class:`impute.IterativeImputer`, and + :class:`impute.MissingIndicator`. :pr:`21078` by `Thomas Fan`_. + :mod:`sklearn.linear_model` ........................... diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 32ec1624f0c2f..c97a8d24d4578 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -15,6 +15,7 @@ from ..utils.sparsefuncs import _get_median from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES +from ..utils.validation import _check_feature_names_in from ..utils._mask import _get_mask from ..utils import is_scalar_nan @@ -113,6 +114,13 @@ def _concatenate_indicator(self, X_imputed, X_indicator): return hstack((X_imputed, X_indicator)) + def _concatenate_indicator_feature_names_out(self, names, input_features): + if not self.add_indicator: + return names + + indicator_names = self.indicator_.get_feature_names_out(input_features) + return np.concatenate([names, indicator_names]) + def _more_tags(self): return {"allow_nan": is_scalar_nan(self.missing_values)} @@ -596,6 +604,30 @@ def inverse_transform(self, X): X_original[full_mask] = self.missing_values return X_original + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in.
If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + """ + input_features = _check_feature_names_in(self, input_features) + non_missing_mask = np.logical_not(_get_mask(self.statistics_, np.nan)) + names = input_features[non_missing_mask] + return self._concatenate_indicator_feature_names_out(names, input_features) + class MissingIndicator(TransformerMixin, BaseEstimator): """Binary indicators for missing values. @@ -922,6 +954,35 @@ def fit_transform(self, X, y=None): return imputer_mask + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names.
+ """ + input_features = _check_feature_names_in(self, input_features) + prefix = self.__class__.__name__.lower() + return np.asarray( + [ + f"{prefix}_{feature_name}" + for feature_name in input_features[self.features_] + ], + dtype=object, + ) + def _more_tags(self): return { "allow_nan": True, diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 321c1f537520d..908ab5c9efeb1 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -10,6 +10,7 @@ from ..preprocessing import normalize from ..utils import check_array, check_random_state, _safe_indexing, is_scalar_nan from ..utils.validation import FLOAT_DTYPES, check_is_fitted +from ..utils.validation import _check_feature_names_in from ..utils._mask import _get_mask from ._base import _BaseImputer @@ -774,3 +775,26 @@ def fit(self, X, y=None): """ self.fit_transform(X) return self + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + input_features = _check_feature_names_in(self, input_features) + names = self.initial_imputer_.get_feature_names_out(input_features) + return self._concatenate_indicator_feature_names_out(names, input_features) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index c2bd1410e8ecd..ad7e3537d445f 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -13,6 +13,7 @@ from ..utils import is_scalar_nan from ..utils._mask import _get_mask from ..utils.validation import check_is_fitted +from ..utils.validation import _check_feature_names_in class KNNImputer(_BaseImputer): @@ -206,6 +207,7 @@ def fit(self, X, y=None): _check_weights(self.weights) self._fit_X = X self._mask_fit_X = _get_mask(self._fit_X, self.missing_values) + self._valid_mask = ~np.all(self._mask_fit_X, axis=0) super()._fit_indicator(self._mask_fit_X) @@ -242,7 +244,7 @@ def transform(self, X): mask = _get_mask(X, self.missing_values) mask_fit_X = self._mask_fit_X - valid_mask = ~np.all(mask_fit_X, axis=0) + valid_mask = self._valid_mask X_indicator = super()._transform_indicator(mask) @@ -327,3 +329,26 @@ def process_chunk(dist_chunk, start): pass return super()._concatenate_indicator(X[:, valid_mask], X_indicator) + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. + + - If `input_features` is `None`, then `feature_names_in_` is + used as feature names in. If `feature_names_in_` is not defined, + then names are generated: `[x0, x1, ..., x(n_features_in_)]`. + - If `input_features` is an array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. 
+ """ + input_features = _check_feature_names_in(self, input_features) + names = input_features[self._valid_mask] + return self._concatenate_indicator_feature_names_out(names, input_features) diff --git a/sklearn/impute/tests/test_common.py b/sklearn/impute/tests/test_common.py index c35245ac8c253..0c13547ce9b4c 100644 --- a/sklearn/impute/tests/test_common.py +++ b/sklearn/impute/tests/test_common.py @@ -14,7 +14,7 @@ from sklearn.impute import SimpleImputer -IMPUTERS = [IterativeImputer(), KNNImputer(), SimpleImputer()] +IMPUTERS = [IterativeImputer(tol=0.1), KNNImputer(), SimpleImputer()] SPARSE_IMPUTERS = [SimpleImputer()] @@ -122,3 +122,42 @@ def test_imputers_pandas_na_integer_array_support(imputer, add_indicator): X_trans = imputer.fit_transform(X_df) assert_allclose(X_trans_expected, X_trans) + + +@pytest.mark.parametrize("imputer", IMPUTERS, ids=lambda x: x.__class__.__name__) +@pytest.mark.parametrize("add_indicator", [True, False]) +def test_imputers_feature_names_out_pandas(imputer, add_indicator): + """Check feature names out for imputers.""" + pd = pytest.importorskip("pandas") + marker = np.nan + imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker) + + X = np.array( + [ + [marker, 1, 5, 3, marker, 1], + [2, marker, 1, 4, marker, 2], + [6, 3, 7, marker, marker, 3], + [1, 2, 9, 8, marker, 4], + ] + ) + X_df = pd.DataFrame(X, columns=["a", "b", "c", "d", "e", "f"]) + imputer.fit(X_df) + + names = imputer.get_feature_names_out() + + if add_indicator: + expected_names = [ + "a", + "b", + "c", + "d", + "f", + "missingindicator_a", + "missingindicator_b", + "missingindicator_d", + "missingindicator_e", + ] + assert_array_equal(expected_names, names) + else: + expected_names = ["a", "b", "c", "d", "f"] + assert_array_equal(expected_names, names) diff --git a/sklearn/impute/tests/test_impute.py b/sklearn/impute/tests/test_impute.py index 2534f94116b57..9a4da4a9230a0 100644 --- a/sklearn/impute/tests/test_impute.py +++ 
b/sklearn/impute/tests/test_impute.py @@ -1493,3 +1493,22 @@ def test_most_frequent(expected, array, dtype, extra_value, n_repeat): assert expected == _most_frequent( np.array(array, dtype=dtype), extra_value, n_repeat ) + + +def test_missing_indicator_feature_names_out(): + """Check that missing indicator return the feature names with a prefix.""" + pd = pytest.importorskip("pandas") + + missing_values = np.nan + X = pd.DataFrame( + [ + [missing_values, missing_values, 1, missing_values], + [4, missing_values, 2, 10], + ], + columns=["a", "b", "c", "d"], + ) + + indicator = MissingIndicator(missing_values=missing_values).fit(X) + feature_names = indicator.get_feature_names_out() + expected_names = ["missingindicator_a", "missingindicator_b", "missingindicator_d"] + assert_array_equal(expected_names, feature_names) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 4f6818081c67d..139a2bfb9702a 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -365,7 +365,6 @@ def test_pandas_column_name_consistency(estimator): "decomposition", "discriminant_analysis", "ensemble", - "impute", "isotonic", "kernel_approximation", "preprocessing",