diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py
index 58104d23fcf4e..b50cb63588896 100644
--- a/sklearn/ensemble/_bagging.py
+++ b/sklearn/ensemble/_bagging.py
@@ -16,7 +16,7 @@
 from ..base import ClassifierMixin, RegressorMixin
 from ..metrics import r2_score, accuracy_score
 from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
-from ..utils import check_random_state, check_array, column_or_1d
+from ..utils import check_random_state, column_or_1d, deprecated
 from ..utils import indices_to_mask
 from ..utils.metaestimators import if_delegate_has_method
 from ..utils.multiclass import check_classification_targets
@@ -287,7 +287,7 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
             sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
 
         # Remap output
-        n_samples, self.n_features_ = X.shape
+        n_samples = X.shape[0]
         self._n_samples = n_samples
         y = self._validate_y(y)
 
@@ -313,11 +313,11 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
         if isinstance(self.max_features, numbers.Integral):
             max_features = self.max_features
         elif isinstance(self.max_features, float):
-            max_features = self.max_features * self.n_features_
+            max_features = self.max_features * self.n_features_in_
         else:
             raise ValueError("max_features must be int or float")
 
-        if not (0 < max_features <= self.n_features_):
+        if not (0 < max_features <= self.n_features_in_):
             raise ValueError("max_features must be in (0, n_features]")
 
         max_features = max(1, int(max_features))
@@ -408,7 +408,7 @@ def _get_estimators_indices(self):
             # to those in `_parallel_build_estimators()`
             feature_indices, sample_indices = _generate_bagging_indices(
                 seed, self.bootstrap_features, self.bootstrap,
-                self.n_features_, self._n_samples, self._max_features,
+                self.n_features_in_, self._n_samples, self._max_features,
                 self._max_samples)
 
             yield feature_indices, sample_indices
@@ -429,6 +429,16 @@ def estimators_samples_(self):
         return [sample_indices
                 for _, sample_indices in self._get_estimators_indices()]
 
+    # TODO: Remove in 1.2
+    # mypy error: Decorated property not supported
+    @deprecated(  # type: ignore
+        "Attribute n_features_ was deprecated in version 1.0 and will be "
+        "removed in 1.2. Use 'n_features_in_' instead."
+    )
+    @property
+    def n_features_(self):
+        return self.n_features_in_
+
 
 class BaggingClassifier(ClassifierMixin, BaseBagging):
     """A Bagging classifier.
@@ -523,6 +533,10 @@ class BaggingClassifier(ClassifierMixin, BaseBagging):
     n_features_ : int
         The number of features when :meth:`fit` is performed.
 
+        .. deprecated:: 1.0
+            Attribute `n_features_` was deprecated in version 1.0 and will be
+            removed in 1.2. Use `n_features_in_` instead.
+
     estimators_ : list of estimators
         The collection of fitted base estimators.
 
@@ -702,17 +716,11 @@ def predict_proba(self, X):
         """
         check_is_fitted(self)
         # Check data
-        X = check_array(
+        X = self._validate_data(
             X, accept_sparse=['csr', 'csc'], dtype=None,
-            force_all_finite=False
+            force_all_finite=False, reset=False
         )
-        if self.n_features_ != X.shape[1]:
-            raise ValueError("Number of features of the model must "
-                             "match the input. Model n_features is {0} and "
-                             "input n_features is {1}."
- "".format(self.n_features_, X.shape[1])) - # Parallel loop n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators, self.n_jobs) @@ -753,17 +761,11 @@ def predict_log_proba(self, X): check_is_fitted(self) if hasattr(self.base_estimator_, "predict_log_proba"): # Check data - X = check_array( + X = self._validate_data( X, accept_sparse=['csr', 'csc'], dtype=None, - force_all_finite=False + force_all_finite=False, reset=False ) - if self.n_features_ != X.shape[1]: - raise ValueError("Number of features of the model must " - "match the input. Model n_features is {0} " - "and input n_features is {1} " - "".format(self.n_features_, X.shape[1])) - # Parallel loop n_jobs, n_estimators, starts = _partition_estimators( self.n_estimators, self.n_jobs) @@ -811,17 +813,11 @@ def decision_function(self, X): check_is_fitted(self) # Check data - X = check_array( + X = self._validate_data( X, accept_sparse=['csr', 'csc'], dtype=None, - force_all_finite=False + force_all_finite=False, reset=False ) - if self.n_features_ != X.shape[1]: - raise ValueError("Number of features of the model must " - "match the input. Model n_features is {0} and " - "input n_features is {1} " - "".format(self.n_features_, X.shape[1])) - # Parallel loop n_jobs, n_estimators, starts = _partition_estimators(self.n_estimators, self.n_jobs) @@ -929,6 +925,10 @@ class BaggingRegressor(RegressorMixin, BaseBagging): n_features_ : int The number of features when :meth:`fit` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + estimators_ : list of estimators The collection of fitted sub-estimators. @@ -1024,9 +1024,9 @@ def predict(self, X): """ check_is_fitted(self) # Check data - X = check_array( + X = self._validate_data( X, accept_sparse=['csr', 'csc'], dtype=None, - force_all_finite=False + force_all_finite=False, reset=False ) # Parallel loop diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index c97b5b9f12528..b8dd0dd547b61 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -57,7 +57,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier, ExtraTreeRegressor) from ..tree._tree import DTYPE, DOUBLE -from ..utils import check_random_state, check_array, compute_sample_weight +from ..utils import check_random_state, compute_sample_weight, deprecated from ..exceptions import DataConversionWarning from ._base import BaseEnsemble, _partition_estimators from ..utils.fixes import delayed @@ -312,9 +312,6 @@ def fit(self, X, y, sample_weight=None): # ensemble sorts the indices. X.sort_indices() - # Remap output - self.n_features_ = X.shape[1] - y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: warn("A column-vector y was passed when a 1d array was" @@ -446,7 +443,8 @@ def _compute_oob_predictions(self, X, y): (n_samples, 1, n_outputs) The OOB predictions. 
""" - X = check_array(X, dtype=DTYPE, accept_sparse='csr') + X = self._validate_data(X, dtype=DTYPE, accept_sparse='csr', + reset=False) n_samples = y.shape[0] n_outputs = self.n_outputs_ @@ -530,12 +528,22 @@ def feature_importances_(self): for tree in self.estimators_ if tree.tree_.node_count > 1) if not all_importances: - return np.zeros(self.n_features_, dtype=np.float64) + return np.zeros(self.n_features_in_, dtype=np.float64) all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + # TODO: Remove in 1.2 + # mypy error: Decorated property not supported + @deprecated( # type: ignore + "Attribute n_features_ was deprecated in version 1.0 and will be " + "removed in 1.2. Use 'n_features_in_' instead." + ) + @property + def n_features_(self): + return self.n_features_in_ + def _accumulate_prediction(predict, X, out, lock): """ @@ -1163,6 +1171,10 @@ class labels (multi-output problem). n_features_ : int The number of features when ``fit`` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -1463,6 +1475,10 @@ class RandomForestRegressor(ForestRegressor): n_features_ : int The number of features when ``fit`` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -1783,6 +1799,10 @@ class labels (multi-output problem). n_features_ : int The number of features when ``fit`` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -2068,6 +2088,10 @@ class ExtraTreesRegressor(ForestRegressor): n_features_ : int The number of features. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_outputs_ : int The number of outputs. @@ -2292,6 +2316,10 @@ class RandomTreesEmbedding(BaseForest): n_features_ : int The number of features when ``fit`` is performed. + .. deprecated:: 1.0 + Attribute `n_features_` was deprecated in version 1.0 and will be + removed in 1.2. Use `n_features_in_` instead. + n_outputs_ : int The number of outputs when ``fit`` is performed. @@ -2421,7 +2449,7 @@ def fit_transform(self, X, y=None, sample_weight=None): X_transformed : sparse matrix of shape (n_samples, n_out) Transformed dataset. """ - X = check_array(X, accept_sparse=['csc']) + X = self._validate_data(X, accept_sparse=['csc']) if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices. 
diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py
index 15f5404f4701c..e9f7402188860 100644
--- a/sklearn/ensemble/_gb.py
+++ b/sklearn/ensemble/_gb.py
@@ -273,25 +273,25 @@ def _check_params(self):
         if isinstance(self.max_features, str):
             if self.max_features == "auto":
                 if is_classifier(self):
-                    max_features = max(1, int(np.sqrt(self.n_features_)))
+                    max_features = max(1, int(np.sqrt(self.n_features_in_)))
                 else:
-                    max_features = self.n_features_
+                    max_features = self.n_features_in_
             elif self.max_features == "sqrt":
-                max_features = max(1, int(np.sqrt(self.n_features_)))
+                max_features = max(1, int(np.sqrt(self.n_features_in_)))
             elif self.max_features == "log2":
-                max_features = max(1, int(np.log2(self.n_features_)))
+                max_features = max(1, int(np.log2(self.n_features_in_)))
             else:
                 raise ValueError("Invalid value for max_features: %r. "
                                  "Allowed string values are 'auto', 'sqrt' "
                                  "or 'log2'." % self.max_features)
         elif self.max_features is None:
-            max_features = self.n_features_
+            max_features = self.n_features_in_
         elif isinstance(self.max_features, numbers.Integral):
             max_features = self.max_features
         else:  # float
             if 0. < self.max_features <= 1.:
                 max_features = max(int(self.max_features *
-                                       self.n_features_), 1)
+                                       self.n_features_in_), 1)
             else:
                 raise ValueError("max_features must be in (0, n_features]")
 
@@ -411,7 +411,6 @@ def fit(self, X, y, sample_weight=None, monitor=None):
 
         X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
                                    dtype=DTYPE, multi_output=True)
-        n_samples, self.n_features_ = X.shape
 
         sample_weight_is_none = sample_weight is None
 
@@ -608,9 +607,6 @@ def _raw_predict_init(self, X):
         """Check input and compute raw predictions of the init estimator."""
         self._check_initialized()
         X = self.estimators_[0, 0]._validate_X_predict(X, check_input=True)
-        if X.shape[1] != self.n_features_:
-            raise ValueError("X.shape[1] should be {0:d}, not {1:d}.".format(
-                self.n_features_, X.shape[1]))
         if self.init_ == 'zero':
             raw_predictions = np.zeros(shape=(X.shape[0], self.loss_.K),
                                        dtype=np.float64)
@@ -647,7 +643,8 @@ def _staged_raw_predict(self, X):
             Regression and binary classification are special cases with
             ``k == 1``, otherwise ``k==n_classes``.
         """
-        X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr')
+        X = self._validate_data(X, dtype=DTYPE, order="C", accept_sparse='csr',
+                                reset=False)
         raw_predictions = self._raw_predict_init(X)
         for i in range(self.estimators_.shape[0]):
             predict_stage(self.estimators_, i, X, self.learning_rate,
@@ -681,7 +678,7 @@ def feature_importances_(self):
                           if tree.tree_.node_count > 1]
         if not relevant_trees:
             # degenerate case where all trees have only one node
-            return np.zeros(shape=self.n_features_, dtype=np.float64)
+            return np.zeros(shape=self.n_features_in_, dtype=np.float64)
 
         relevant_feature_importances = [
             tree.tree_.compute_feature_importances(normalize=False)
@@ -764,6 +761,16 @@ def apply(self, X):
 
         return leaves
 
+    # TODO: Remove in 1.2
+    # mypy error: Decorated property not supported
+    @deprecated(  # type: ignore
+        "Attribute n_features_ was deprecated in version 1.0 and will be "
+        "removed in 1.2. Use 'n_features_in_' instead."
+    )
+    @property
+    def n_features_(self):
+        return self.n_features_in_
+
 
 class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
     """Gradient Boosting for classification.
@@ -1005,7 +1012,7 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
         Set via the ``init`` argument or ``loss.init_estimator``.
 
     estimators_ : ndarray of DecisionTreeRegressor of \
-shape (n_estimators, ``loss_.K``)
+            shape (n_estimators, ``loss_.K``)
         The collection of fitted sub-estimators.
         ``loss_.K`` is 1 for binary classification, otherwise n_classes.
 
@@ -1015,6 +1022,10 @@ class GradientBoostingClassifier(ClassifierMixin, BaseGradientBoosting):
     n_features_ : int
         The number of data features.
 
+        .. deprecated:: 1.0
+            Attribute `n_features_` was deprecated in version 1.0 and will be
+            removed in 1.2. Use `n_features_in_` instead.
+
     n_classes_ : int
         The number of classes.
 
@@ -1140,7 +1151,8 @@ def decision_function(self, X):
             :term:`classes_`. Regression and binary classification produce an
             array of shape (n_samples,).
         """
-        X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr')
+        X = self._validate_data(X, dtype=DTYPE, order="C", accept_sparse='csr',
+                                reset=False)
         raw_predictions = self._raw_predict(X)
         if raw_predictions.shape[1] == 1:
             return raw_predictions.ravel()
@@ -1548,6 +1560,10 @@ class GradientBoostingRegressor(RegressorMixin, BaseGradientBoosting):
     n_features_ : int
         The number of data features.
 
+        .. deprecated:: 1.0
+            Attribute `n_features_` was deprecated in version 1.0 and will be
+            removed in 1.2. Use `n_features_in_` instead.
+
     max_features_ : int
         The inferred value of max_features.
 
@@ -1647,7 +1663,8 @@ def predict(self, X):
         y : ndarray of shape (n_samples,)
             The predicted values.
         """
-        X = check_array(X, dtype=DTYPE, order="C", accept_sparse='csr')
+        X = self._validate_data(X, dtype=DTYPE, order="C", accept_sparse='csr',
+                                reset=False)
         # In regression we can directly return the raw value from the trees.
         return self._raw_predict(X).ravel()
 
diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
index d30cd030bf698..562ec8ba32b48 100644
--- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
+++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -8,7 +8,7 @@
 from timeit import default_timer as time
 from ...base import (BaseEstimator, RegressorMixin, ClassifierMixin,
                      is_classifier)
-from ...utils import check_random_state, check_array, resample
+from ...utils import check_random_state, resample
 from ...utils.validation import (check_is_fitted,
                                  check_consistent_length,
                                  _check_sample_weight,
@@ -733,7 +733,8 @@ def _raw_predict(self, X):
         """
         is_binned = getattr(self, '_in_fit', False)
         dtype = X_BINNED_DTYPE if is_binned else X_DTYPE
-        X = check_array(X, dtype=dtype, force_all_finite=False)
+        X = self._validate_data(X, dtype=dtype, force_all_finite=False,
+                                reset=False)
         check_is_fitted(self)
         if X.shape[1] != self._n_features:
             raise ValueError(
@@ -789,7 +790,8 @@ def _staged_raw_predict(self, X):
             The raw predictions of the input samples. The order of the
             classes corresponds to that in the attribute :term:`classes_`.
         """
-        X = check_array(X, dtype=X_DTYPE, force_all_finite=False)
+        X = self._validate_data(X, dtype=X_DTYPE, force_all_finite=False,
+                                reset=False)
         check_is_fitted(self)
         if X.shape[1] != self._n_features:
             raise ValueError(
diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py
index ce7f77edb1de4..ee1fdadff2d24 100644
--- a/sklearn/ensemble/_iforest.py
+++ b/sklearn/ensemble/_iforest.py
@@ -144,6 +144,10 @@ class IsolationForest(OutlierMixin, BaseBagging):
     n_features_ : int
         The number of features when ``fit`` is performed.
 
+        .. deprecated:: 1.0
+            Attribute `n_features_` was deprecated in version 1.0 and will be
+            removed in 1.2. Use `n_features_in_` instead.
+
     Notes
     -----
     The implementation is based on an ensemble of ExtraTreeRegressor. The
@@ -238,7 +242,7 @@ def fit(self, X, y=None, sample_weight=None):
         self : object
             Fitted estimator.
         """
-        X = check_array(X, accept_sparse=['csc'])
+        X = self._validate_data(X, accept_sparse=['csc'])
         if issparse(X):
             # Pre-sort indices to avoid that each individual tree of the
             # ensemble sorts the indices.
@@ -309,7 +313,7 @@ def predict(self, X):
             be considered as an inlier according to the fitted model.
         """
         check_is_fitted(self)
-        X = check_array(X, accept_sparse='csr')
+        X = self._validate_data(X, accept_sparse='csr', reset=False)
         is_inlier = np.ones(X.shape[0], dtype=int)
         is_inlier[self.decision_function(X) < 0] = -1
         return is_inlier
@@ -375,12 +379,7 @@ def score_samples(self, X):
         check_is_fitted(self)
 
         # Check data
-        X = check_array(X, accept_sparse='csr')
-        if self.n_features_ != X.shape[1]:
-            raise ValueError("Number of features of the model must "
-                             "match the input. Model n_features is {0} and "
-                             "input n_features is {1}."
-                             "".format(self.n_features_, X.shape[1]))
+        X = self._validate_data(X, accept_sparse='csr', reset=False)
 
         # Take the opposite of the scores as bigger is better (here less
         # abnormal)
diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py
index 3ea94cff7da53..d5354232a4385 100644
--- a/sklearn/ensemble/_weight_boosting.py
+++ b/sklearn/ensemble/_weight_boosting.py
@@ -33,7 +33,7 @@
 from ..base import ClassifierMixin, RegressorMixin, is_classifier, is_regressor
 
 from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
-from ..utils import check_array, check_random_state, _safe_indexing
+from ..utils import check_random_state, _safe_indexing
 from ..utils.extmath import softmax
 from ..utils.extmath import stable_cumsum
 from ..metrics import accuracy_score, r2_score
@@ -73,8 +73,10 @@ def __init__(self,
         self.random_state = random_state
 
     def _check_X(self, X):
-        return check_array(X, accept_sparse=['csr', 'csc'], ensure_2d=True,
-                           allow_nd=True, dtype=None)
+        # Only called to validate X in non-fit methods, therefore reset=False
+        return self._validate_data(
+            X, accept_sparse=['csr', 'csc'], ensure_2d=True, allow_nd=True,
+            dtype=None, reset=False)
 
     def fit(self, X, y, sample_weight=None):
         """Build a boosted classifier/regressor from the training set (X, y).
diff --git a/sklearn/ensemble/tests/test_bagging.py b/sklearn/ensemble/tests/test_bagging.py
index e7cb11185fa5c..b17cbf7c147ac 100644
--- a/sklearn/ensemble/tests/test_bagging.py
+++ b/sklearn/ensemble/tests/test_bagging.py
@@ -480,15 +480,6 @@ def test_parallel_classification():
     decisions2 = ensemble.decision_function(X_test)
     assert_array_almost_equal(decisions1, decisions2)
 
-    X_err = np.hstack((X_test, np.zeros((X_test.shape[0], 1))))
-    err_msg = (
-        f"Number of features of the model must match the input. Model "
-        f"n_features is {X_test.shape[1]} and input n_features is "
-        f"{X_err.shape[1]} "
-    )
-    with pytest.raises(ValueError, match=err_msg):
-        ensemble.decision_function(X_err)
-
     ensemble = BaggingClassifier(SVC(decision_function_shape='ovr'),
                                  n_jobs=1,
                                  random_state=0).fit(X_train, y_train)
@@ -921,3 +912,16 @@ def fit(self, X, y):
 
     assert_array_equal(clf.estimators_[0]._sample_indices,
                        clf.estimators_samples_[0])
+
+
+# FIXME: remove in 1.2
+@pytest.mark.parametrize("Estimator", [BaggingClassifier, BaggingRegressor])
+def test_n_features_deprecation(Estimator):
+    # Check that we raise the proper deprecation warning if accessing
+    # `n_features_`.
+    X = np.array([[1, 2], [3, 4]])
+    y = np.array([1, 0])
+    est = Estimator().fit(X, y)
+
+    with pytest.warns(FutureWarning, match="n_features_ was deprecated"):
+        est.n_features_
diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py
index 89ded326d21aa..c05cad26708b4 100644
--- a/sklearn/ensemble/tests/test_forest.py
+++ b/sklearn/ensemble/tests/test_forest.py
@@ -1476,3 +1476,21 @@ def test_little_tree_with_small_max_samples(ForestClass):
 
     msg = "Tree without `max_samples` restriction should have more nodes"
     assert tree1.node_count > tree2.node_count, msg
+
+
+# FIXME: remove in 1.2
+@pytest.mark.parametrize(
+    "Estimator",
+    [ExtraTreesClassifier, ExtraTreesRegressor,
+     RandomForestClassifier, RandomForestRegressor,
+     RandomTreesEmbedding]
+)
+def test_n_features_deprecation(Estimator):
+    # Check that we raise the proper deprecation warning if accessing
+    # `n_features_`.
+    X = np.array([[1, 2], [3, 4]])
+    y = np.array([1, 0])
+    est = Estimator().fit(X, y)
+
+    with pytest.warns(FutureWarning, match="n_features_ was deprecated"):
+        est.n_features_
diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py
index 57ac93f52d0d3..63d4e668e674f 100644
--- a/sklearn/ensemble/tests/test_gradient_boosting.py
+++ b/sklearn/ensemble/tests/test_gradient_boosting.py
@@ -1353,3 +1353,19 @@ def test_criterion_mae_deprecation(estimator):
            "will be removed in version 1.1")
     with pytest.warns(FutureWarning, match=msg):
         estimator.fit(X, y)
+
+
+# FIXME: remove in 1.2
+@pytest.mark.parametrize(
+    "Estimator",
+    [GradientBoostingClassifier, GradientBoostingRegressor]
+)
+def test_n_features_deprecation(Estimator):
+    # Check that we raise the proper deprecation warning if accessing
+    # `n_features_`.
+    X = np.array([[1, 2], [3, 4]])
+    y = np.array([1, 0])
+    est = Estimator().fit(X, y)
+
+    with pytest.warns(FutureWarning, match="n_features_ was deprecated"):
+        est.n_features_
diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index de0c56fff793b..0b3a521346b30 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -345,3 +345,15 @@ def test_iforest_with_uniform_data():
     assert all(iforest.predict(X) == 1)
     assert all(iforest.predict(rng.randn(100, 10)) == 1)
     assert all(iforest.predict(np.ones((100, 10))) == 1)
+
+
+# FIXME: remove in 1.2
+def test_n_features_deprecation():
+    # Check that we raise the proper deprecation warning if accessing
+    # `n_features_`.
+    X = np.array([[1, 2], [3, 4]])
+    y = np.array([1, 0])
+    est = IsolationForest().fit(X, y)
+
+    with pytest.warns(FutureWarning, match="n_features_ was deprecated"):
+        est.n_features_
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index dbac492d5efb9..4cdae851f9b9c 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -266,7 +266,6 @@ def test_search_cv(estimator, check, request):
 N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = {
     'calibration',
     'compose',
-    'ensemble',
     'feature_extraction',
     'isotonic',
     'manifold',
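
Each of the removed "Number of features of the model must match the input" blocks is subsumed by `self._validate_data(..., reset=False)`: with `reset=True` (the default, used during `fit`) the estimator records `n_features_in_`, and with `reset=False` the new input is checked against that value, raising one consistent `ValueError` for every estimator. That centralization is also what lets `test_common.py` stop ignoring the `ensemble` module in the `n_features_in_` check. A minimal sketch of those semantics, assuming `BaseEstimator._validate_data` behaves as in scikit-learn 1.0 (the `Echo` estimator is hypothetical):

    import numpy as np
    from sklearn.base import BaseEstimator


    class Echo(BaseEstimator):
        def fit(self, X, y=None):
            # reset=True stores X.shape[1] as self.n_features_in_
            self._validate_data(X, reset=True)
            return self

        def predict(self, X):
            # reset=False validates X against the stored n_features_in_
            X = self._validate_data(X, reset=False)
            return np.zeros(X.shape[0])


    est = Echo().fit(np.array([[1, 2], [3, 4]]))
    est.predict(np.array([[5, 6]]))  # OK: two features, as seen in fit
    try:
        est.predict(np.array([[5, 6, 7]]))  # three features -> mismatch
    except ValueError as exc:
        # e.g. "X has 3 features, but Echo is expecting 2 features as input."
        print(exc)
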