diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst index a9ebaaced0672..96aa942fb9238 100644 --- a/doc/developers/develop.rst +++ b/doc/developers/develop.rst @@ -226,6 +226,17 @@ the dataset, e.g. when ``X`` is a precomputed kernel matrix. Specifically, the :term:`_pairwise` property is used by ``utils.metaestimators._safe_split`` to slice rows and columns. +Universal attributes +^^^^^^^^^^^^^^^^^^^^ + +Estimators that expect tabular input should set a `n_features_in_` +attribute at `fit` time to indicate the number of features that the estimator +expects for subsequent calls to `predict` or `transform`. +See +`SLEP010 +`_ +for details. + .. _rolling_your_own_estimator: Rolling your own estimator diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 368d92a012097..89622c52c5041 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -14,7 +14,6 @@ Version 0.23.0 Put the changes in their relevant module. - Changed models -------------- @@ -378,3 +377,12 @@ Changelog - |Fix| :class:`cluster.AgglomerativeClustering` add specific error when distance matrix is not square and `affinity=precomputed`. :pr:`16257` by :user:`Simona Maggio `. + +Miscellaneous +............. + +- |API| Most estimators now expose a `n_features_in_` attribute. This + attribute is equal to the number of features passed to the `fit` method. + See `SLEP010 + `_ + for details. :pr:`16112` by `Nicolas Hug`_. diff --git a/sklearn/base.py b/sklearn/base.py index e56e13872bffb..70dec8c030418 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -18,6 +18,8 @@ from . import __version__ from .utils import _IS_32BIT +from .utils.validation import check_X_y +from .utils.validation import check_array _DEFAULT_TAGS = { 'non_deterministic': False, @@ -343,6 +345,72 @@ def _get_tags(self): collected_tags.update(more_tags) return collected_tags + def _check_n_features(self, X, reset): + """Set the `n_features_in_` attribute, or check against it. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_samples, n_features) + The input samples. + reset : bool + If True, the `n_features_in_` attribute is set to `X.shape[1]`. + Else, the attribute must already exist and the function checks + that it is equal to `X.shape[1]`. + """ + n_features = X.shape[1] + + if reset: + self.n_features_in_ = n_features + else: + if not hasattr(self, 'n_features_in_'): + raise RuntimeError( + "The reset parameter is False but there is no " + "n_features_in_ attribute. Is this estimator fitted?" + ) + if n_features != self.n_features_in_: + raise ValueError( + 'X has {} features, but this {} is expecting {} features ' + 'as input.'.format(n_features, self.__class__.__name__, + self.n_features_in_) + ) + + def _validate_data(self, X, y=None, reset=True, **check_params): + """Validate input data and set or check the `n_features_in_` attribute. + + Parameters + ---------- + X : {array-like, sparse matrix, dataframe} of shape \ + (n_samples, n_features) + The input samples. + y : array-like of shape (n_samples,), default=None + The targets. If None, `check_array` is called on `X` and + `check_X_y` is called otherwise. + reset : bool, default=True + Whether to reset the `n_features_in_` attribute. + If False, the input will be checked for consistency with data + provided when reset was last True. + **check_params : kwargs + Parameters passed to :func:`sklearn.utils.check_array` or + :func:`sklearn.utils.check_X_y`. 
+ + Returns + ------- + out : {ndarray, sparse matrix} or tuple of these + The validated input. A tuple is returned if `y` is not None. + """ + + if y is None: + X = check_array(X, **check_params) + out = X + else: + X, y = check_X_y(X, y, **check_params) + out = X, y + + if check_params.get('ensure_2d', True): + self._check_n_features(X, reset=reset) + + return out + class ClassifierMixin: """Mixin class for all classifiers in scikit-learn.""" diff --git a/sklearn/calibration.py b/sklearn/calibration.py index ff9c4b3e75c44..a5490efa28c0a 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -124,8 +124,8 @@ def fit(self, X, y, sample_weight=None): self : object Returns an instance of self. """ - X, y = check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'], - force_all_finite=False, allow_nd=True) + X, y = self._validate_data(X, y, accept_sparse=['csc', 'csr', 'coo'], + force_all_finite=False, allow_nd=True) X, y = indexable(X, y) le = LabelBinarizer().fit(y) self.classes_ = le.classes_ diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 5f9ce02d869b7..9516c8e4bdd05 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -374,7 +374,7 @@ def fit(self, X, y=None): accept_sparse = False else: accept_sparse = 'csr' - X = check_array(X, accept_sparse=accept_sparse) + X = self._validate_data(X, accept_sparse=accept_sparse) if self.affinity == "precomputed": self.affinity_matrix_ = X elif self.affinity == "euclidean": diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 6bc5d30adee2b..182ae4b481116 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -809,7 +809,7 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, ensure_min_samples=2, estimator=self) + X = self._validate_data(X, ensure_min_samples=2, estimator=self) memory = check_memory(self.memory) if self.n_clusters is not None and self.n_clusters <= 0: @@ -1055,9 +1055,14 @@ def fit(self, X, y=None, **params): ------- self """ - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - ensure_min_features=2, estimator=self) - return AgglomerativeClustering.fit(self, X.T, **params) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + ensure_min_features=2, estimator=self) + # save the n_features_in_ attribute here and restore it afterwards, + # because it will be overridden by AgglomerativeClustering.fit, which + # receives X.T.
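+ # Rough sketch of the intended behaviour (hypothetical shapes, not part + # of this diff): for X of shape (n_samples, 5), + #     FeatureAgglomeration(n_clusters=2).fit(X).n_features_in_ == 5 + # even though the underlying AgglomerativeClustering.fit sees X.T.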
+ n_features_in_ = self.n_features_in_ + AgglomerativeClustering.fit(self, X.T, **params) + self.n_features_in_ = n_features_in_ + return self @property def fit_predict(self): diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index 356072f2d33aa..c98272d6aae33 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -121,7 +121,7 @@ def fit(self, X, y=None): warnings.warn("'n_jobs' was deprecated in version 0.23 and will be" " removed in 0.25.", FutureWarning) - X = check_array(X, accept_sparse='csr', dtype=np.float64) + X = self._validate_data(X, accept_sparse='csr', dtype=np.float64) self._check_parameters() self._fit(X) return self diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 68c07cc5a2860..1d81dafc7504d 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -463,7 +463,7 @@ def fit(self, X, y=None): return self._fit(X) def _fit(self, X): - X = check_array(X, accept_sparse='csr', copy=self.copy) + X = self._validate_data(X, accept_sparse='csr', copy=self.copy) threshold = self.threshold branching_factor = self.branching_factor diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index 92a2b0f716ac7..6a33f411886b0 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -308,7 +308,7 @@ def fit(self, X, y=None, sample_weight=None): self """ - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') if not self.eps > 0.0: raise ValueError("eps must be positive.") diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index efe4c9fad23b3..27ec0e5f388f6 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -975,8 +975,10 @@ def fit(self, X, y=None, sample_weight=None): ' got %d instead' % self.max_iter ) - X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32], - order='C', copy=self.copy_x, accept_large_sparse=False) + X = self._validate_data(X, accept_sparse='csr', + dtype=[np.float64, np.float32], + order='C', copy=self.copy_x, + accept_large_sparse=False) # verify that the number of samples given is larger than k if _num_samples(X) < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" % ( @@ -1591,8 +1593,8 @@ def fit(self, X, y=None, sample_weight=None): self """ random_state = check_random_state(self.random_state) - X = check_array(X, accept_sparse="csr", order='C', - dtype=[np.float64, np.float32]) + X = self._validate_data(X, accept_sparse="csr", order='C', + dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if n_samples < self.n_clusters: raise ValueError("n_samples=%d should be >= n_clusters=%d" diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 5245a187bbc86..dc90967ebe5dc 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -368,7 +368,7 @@ def fit(self, X, y=None): y : Ignored """ - X = check_array(X) + X = self._validate_data(X) bandwidth = self.bandwidth if bandwidth is None: bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs) diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 8e68b45d9a369..92322b0ab0bfd 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -245,7 +245,7 @@ def fit(self, X, y=None): self : instance of OPTICS The instance. 
""" - X = check_array(X, dtype=np.float) + X = self._validate_data(X, dtype=np.float) if self.cluster_method not in ['dbscan', 'xi']: raise ValueError("cluster_method should be one of" diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index 0f5f816a993cc..2faddabefa157 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -474,8 +474,8 @@ def fit(self, X, y=None): self """ - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=np.float64, ensure_min_samples=2) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=np.float64, ensure_min_samples=2) allow_squared = self.affinity in ["precomputed", "precomputed_nearest_neighbors"] if X.shape[0] == X.shape[1] and not allow_squared: diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py index a31e61dd2423d..7d5a920600d7d 100644 --- a/sklearn/cluster/tests/test_bicluster.py +++ b/sklearn/cluster/tests/test_bicluster.py @@ -254,6 +254,17 @@ def test_wrong_shape(): model.fit(data) +@pytest.mark.parametrize('est', + (SpectralBiclustering(), SpectralCoclustering())) +def test_n_features_in_(est): + + X, _, _ = make_biclusters((3, 3), 3, random_state=0) + + assert not hasattr(est, 'n_features_in_') + est.fit(X) + assert est.n_features_in_ == 3 + + @pytest.mark.parametrize("klass", [SpectralBiclustering, SpectralCoclustering]) @pytest.mark.parametrize("n_jobs", [None, 1]) def test_n_jobs_deprecated(klass, n_jobs): diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index f5526ec185875..e94757bca6993 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -513,6 +513,8 @@ def fit_transform(self, X, y=None): else: self._feature_names_in = None X = _check_X(X) + # set n_features_in_ attribute + self._check_n_features(X, reset=True) self._validate_transformers() self._validate_column_callables(X) self._validate_remainder(X) @@ -587,6 +589,7 @@ def transform(self, X): 'and for transform when using the ' 'remainder keyword') + # TODO: also call _check_n_features(reset=False) in 0.24 self._validate_features(X.shape[1], X_feature_names) Xs = self._fit_transform(X, None, _transform_one, fitted=True) self._validate_output(Xs) diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index aad8050cb689e..27f4ef63edf68 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -10,6 +10,7 @@ from ..utils.validation import check_is_fitted from ..utils import check_array, _safe_indexing from ..preprocessing import FunctionTransformer +from ..exceptions import NotFittedError __all__ = ['TransformedTargetRegressor'] @@ -235,3 +236,17 @@ def predict(self, X): def _more_tags(self): return {'poor_score': True, 'no_validation': True} + + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() returns False the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." 
+ .format(self.__class__.__name__) + ) from nfe + + return self.regressor_.n_features_in_ diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index f6a49f4cd6601..d26f6895427bb 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -1187,6 +1187,18 @@ def test_column_transformer_mask_indexing(array_type): assert X_trans.shape == (3, 2) +def test_n_features_in(): + # make sure n_features_in is what is passed as input to the column + # transformer. + + X = [[1, 2], [3, 4], [5, 6]] + ct = ColumnTransformer([('a', DoubleTrans(), [0]), + ('b', DoubleTrans(), [1])]) + assert not hasattr(ct, 'n_features_in_') + ct.fit(X) + assert ct.n_features_in_ == 2 + + @pytest.mark.parametrize('cols, pattern, include, exclude', [ (['col_int', 'col_float'], None, np.number, None), (['col_int', 'col_float'], None, None, object), diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index 2f617b981ca54..30520b64dc507 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -198,7 +198,7 @@ def fit(self, X, y=None): ------- self : object """ - X = check_array(X) + X = self._validate_data(X) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index 9dbf786839585..4fd1b01d763b2 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -382,8 +382,8 @@ def fit(self, X, y=None): self : object """ # Covariance does not make sense for a single feature - X = check_array(X, ensure_min_features=2, ensure_min_samples=2, - estimator=self) + X = self._validate_data(X, ensure_min_features=2, ensure_min_samples=2, + estimator=self) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) @@ -659,7 +659,7 @@ def fit(self, X, y=None): self : object """ # Covariance does not make sense for a single feature - X = check_array(X, ensure_min_features=2, estimator=self) + X = self._validate_data(X, ensure_min_features=2, estimator=self) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index 93624931c4303..586c39cadecbe 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -637,7 +637,7 @@ def fit(self, X, y=None): ------- self : object """ - X = check_array(X, ensure_min_samples=2, estimator='MinCovDet') + X = self._validate_data(X, ensure_min_samples=2, estimator='MinCovDet') random_state = check_random_state(self.random_state) n_samples, n_features = X.shape # check that the empirical covariance is full rank diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 61907274ad823..b2c43cb3eb3cd 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -140,7 +140,7 @@ def fit(self, X, y=None): ------- self : object """ - X = check_array(X) + X = self._validate_data(X) # Not calling the parent object to fit, to avoid a potential # matrix inversion when setting the precision if self.assume_centered: @@ -412,7 +412,7 @@ def fit(self, X, y=None): """ # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) - X = check_array(X) + X = self._validate_data(X) if 
self.assume_centered: self.location_ = np.zeros(X.shape[1]) else: @@ -562,7 +562,7 @@ def fit(self, X, y=None): ------- self : object """ - X = check_array(X) + X = self._validate_data(X) # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) if self.assume_centered: diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 125c5946b1562..af81ece6baf58 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -277,8 +277,8 @@ def fit(self, X, Y): # copy since this will contains the residuals (deflated) matrices check_consistent_length(X, Y) - X = check_array(X, dtype=np.float64, copy=self.copy, - ensure_min_samples=2) + X = self._validate_data(X, dtype=np.float64, copy=self.copy, + ensure_min_samples=2) Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) if Y.ndim == 1: Y = Y.reshape(-1, 1) @@ -886,8 +886,8 @@ def fit(self, X, Y): """ # copy since this will contains the centered data check_consistent_length(X, Y) - X = check_array(X, dtype=np.float64, copy=self.copy, - ensure_min_samples=2) + X = self._validate_data(X, dtype=np.float64, copy=self.copy, + ensure_min_samples=2) Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False) if Y.ndim == 1: Y = Y.reshape(-1, 1) diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index f19305dbfc272..49b78a0916e7a 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1043,6 +1043,10 @@ def fit(self, X, y=None): """ return self + @property + def n_features_in_(self): + return self.components_.shape[1] + class DictionaryLearning(SparseCodingMixin, BaseEstimator): """Dictionary learning @@ -1217,7 +1221,7 @@ def fit(self, X, y=None): Returns the object itself """ random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_data(X) if self.n_components is None: n_components = X.shape[1] else: @@ -1424,7 +1428,7 @@ def fit(self, X, y=None): Returns the instance itself. 
""" random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_data(X) U, (A, B), self.n_iter_ = dict_learning_online( X, self.n_components, self.alpha, diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 21bf89ae056d8..7147fd452559c 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -168,7 +168,7 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, copy=self.copy, dtype=np.float64) + X = self._validate_data(X, copy=self.copy, dtype=np.float64) n_samples, n_features = X.shape n_components = self.n_components diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 44e665556b805..f9e3a148f6860 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -424,14 +424,12 @@ def _fit(self, X, compute_sources=False): ------- X_new : array-like, shape (n_samples, n_components) """ + + X = self._validate_data(X, copy=self.whiten, dtype=FLOAT_DTYPES, + ensure_min_samples=2).T fun_args = {} if self.fun_args is None else self.fun_args random_state = check_random_state(self.random_state) - # make interface compatible with other decompositions - # a copy is required only for non whitened data - X = check_array(X, copy=self.whiten, dtype=FLOAT_DTYPES, - ensure_min_samples=2).T - alpha = fun_args.get('alpha', 1.0) if not 1 <= alpha <= 2: raise ValueError('alpha must be in [1,2]') diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index fe7c57c61999a..2a0d19d373dbb 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -194,8 +194,8 @@ def fit(self, X, y=None): self.singular_values_ = None self.noise_variance_ = None - X = check_array(X, accept_sparse=['csr', 'csc', 'lil'], - copy=self.copy, dtype=[np.float64, np.float32]) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'lil'], + copy=self.copy, dtype=[np.float64, np.float32]) n_samples, n_features = X.shape if self.batch_size is None: diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index a8559f341591b..b1f83c8e0ff81 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -275,7 +275,7 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - X = check_array(X, accept_sparse='csr', copy=self.copy_X) + X = self._validate_data(X, accept_sparse='csr', copy=self.copy_X) self._centerer = KernelCenterer() K = self._get_kernel(X) self._fit_transform(K) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 48b52df811734..ba68e03a16191 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -467,7 +467,7 @@ def _em_step(self, X, total_samples, batch_update, parallel=None): def _more_tags(self): return {'requires_positive_X': True} - def _check_non_neg_array(self, X, whom): + def _check_non_neg_array(self, X, reset_n_features, whom): """check X format check X format and make sure no negative value in X. 
@@ -477,7 +477,8 @@ def _check_non_neg_array(self, X, whom): X : array-like or sparse matrix """ - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, reset=reset_n_features, + accept_sparse='csr') check_non_negative(X, whom) return X @@ -496,13 +497,23 @@ def partial_fit(self, X, y=None): self """ self._check_params() - X = self._check_non_neg_array(X, + first_time = not hasattr(self, 'components_') + + # In theory reset should be equal to `first_time`, but there are tests + # checking the input number of features and they expect a specific + # error message, which is not the one raised by `_check_n_features`. + # So we don't check n_features_in_ here for now (it's done with ad hoc + # code in the estimator anyway). + # TODO: set reset=first_time when addressing reset in + # predict/transform/etc. + reset_n_features = True + X = self._check_non_neg_array(X, reset_n_features, "LatentDirichletAllocation.partial_fit") n_samples, n_features = X.shape batch_size = self.batch_size # initialize parameters or check - if not hasattr(self, 'components_'): + if first_time: self._init_latent_vars(n_features) if n_features != self.components_.shape[1]: @@ -540,7 +551,8 @@ def fit(self, X, y=None): self """ self._check_params() - X = self._check_non_neg_array(X, "LatentDirichletAllocation.fit") + X = self._check_non_neg_array(X, reset_n_features=True, + whom="LatentDirichletAllocation.fit") n_samples, n_features = X.shape max_iter = self.max_iter evaluate_every = self.evaluate_every @@ -609,7 +621,9 @@ def _unnormalized_transform(self, X): check_is_fitted(self) # make sure feature size is the same in fitted model and in X - X = self._check_non_neg_array(X, "LatentDirichletAllocation.transform") + X = self._check_non_neg_array( + X, reset_n_features=True, + whom="LatentDirichletAllocation.transform") n_samples, n_features = X.shape if n_features != self.components_.shape[1]: raise ValueError( @@ -733,7 +747,8 @@ def score(self, X, y=None): score : float Use approximate bound as score. """ - X = self._check_non_neg_array(X, "LatentDirichletAllocation.score") + X = self._check_non_neg_array(X, reset_n_features=True, + whom="LatentDirichletAllocation.score") doc_topic_distr = self._unnormalized_transform(X) score = self._approx_bound(X, doc_topic_distr, sub_sampling=False) @@ -762,8 +777,9 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, """ check_is_fitted(self) - X = self._check_non_neg_array(X, - "LatentDirichletAllocation.perplexity") + X = self._check_non_neg_array( + X, reset_n_features=True, + whom="LatentDirichletAllocation.perplexity") if doc_topic_distr is None: doc_topic_distr = self._unnormalized_transform(X) diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 1c14a84a990be..86c9acddfea1e 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1275,8 +1275,8 @@ def fit_transform(self, X, y=None, W=None, H=None): W : array, shape (n_samples, n_components) Transformed data.
""" - X = check_array(X, accept_sparse=('csr', 'csc'), - dtype=[np.float64, np.float32]) + X = self._validate_data(X, accept_sparse=('csr', 'csc'), + dtype=[np.float64, np.float32]) W, H, n_iter_ = non_negative_factorization( X=X, W=W, H=H, n_components=self.n_components, init=self.init, diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 41b5bd68fecce..f64a9752896b3 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -386,8 +386,8 @@ def _fit(self, X): raise TypeError('PCA does not support sparse input. See ' 'TruncatedSVD for a possible alternative.') - X = check_array(X, dtype=[np.float64, np.float32], ensure_2d=True, - copy=self.copy) + X = self._validate_data(X, dtype=[np.float64, np.float32], + ensure_2d=True, copy=self.copy) # Handle n_components==None if self.n_components is None: diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index da1bd0dddf529..158bbefc22e92 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -165,7 +165,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_data(X) _check_normalize_components( self.normalize_components, self.__class__.__name__ @@ -371,7 +371,7 @@ def fit(self, X, y=None): Returns the instance itself. """ random_state = check_random_state(self.random_state) - X = check_array(X) + X = self._validate_data(X) _check_normalize_components( self.normalize_components, self.__class__.__name__ diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index e3bddd23c4de8..940eab56feea8 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -157,8 +157,8 @@ def fit_transform(self, X, y=None): X_new : array, shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ - X = check_array(X, accept_sparse=['csr', 'csc'], - ensure_min_features=2) + X = self._validate_data(X, accept_sparse=['csr', 'csc'], + ensure_min_features=2) random_state = check_random_state(self.random_state) if self.algorithm == "arpack": diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index 9ecc9cbf25598..5f082ffea13ee 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -498,3 +498,9 @@ def test_sparse_coder_parallel_mmap(): sc = SparseCoder(init_dict, transform_algorithm='omp', n_jobs=2) sc.fit_transform(data) + + +def test_sparse_coder_n_features_in(): + d = np.array([[1, 2, 3], [1, 2, 3]]) + sc = SparseCoder(d) + assert sc.n_features_in_ == d.shape[1] diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index da986f900ab8e..a954081b380cb 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -423,8 +423,8 @@ def fit(self, X, y): y : array-like of shape (n_samples,) Target values. 
""" - X, y = check_X_y(X, y, ensure_min_samples=2, estimator=self, - dtype=[np.float64, np.float32]) + X, y = self._validate_data(X, y, ensure_min_samples=2, estimator=self, + dtype=[np.float64, np.float32]) self.classes_ = unique_labels(y) n_samples, _ = X.shape n_classes = len(self.classes_) @@ -645,7 +645,7 @@ def fit(self, X, y): y : array-like of shape (n_samples,) Target values (integers) """ - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) check_classification_targets(y) self.classes_, y = np.unique(y, return_inverse=True) n_samples, n_features = X.shape diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 6fb9b21711930..daa2c1ff0da11 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -156,6 +156,8 @@ def fit(self, X, y, sample_weight=None): self.n_outputs_ = y.shape[1] + self.n_features_in_ = None # No input validation is done for X + check_consistent_length(X, y) if sample_weight is not None: @@ -489,6 +491,7 @@ def fit(self, X, y, sample_weight=None): % (self.strategy, allowed_strategies)) y = check_array(y, ensure_2d=False) + self.n_features_in_ = None # No input validation is done for X if len(y) == 0: raise ValueError("y must not be empty.") diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 8533c84ef5e88..d73f38954d21a 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -278,9 +278,9 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): random_state = check_random_state(self.random_state) # Convert data (X is required to be 2d and indexable) - X, y = check_X_y( - X, y, ['csr', 'csc'], dtype=None, force_all_finite=False, - multi_output=True + X, y = self._validate_data( + X, y, accept_sparse=['csr', 'csc'], dtype=None, + force_all_finite=False, multi_output=True ) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=None) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 0b1c1c1c73686..8070d0cd0eaff 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -292,13 +292,13 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - # Validate and convert input data + # Validate or convert input data if issparse(y): raise ValueError( "sparse multilabel-indicator for y is not supported." - ) - X = check_array(X, accept_sparse="csc", dtype=DTYPE) - y = check_array(y, ensure_2d=False, dtype=None) + ) + X, y = self._validate_data(X, y, multi_output=True, + accept_sparse="csc", dtype=DTYPE) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index ef76e3ba97bd7..4a458359437b3 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -411,7 +411,8 @@ def fit(self, X, y, sample_weight=None, monitor=None): # Check input # Since check_array converts both X and y to the same dtype, but the # trees use different types for X and y, checking them separately. 
- X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=DTYPE) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=DTYPE) n_samples, self.n_features_ = X.shape sample_weight_is_none = sample_weight is None diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 97fcdb4713802..e18d3ac4b1f9b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -106,7 +106,8 @@ def fit(self, X, y, sample_weight=None): acc_compute_hist_time = 0. # time spent computing histograms # time spent predicting X for gradient and hessians update acc_prediction_time = 0. - X, y = check_X_y(X, y, dtype=[X_DTYPE], force_all_finite=False) + X, y = self._validate_data(X, y, dtype=[X_DTYPE], + force_all_finite=False) y = self._encode_y(y) check_consistent_length(X, y) # Do not create unit sample weights by default to later skip some diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 6b206dfbe3d02..a5fedc6431344 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -140,6 +140,7 @@ def fit(self, X, y, sample_weight=None): delayed(_parallel_fit_estimator)(clone(est), X, y, sample_weight) for est in all_estimators if est != 'drop' ) + self.n_features_in_ = self.estimators_[0].n_features_in_ self.named_estimators_ = Bunch() est_fitted_idx = 0 diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index f79870a2c6891..d01647a5e4448 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -30,6 +30,7 @@ from ..utils.validation import check_is_fitted from ..utils.multiclass import check_classification_targets from ..utils.validation import column_or_1d +from ..exceptions import NotFittedError class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble): @@ -88,6 +89,20 @@ def fit(self, X, y, sample_weight=None): return self + @property + def n_features_in_(self): + # For consistency with other estimators we raise an AttributeError so + # that hasattr() fails if the estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." + .format(self.__class__.__name__) + ) from nfe + + return self.estimators_[0].n_features_in_ + class VotingClassifier(ClassifierMixin, _BaseVoting): """Soft Voting/Majority Rule classifier for unfitted estimators. diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index 2908b888e7c91..de73858f4bb3f 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -71,25 +71,9 @@ def __init__(self, self.learning_rate = learning_rate self.random_state = random_state - def _validate_data(self, X, y=None): - - # Accept or convert to these sparse matrix formats so we can - # use _safe_indexing - accept_sparse = ['csr', 'csc'] - if y is None: - ret = check_array(X, - accept_sparse=accept_sparse, - ensure_2d=False, - allow_nd=True, - dtype=None) - else: - ret = check_X_y(X, y, - accept_sparse=accept_sparse, - ensure_2d=False, - allow_nd=True, - dtype=None, - y_numeric=is_regressor(self)) - return ret + def _check_X(self, X): + return check_array(X, accept_sparse=['csr', 'csc'], ensure_2d=True, + allow_nd=True, dtype=None) def fit(self, X, y, sample_weight=None): """Build a boosted classifier/regressor from the training set (X, y).
@@ -116,7 +100,12 @@ def fit(self, X, y, sample_weight=None): if self.learning_rate <= 0: raise ValueError("learning_rate must be greater than zero") - X, y = self._validate_data(X, y) + X, y = self._validate_data(X, y, + accept_sparse=['csr', 'csc'], + ensure_2d=True, + allow_nd=True, + dtype=None, + y_numeric=is_regressor(self)) sample_weight = _check_sample_weight(sample_weight, X, np.float64) sample_weight /= sample_weight.sum() @@ -229,7 +218,7 @@ def staged_score(self, X, y, sample_weight=None): ------ z : float """ - X = self._validate_data(X) + X = self._check_X(X) for y_pred in self.staged_predict(X): if is_classifier(self): @@ -637,7 +626,7 @@ def predict(self, X): y : ndarray of shape (n_samples,) The predicted classes. """ - X = self._validate_data(X) + X = self._check_X(X) pred = self.decision_function(X) @@ -667,7 +656,7 @@ def staged_predict(self, X): y : generator of ndarray of shape (n_samples,) The predicted classes. """ - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ classes = self.classes_ @@ -701,7 +690,7 @@ def decision_function(self, X): class in ``classes_``, respectively. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ classes = self.classes_[:, np.newaxis] @@ -744,7 +733,7 @@ def staged_decision_function(self, X): class in ``classes_``, respectively. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ classes = self.classes_[:, np.newaxis] @@ -813,7 +802,7 @@ def predict_proba(self, X): outputs is the same of that of the :term:`classes_` attribute. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ @@ -847,7 +836,7 @@ def staged_predict_proba(self, X): The class probabilities of the input samples. The order of outputs is the same of that of the :term:`classes_` attribute. """ - X = self._validate_data(X) + X = self._check_X(X) n_classes = self.n_classes_ @@ -873,7 +862,7 @@ def predict_log_proba(self, X): The class probabilities of the input samples. The order of outputs is the same of that of the :term:`classes_` attribute. """ - X = self._validate_data(X) + X = self._check_X(X) return np.log(self.predict_proba(X)) @@ -1151,7 +1140,7 @@ def predict(self, X): The predicted regression values. """ check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) return self._get_median_predict(X, len(self.estimators_)) @@ -1176,7 +1165,7 @@ def staged_predict(self, X): The predicted regression values. 
""" check_is_fitted(self) - X = self._validate_data(X) + X = self._check_X(X) for i, _ in enumerate(self.estimators_, 1): yield self._get_median_predict(X, limit=i) diff --git a/sklearn/ensemble/tests/test_voting.py b/sklearn/ensemble/tests/test_voting.py index 61b106cbeedff..599f62366b51b 100644 --- a/sklearn/ensemble/tests/test_voting.py +++ b/sklearn/ensemble/tests/test_voting.py @@ -514,6 +514,26 @@ def test_check_estimators_voting_estimator(estimator): check_no_attributes_set_in_init(estimator.__class__.__name__, estimator) +@pytest.mark.parametrize( + "est", + [VotingRegressor( + estimators=[('lr', LinearRegression()), + ('tree', DecisionTreeRegressor(random_state=0))]), + VotingClassifier( + estimators=[('lr', LogisticRegression(random_state=0)), + ('tree', DecisionTreeClassifier(random_state=0))])], + ids=['VotingRegressor', 'VotingClassifier'] +) +def test_n_features_in(est): + + X = [[1, 2], [3, 4], [5, 6]] + y = [0, 1, 2] + + assert not hasattr(est, 'n_features_in_') + est.fit(X, y) + assert est.n_features_in_ == 2 + + @pytest.mark.parametrize( "estimator", [VotingRegressor( diff --git a/sklearn/feature_extraction/tests/test_dict_vectorizer.py b/sklearn/feature_extraction/tests/test_dict_vectorizer.py index 7e7481a369646..22a7402908cf1 100644 --- a/sklearn/feature_extraction/tests/test_dict_vectorizer.py +++ b/sklearn/feature_extraction/tests/test_dict_vectorizer.py @@ -110,3 +110,12 @@ def test_deterministic_vocabulary(): v_2 = DictVectorizer().fit([d_shuffled]) assert v_1.vocabulary_ == v_2.vocabulary_ + + +def test_n_features_in(): + # For vectorizers, n_features_in_ does not make sense and does not exist. + dv = DictVectorizer() + assert not hasattr(dv, 'n_features_in_') + d = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}] + dv.fit(d) + assert not hasattr(dv, 'n_features_in_') diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 0b680b2a9e400..86ae2fd6c149e 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1342,6 +1342,18 @@ def test_unused_parameters_warn(Vectorizer, stop_words, vect.fit(train_data) +@pytest.mark.parametrize('Vectorizer, X', ( + (HashingVectorizer, [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]), + (CountVectorizer, JUNK_FOOD_DOCS)) +) +def test_n_features_in(Vectorizer, X): + # For vectorizers, n_features_in_ does not make sense + vectorizer = Vectorizer() + assert not hasattr(vectorizer, 'n_features_in_') + vectorizer.fit(X) + assert not hasattr(vectorizer, 'n_features_in_') + + # TODO: Remove in 0.24 def test_vectorizermixin_is_deprecated(): class MyVectorizer(VectorizerMixin): diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 674127f06acd7..dd72bddc58eb5 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -6,6 +6,7 @@ from ._base import SelectorMixin from ..base import BaseEstimator, clone, MetaEstimatorMixin +from ..utils.validation import check_is_fitted from ..exceptions import NotFittedError from ..utils.metaestimators import if_delegate_has_method @@ -254,6 +255,20 @@ def partial_fit(self, X, y=None, **fit_params): self.estimator_.partial_fit(X, y, **fit_params) return self + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the estimator isn't fitted. 
+ try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." + .format(self.__class__.__name__) + ) from nfe + + return self.estimator_.n_features_in_ + def _more_tags(self): estimator_tags = self.estimator._get_tags() return {'allow_nan': estimator_tags.get('allow_nan', True)} diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 91312c7dc80f9..69e3cc4de9e6c 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -155,9 +155,12 @@ def _fit(self, X, y, step_score=None): # self.scores_ will not be calculated when calling _fit through fit tags = self._get_tags() - X, y = check_X_y(X, y, "csc", ensure_min_features=2, - force_all_finite=not tags.get('allow_nan', True), - multi_output=True) + X, y = self._validate_data( + X, y, accept_sparse="csc", + ensure_min_features=2, + force_all_finite=not tags.get('allow_nan', True), + multi_output=True + ) # Initialization n_features = X.shape[1] if self.n_features_to_select is None: @@ -491,9 +494,11 @@ def fit(self, X, y, groups=None): instance (e.g., :class:`~sklearn.model_selection.GroupKFold`). """ tags = self._get_tags() - X, y = check_X_y(X, y, "csc", ensure_min_features=2, - force_all_finite=not tags.get('allow_nan', True), - multi_output=True) + X, y = self._validate_data( + X, y, accept_sparse="csr", ensure_min_features=2, + force_all_finite=not tags.get('allow_nan', True), + multi_output=True + ) # Initialization cv = check_cv(self.cv, y, is_classifier(self.estimator)) diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 21990bb3a8167..221e46f2a505e 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -338,7 +338,8 @@ def fit(self, X, y): ------- self : object """ - X, y = check_X_y(X, y, ['csr', 'csc'], multi_output=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], + multi_output=True) if not callable(self.score_func): raise TypeError("The score function should be a callable, %s (%s) " diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 4f9d720b762b9..6438e6b80dc0a 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -65,8 +65,9 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, ('csr', 'csc'), dtype=np.float64, - force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse=('csr', 'csc'), + dtype=np.float64, + force_all_finite='allow-nan') if hasattr(X, "toarray"): # sparse matrix _, self.variances_ = mean_variance_axis(X, axis=0) diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index 5618ff28128b8..ed8ed2a007a22 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -616,11 +616,11 @@ def fit(self, X, y): self : returns an instance of self. 
""" if self.kernel is None or self.kernel.requires_vector_input: - X, y = check_X_y(X, y, multi_output=False, - ensure_2d=True, dtype="numeric") + X, y = self._validate_data(X, y, multi_output=False, + ensure_2d=True, dtype="numeric") else: - X, y = check_X_y(X, y, multi_output=False, - ensure_2d=False, dtype=None) + X, y = self._validate_data(X, y, multi_output=False, + ensure_2d=False, dtype=None) self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace( self.kernel, self.optimizer, self.n_restarts_optimizer, diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 0c1db0d209458..1b48efb39f26d 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -184,11 +184,11 @@ def fit(self, X, y): self._rng = check_random_state(self.random_state) if self.kernel_.requires_vector_input: - X, y = check_X_y(X, y, multi_output=True, y_numeric=True, - ensure_2d=True, dtype="numeric") + X, y = self._validate_data(X, y, multi_output=True, y_numeric=True, + ensure_2d=True, dtype="numeric") else: - X, y = check_X_y(X, y, multi_output=True, y_numeric=True, - ensure_2d=False, dtype=None) + X, y = self._validate_data(X, y, multi_output=True, y_numeric=True, + ensure_2d=False, dtype=None) # Normalize target value if self.normalize_y: diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index 6ef34b8312e23..bc98778d5c5d8 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -217,7 +217,7 @@ def __init__(self, missing_values=np.nan, strategy="mean", self.verbose = verbose self.copy = copy - def _validate_input(self, X): + def _validate_input(self, X, in_fit): allowed_strategies = ["mean", "median", "most_frequent", "constant"] if self.strategy not in allowed_strategies: raise ValueError("Can only use these strategies: {0} " @@ -235,8 +235,10 @@ def _validate_input(self, X): force_all_finite = "allow-nan" try: - X = check_array(X, accept_sparse='csc', dtype=dtype, - force_all_finite=force_all_finite, copy=self.copy) + X = self._validate_data(X, reset=in_fit, + accept_sparse='csc', dtype=dtype, + force_all_finite=force_all_finite, + copy=self.copy) except ValueError as ve: if "could not convert" in str(ve): new_ve = ValueError("Cannot use {} strategy with non-numeric " @@ -269,7 +271,7 @@ def fit(self, X, y=None): ------- self : SimpleImputer """ - X = self._validate_input(X) + X = self._validate_input(X, in_fit=True) super()._fit_indicator(X) # default fill_value is 0 for numerical input and "missing_value" @@ -407,7 +409,7 @@ def transform(self, X): """ check_is_fitted(self) - X = self._validate_input(X) + X = self._validate_input(X, in_fit=False) X_indicator = super()._transform_indicator(X) statistics = self.statistics_ @@ -587,13 +589,14 @@ def _get_missing_features_info(self, X): return imputer_mask, features_indices - def _validate_input(self, X): + def _validate_input(self, X, in_fit): if not is_scalar_nan(self.missing_values): force_all_finite = True else: force_all_finite = "allow-nan" - X = check_array(X, accept_sparse=('csc', 'csr'), dtype=None, - force_all_finite=force_all_finite) + X = self._validate_data(X, reset=in_fit, + accept_sparse=('csc', 'csr'), dtype=None, + force_all_finite=force_all_finite) _check_inputs_dtype(X, self.missing_values) if X.dtype.kind not in ("i", "u", "f", "O"): raise ValueError("MissingIndicator does not support data with " @@ -628,7 +631,7 @@ def _fit(self, X, y=None): The imputer mask of the original data. 
""" - X = self._validate_input(X) + X = self._validate_input(X, in_fit=True) self._n_features = X.shape[1] if self.features not in ('missing-only', 'all'): @@ -680,7 +683,7 @@ def transform(self, X): """ check_is_fitted(self) - X = self._validate_input(X) + X = self._validate_input(X, in_fit=False) if X.shape[1] != self._n_features: raise ValueError("X has a different number of features " diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index e168892337bec..58a35d157c7a4 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -505,8 +505,8 @@ def _initial_imputation(self, X): else: force_all_finite = True - X = check_array(X, dtype=FLOAT_DTYPES, order="F", - force_all_finite=force_all_finite) + X = self._validate_data(X, dtype=FLOAT_DTYPES, order="F", + force_all_finite=force_all_finite) _check_inputs_dtype(X, self.missing_values) mask_missing_values = _get_mask(X, self.missing_values) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index a34a51b945253..f782a46a6b40d 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -178,8 +178,9 @@ def fit(self, X, y=None): raise ValueError( "Expected n_neighbors > 0. Got {}".format(self.n_neighbors)) - X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES, - force_all_finite=force_all_finite, copy=self.copy) + X = self._validate_data(X, accept_sparse=False, dtype=FLOAT_DTYPES, + force_all_finite=force_all_finite, + copy=self.copy) super()._fit_indicator(X) _check_weights(self.weights) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index f15bf508f8dad..6ae62ce245a56 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -104,7 +104,7 @@ def fit(self, X, y=None): Returns the transformer. """ - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') random_state = check_random_state(self.random_state) n_features = X.shape[1] @@ -210,7 +210,7 @@ def fit(self, X, y=None): Returns the transformer. """ - X = check_array(X) + X = self._validate_data(X) random_state = check_random_state(self.random_state) n_features = X.shape[1] uniform = random_state.uniform(size=(n_features, self.n_components)) @@ -337,7 +337,7 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') check_non_negative(X, 'X in AdditiveChi2Sampler.fit') if self.sample_interval is None: @@ -556,7 +556,7 @@ def fit(self, X, y=None): X : array-like of shape (n_samples, n_features) Training data. """ - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') rnd = check_random_state(self.random_state) n_samples = X.shape[0] diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index d8f0e16d6f2b9..21c43979c3b1e 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -157,8 +157,8 @@ def fit(self, X, y=None, sample_weight=None): self : returns an instance of self. 
""" # Convert data - X, y = check_X_y(X, y, accept_sparse=("csr", "csc"), multi_output=True, - y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse=("csr", "csc"), + multi_output=True, y_numeric=True) if sample_weight is not None and not isinstance(sample_weight, float): sample_weight = _check_sample_weight(sample_weight, X) diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 92383dd82019b..d280f9d0f0d81 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -498,8 +498,8 @@ def fit(self, X, y, sample_weight=None): """ n_jobs_ = self.n_jobs - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - y_numeric=True, multi_output=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + y_numeric=True, multi_output=True) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 333ae5494b4e9..c67fc54f43157 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -190,7 +190,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError('n_iter should be greater than or equal to 1.' ' Got {!r}.'.format(self.n_iter)) - X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True) + X, y = self._validate_data(X, y, dtype=np.float64, y_numeric=True) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, @@ -526,8 +526,8 @@ def fit(self, X, y): ------- self : returns an instance of self. """ - X, y = check_X_y(X, y, dtype=np.float64, y_numeric=True, - ensure_min_samples=2) + X, y = self._validate_data(X, y, dtype=np.float64, y_numeric=True, + ensure_min_samples=2) n_samples, n_features = X.shape coef_ = np.zeros(n_features) diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index 87b5be7b650a5..9281d03710455 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -751,9 +751,11 @@ def fit(self, X, y, sample_weight=None, check_input=True): # when bypassing checks if check_input: X_copied = self.copy_X and self.fit_intercept - X, y = check_X_y(X, y, accept_sparse='csc', - order='F', dtype=[np.float64, np.float32], - copy=X_copied, multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse='csc', + order='F', + dtype=[np.float64, np.float32], + copy=X_copied, multi_output=True, + y_numeric=True) y = check_array(y, order='F', copy=False, dtype=X.dtype.type, ensure_2d=False) @@ -1197,8 +1199,8 @@ def fit(self, X, y): # Let us not impose fortran ordering so far: it is # not useful for the cross-validation loop and will be done # by the model fitting itself - X = check_array(X, 'csc', dtype=[np.float64, np.float32], - copy=False) + X = self._validate_data(X, accept_sparse='csc', + dtype=[np.float64, np.float32], copy=False) if sparse.isspmatrix(X): if (hasattr(reference_to_old_X, "data") and not np.may_share_memory(reference_to_old_X.data, X.data)): @@ -1209,8 +1211,9 @@ def fit(self, X, y): copy_X = False del reference_to_old_X else: - X = check_array(X, 'csc', dtype=[np.float64, np.float32], - order='F', copy=copy_X) + X = self._validate_data(X, accept_sparse='csc', + dtype=[np.float64, np.float32], order='F', + copy=copy_X) copy_X = False if X.shape[0] != y.shape[0]: @@ -1831,8 +1834,8 @@ def fit(self, X, y): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. 
""" - X = check_array(X, dtype=[np.float64, np.float32], order='F', - copy=self.copy_X and self.fit_intercept) + X = self._validate_data(X, dtype=[np.float64, np.float32], order='F', + copy=self.copy_X and self.fit_intercept) y = check_array(y, dtype=X.dtype.type, ensure_2d=False) if hasattr(self, 'l1_ratio'): diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index 152055a62c662..1d3a3fcc73421 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -252,7 +252,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y( + X, y = self._validate_data( X, y, copy=False, accept_sparse=['csr'], y_numeric=True, dtype=[np.float64, np.float32]) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 0a9a67844a3f3..9f0f62471376a 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -944,7 +944,7 @@ def fit(self, X, y, Xy=None): self : object returns an instance of self. """ - X, y = check_X_y(X, y, y_numeric=True, multi_output=True) + X, y = self._validate_data(X, y, y_numeric=True, multi_output=True) alpha = getattr(self, 'alpha', 0.) if hasattr(self, 'n_nonzero_coefs'): @@ -1367,7 +1367,7 @@ def fit(self, X, y): self : object returns an instance of self. """ - X, y = check_X_y(X, y, y_numeric=True) + X, y = self._validate_data(X, y, y_numeric=True) X = as_float_array(X, copy=self.copy_X) y = as_float_array(y, copy=self.copy_X) @@ -1748,7 +1748,7 @@ def fit(self, X, y, copy_X=None): """ if copy_X is None: copy_X = self.copy_X - X, y = check_X_y(X, y, y_numeric=True) + X, y = self._validate_data(X, y, y_numeric=True) X, y, Xmean, ymean, Xstd = LinearModel._preprocess_data( X, y, self.fit_intercept, self.normalize, copy_X) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 4b541884eece8..9e84e56ee0284 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1339,8 +1339,9 @@ def fit(self, X, y, sample_weight=None): else: _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C", - accept_large_sparse=solver != 'liblinear') + X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype, + order="C", + accept_large_sparse=solver != 'liblinear') check_classification_targets(y) self.classes_ = np.unique(y) @@ -1812,9 +1813,9 @@ def fit(self, X, y, sample_weight=None): "LogisticRegressionCV." ) - X, y = check_X_y(X, y, accept_sparse='csr', dtype=np.float64, - order="C", - accept_large_sparse=solver != 'liblinear') + X, y = self._validate_data(X, y, accept_sparse='csr', dtype=np.float64, + order="C", + accept_large_sparse=solver != 'liblinear') check_classification_targets(y) class_weight = self.class_weight diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index 1fc0a8b69491c..54b751423c933 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -641,7 +641,7 @@ def fit(self, X, y): self : object returns an instance of self. """ - X, y = check_X_y(X, y, multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, multi_output=True, y_numeric=True) n_features = X.shape[1] X, y, X_offset, y_offset, X_scale, Gram, Xy = \ @@ -879,8 +879,8 @@ def fit(self, X, y): self : object returns an instance of self. 
""" - X, y = check_X_y(X, y, y_numeric=True, ensure_min_features=2, - estimator=self) + X, y = self._validate_data(X, y, y_numeric=True, ensure_min_features=2, + estimator=self) X = as_float_array(X, copy=False, force_all_finite=False) cv = check_cv(self.cv, classifier=False) max_iter = (min(max(int(0.1 * X.shape[1]), 5), X.shape[1]) diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 0363032359524..cd5e3db49842d 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -246,7 +246,7 @@ def fit(self, X, y, sample_weight=None): `max_trials` randomly chosen sub-samples. """ - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') y = check_array(y, ensure_2d=False) check_consistent_length(X, y) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index a1f6b89230b94..1a9cb661318e9 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -537,10 +537,10 @@ def fit(self, X, y, sample_weight=None): _dtype = [np.float64, np.float32] _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - X, y = check_X_y(X, y, - accept_sparse=_accept_sparse, - dtype=_dtype, - multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, + accept_sparse=_accept_sparse, + dtype=_dtype, + multi_output=True, y_numeric=True) if sparse.issparse(X) and self.fit_intercept: if self.solver not in ['auto', 'sparse_cg', 'sag']: raise ValueError( @@ -921,8 +921,8 @@ def fit(self, X, y, sample_weight=None): """ _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - X, y = check_X_y(X, y, accept_sparse=_accept_sparse, multi_output=True, - y_numeric=False) + X, y = self._validate_data(X, y, accept_sparse=_accept_sparse, + multi_output=True, y_numeric=False) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) @@ -1450,8 +1450,9 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=[np.float64], - multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float64], + multi_output=True, y_numeric=True) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, @@ -1618,6 +1619,7 @@ def fit(self, X, y, sample_weight=None): self.coef_ = estimator.coef_ self.intercept_ = estimator.intercept_ + self.n_features_in_ = estimator.n_features_in_ return self @@ -1882,8 +1884,8 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - multi_output=True, y_numeric=False) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'], + multi_output=True, y_numeric=False) sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 7fc2126a131b1..df2189b9400cc 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -546,8 +546,9 @@ def _fit(self, X, y, alpha, C, loss, learning_rate, coef_init=None, if hasattr(self, "classes_"): self.classes_ = None - X, y = check_X_y(X, y, 'csr', dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse='csr', + 
dtype=np.float64, order="C", + accept_large_sparse=False) # labels can be encoded as float, int, or string literals # np.unique sorts in asc order; largest class id is positive class @@ -1120,8 +1121,9 @@ def __init__(self, loss="squared_loss", penalty="l2", alpha=0.0001, def _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, sample_weight, coef_init, intercept_init): - X, y = check_X_y(X, y, "csr", copy=False, order='C', dtype=np.float64, - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse="csr", copy=False, + order='C', dtype=np.float64, + accept_large_sparse=False) y = y.astype(np.float64, copy=False) n_samples, n_features = X.shape diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 35cb3b8f25471..a29cc26cdc0a3 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -357,7 +357,7 @@ def fit(self, X, y): self : returns an instance of self. """ random_state = check_random_state(self.random_state) - X, y = check_X_y(X, y, y_numeric=True) + X, y = self._validate_data(X, y, y_numeric=True) n_samples, n_features = X.shape n_subsamples, self.n_subpopulation_ = self._check_subparams(n_samples, n_features) diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index 792c21ce51c2c..8a7fc3f85f425 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -140,13 +140,13 @@ def __init__(self, n_neighbors=5, n_components=2, eigen_solver='auto', self.metric_params = metric_params def _fit_transform(self, X): - self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors, algorithm=self.neighbors_algorithm, metric=self.metric, p=self.p, metric_params=self.metric_params, n_jobs=self.n_jobs) self.nbrs_.fit(X) + self.n_features_in_ = self.nbrs_.n_features_in_ self.kernel_pca_ = KernelPCA(n_components=self.n_components, kernel="precomputed", diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 4854cf228f0ca..7b46d51df718d 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -652,7 +652,7 @@ def _fit_transform(self, X): n_jobs=self.n_jobs) random_state = check_random_state(self.random_state) - X = check_array(X, dtype=float) + X = self._validate_data(X, dtype=float) self.nbrs_.fit(X) self.embedding_, self.reconstruction_error_ = \ locally_linear_embedding( diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index b7b52344eb21b..ca8c08ed69f98 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -411,7 +411,7 @@ def fit_transform(self, X, y=None, init=None): algorithm. By default, the algorithm is initialized with a randomly chosen array. """ - X = check_array(X) + X = self._validate_data(X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": warnings.warn("The MDS API has changed. ``fit`` now constructs an" " dissimilarity matrix from data. To use a custom " diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index e885a94eaaded..caac2236e1dd6 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -531,8 +531,8 @@ def fit(self, X, y=None): Returns the instance itself. 
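Two idioms recur in the hunks above: estimators that validate `X` themselves call `self._validate_data`, which records `n_features_in_` as a side effect, while estimators such as Isomap that hand `X` to an inner estimator simply copy the attribute over after the inner fit. A minimal sketch of both patterns (the class names are hypothetical; this assumes scikit-learn >= 0.23, where `BaseEstimator._validate_data` exists)::

    import numpy as np
    from sklearn.base import BaseEstimator
    from sklearn.neighbors import NearestNeighbors

    class ValidatingEstimator(BaseEstimator):
        def fit(self, X, y=None):
            # _validate_data sets n_features_in_ as a side effect
            X = self._validate_data(X)
            return self

    class DelegatingEstimator(BaseEstimator):
        def fit(self, X, y=None):
            # the inner estimator validated X, so copy its attribute,
            # as the Isomap hunk above does with self.nbrs_
            self.nbrs_ = NearestNeighbors(n_neighbors=2).fit(X)
            self.n_features_in_ = self.nbrs_.n_features_in_
            return self

    X = np.random.RandomState(0).rand(5, 3)
    assert ValidatingEstimator().fit(X).n_features_in_ == 3
    assert DelegatingEstimator().fit(X).n_features_in_ == 3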
""" - X = check_array(X, accept_sparse='csr', ensure_min_samples=2, - estimator=self) + X = self._validate_data(X, accept_sparse='csr', ensure_min_samples=2, + estimator=self) random_state = check_random_state(self.random_state) if isinstance(self.affinity, str): diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 81972dac33d07..d0c9e4e509a73 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -661,11 +661,12 @@ def _fit(self, X, skip_num_points=0): if self.angle < 0.0 or self.angle > 1.0: raise ValueError("'angle' must be between 0.0 - 1.0") if self.method == 'barnes_hut': - X = check_array(X, accept_sparse=['csr'], ensure_min_samples=2, - dtype=[np.float32, np.float64]) + X = self._validate_data(X, accept_sparse=['csr'], + ensure_min_samples=2, + dtype=[np.float32, np.float64]) else: - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype=[np.float32, np.float64]) + X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'], + dtype=[np.float32, np.float64]) if self.metric == "precomputed": if isinstance(self.init, str) and self.init == 'pca': raise ValueError("The parameter init=\"pca\" cannot be " diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index 07f3669db27ef..5c09d67f6e63d 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -218,6 +218,7 @@ def fit_predict(self, X, y=None): Component labels. """ X = _check_X(X, self.n_components, ensure_min_samples=2) + self._check_n_features(X, reset=True) self._check_initial_parameters(X) # if we enable warm_start, we will have a unique initialisation diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 1c2551a5f600f..55e770d701858 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -566,6 +566,20 @@ def inverse_transform(self, Xt): self._check_is_fitted('inverse_transform') return self.best_estimator_.inverse_transform(Xt) + @property + def n_features_in_(self): + # For consistency with other estimators we raise a AttributeError so + # that hasattr() fails if the search estimator isn't fitted. + try: + check_is_fitted(self) + except NotFittedError as nfe: + raise AttributeError( + "{} object has no n_features_in_ attribute." 
+                .format(self.__class__.__name__)
+            ) from nfe
+
+        return self.best_estimator_.n_features_in_
+
     @property
     def classes_(self):
         self._check_is_fitted("classes_")
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
index e9f21f0eb5e8f..49d4b156e0686 100644
--- a/sklearn/model_selection/tests/test_search.py
+++ b/sklearn/model_selection/tests/test_search.py
@@ -67,6 +67,8 @@
 from sklearn.impute import SimpleImputer
 from sklearn.pipeline import Pipeline
 from sklearn.linear_model import Ridge, SGDClassifier, LinearRegression
+from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+from sklearn.ensemble import HistGradientBoostingClassifier

 from sklearn.model_selection.tests.common import OneTimeSplitter

@@ -1810,6 +1812,23 @@ def get_n_splits(self, *args, **kw):
     ridge.fit(X[:train_size], y[:train_size])


+def test_n_features_in():
+    # make sure grid search and random search delegate n_features_in_ to the
+    # best estimator
+    n_features = 4
+    X, y = make_classification(n_features=n_features)
+    gbdt = HistGradientBoostingClassifier()
+    param_grid = {'max_iter': [3, 4]}
+    gs = GridSearchCV(gbdt, param_grid)
+    rs = RandomizedSearchCV(gbdt, param_grid, n_iter=1)
+    assert not hasattr(gs, 'n_features_in_')
+    assert not hasattr(rs, 'n_features_in_')
+    gs.fit(X, y)
+    rs.fit(X, y)
+    assert gs.n_features_in_ == n_features
+    assert rs.n_features_in_ == n_features
+
+
 def test_search_cv__pairwise_property_delegated_to_base_estimator():
     """
     Test implementation of BaseSearchCV has the _pairwise property
diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py
index 13dda2f6e6927..9eeb4248f83fd 100644
--- a/sklearn/multiclass.py
+++ b/sklearn/multiclass.py
@@ -52,6 +52,7 @@
     check_classification_targets, _ovr_decision_function)
 from .utils.metaestimators import _safe_split, if_delegate_has_method
+from .exceptions import NotFittedError

 from joblib import Parallel, delayed

@@ -433,6 +434,19 @@ def _pairwise(self):
     def _first_estimator(self):
         return self.estimators_[0]

+    @property
+    def n_features_in_(self):
+        # For consistency with other estimators we raise an AttributeError so
+        # that hasattr() fails if the OVR estimator isn't fitted.
+        try:
+            check_is_fitted(self)
+        except NotFittedError as nfe:
+            raise AttributeError(
+                "{} object has no n_features_in_ attribute."
+ .format(self.__class__.__name__) + ) from nfe + return self.estimators_[0].n_features_in_ + def _fit_ovo_binary(estimator, X, y, i, j): """Fit a single binary estimator (one-vs-one).""" @@ -521,7 +535,7 @@ def fit(self, X, y): ------- self """ - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc']) check_classification_targets(y) self.classes_ = np.unique(y) @@ -762,7 +776,7 @@ def fit(self, X, y): ------- self """ - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) if self.code_size <= 0: raise ValueError("code_size should be greater than 0, got {0}" "".format(self.code_size)) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 8b6d4b8384c4d..82edd85472880 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -152,9 +152,7 @@ def fit(self, X, y, sample_weight=None, **fit_params): raise ValueError("The base estimator should implement" " a fit method") - X, y = check_X_y(X, y, - multi_output=True, - accept_sparse=True) + X, y = self._validate_data(X, y, multi_output=True, accept_sparse=True) if is_classifier(self): check_classification_targets(y) @@ -423,7 +421,7 @@ def fit(self, X, Y, **fit_params): ------- self : object """ - X, Y = check_X_y(X, Y, multi_output=True, accept_sparse=True) + X, Y = self._validate_data(X, Y, multi_output=True, accept_sparse=True) random_state = check_random_state(self.random_state) check_array(X, accept_sparse=True) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 6238aa294530a..c23cb86644e1b 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -203,6 +203,7 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ + X, y = self._validate_data(X, y) y = column_or_1d(y, warn=True) return self._partial_fit(X, y, np.unique(y), _refit=True, sample_weight=sample_weight) @@ -472,7 +473,7 @@ def _check_X(self, X): return check_array(X, accept_sparse='csr') def _check_X_y(self, X, y): - return check_X_y(X, y, accept_sparse='csr') + return self._validate_data(X, y, accept_sparse='csr') def _update_class_log_prior(self, class_prior=None): n_classes = len(self.classes_) @@ -1154,8 +1155,8 @@ def _check_X(self, X): return X def _check_X_y(self, X, y): - X, y = check_X_y(X, y, dtype='int', accept_sparse=False, - force_all_finite=True) + X, y = self._validate_data(X, y, dtype='int', accept_sparse=False, + force_all_finite=True) if np.any(X < 0): raise ValueError("X must not contain negative values.") return X, y diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index f927c26868a5f..945959ef10d9c 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -396,8 +396,9 @@ def _fit(self, X): if self.effective_metric_ == 'precomputed': X = _check_precomputed(X) + self.n_features_in_ = X.shape[1] else: - X = check_array(X, accept_sparse='csr') + X = self._validate_data(X, accept_sparse='csr') n_samples = X.shape[0] if n_samples == 0: diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index 5b44f6f6b2b75..3404a9768f36a 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -152,7 +152,7 @@ def fit(self, X, y=None, sample_weight=None): Returns instance of object. 
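The `n_features_in_` properties added to the search estimators and to OneVsRestClassifier above share one idiom: translate `NotFittedError` into `AttributeError`, so that `hasattr(est, 'n_features_in_')` stays False until `fit` has run. The idiom in isolation, on a hypothetical estimator (a sketch, not part of the patch)::

    from sklearn.base import BaseEstimator
    from sklearn.exceptions import NotFittedError
    from sklearn.utils.validation import check_is_fitted

    class Delegator(BaseEstimator):
        def fit(self, X, y=None):
            self.fitted_ = True
            self._n_features = len(X[0])
            return self

        @property
        def n_features_in_(self):
            try:
                check_is_fitted(self)
            except NotFittedError as nfe:
                # re-raised as AttributeError so hasattr() returns False
                raise AttributeError(
                    "{} object has no n_features_in_ attribute."
                    .format(self.__class__.__name__)) from nfe
            return self._n_features

    d = Delegator()
    assert not hasattr(d, 'n_features_in_')
    assert d.fit([[1, 2, 3]]).n_features_in_ == 3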
""" algorithm = self._choose_algorithm(self.algorithm, self.metric) - X = check_array(X, order='C', dtype=DTYPE) + X = self._validate_data(X, order='C', dtype=DTYPE) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DTYPE) diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 131a1bf0b04c1..b9d2de01c958d 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -300,7 +300,7 @@ def _validate_params(self, X, y): """ # Validate the inputs X and y, and converts y to numerical classes. - X, y = check_X_y(X, y, ensure_min_samples=2) + X, y = self._validate_data(X, y, ensure_min_samples=2) check_classification_targets(y) y = LabelEncoder().fit_transform(y) diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index 3eefb7b4fbf58..48712c1fcfb44 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -104,9 +104,9 @@ def fit(self, X, y): # If X is sparse and the metric is "manhattan", store it in a csc # format is easier to calculate the median. if self.metric == 'manhattan': - X, y = check_X_y(X, y, ['csc']) + X, y = self._validate_data(X, y, accept_sparse=['csc']) else: - X, y = check_X_y(X, y, ['csr', 'csc']) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc']) is_X_sparse = sp.issparse(X) if is_X_sparse and self.shrink_threshold: raise ValueError("threshold shrinking not supported" diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 90f9210db5d6b..dbd4c012c8afa 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -943,8 +943,8 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu", n_iter_no_change=n_iter_no_change, max_fun=max_fun) def _validate_input(self, X, y, incremental): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], - multi_output=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], + multi_output=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) @@ -1352,8 +1352,8 @@ def predict(self, X): return y_pred def _validate_input(self, X, y, incremental): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'], - multi_output=True, y_numeric=True) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'], + multi_output=True, y_numeric=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) return X, y diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 38673eb4ab8bd..06e7cc71bad3c 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -343,7 +343,7 @@ def fit(self, X, y=None): self : BernoulliRBM The fitted model. 
""" - X = check_array(X, accept_sparse='csr', dtype=np.float64) + X = self._validate_data(X, accept_sparse='csr', dtype=np.float64) n_samples = X.shape[0] rng = check_random_state(self.random_state) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index d387206288bf5..64d2de70df531 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -628,6 +628,11 @@ def _pairwise(self): # check if first estimator expects pairwise input return getattr(self.steps[0][1], '_pairwise', False) + @property + def n_features_in_(self): + # delegate to first step (which will call _check_is_fitted) + return self.steps[0][1].n_features_in_ + def _name_estimators(estimators): """Generate names for estimators.""" @@ -1000,6 +1005,11 @@ def _update_transformer_list(self, transformers): else next(transformers)) for name, old in self.transformer_list] + @property + def n_features_in_(self): + # X is passed to all transformers so we just delegate to the first one + return self.transformer_list[0][1].n_features_in_ + def make_union(*transformers, **kwargs): """ diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 9ff3723b25550..72ad6bacd43b4 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -368,17 +368,16 @@ def partial_fit(self, X, y=None): raise TypeError("MinMaxScaler does not support sparse input. " "Consider using MaxAbsScaler instead.") - X = check_array(X, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite="allow-nan") + first_pass = not hasattr(self, 'n_samples_seen_') + X = self._validate_data(X, reset=first_pass, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite="allow-nan") data_min = np.nanmin(X, axis=0) data_max = np.nanmax(X, axis=0) - # First pass - if not hasattr(self, 'n_samples_seen_'): + if first_pass: self.n_samples_seen_ = X.shape[0] - # Next steps else: data_min = np.minimum(self.data_min_, data_min) data_max = np.maximum(self.data_max_, data_max) @@ -695,9 +694,9 @@ def partial_fit(self, X, y=None): self : object Transformer instance. """ - X = check_array(X, accept_sparse=('csr', 'csc'), - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse=('csr', 'csc'), + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') # Even in the case of `with_mean=False`, we update the mean anyway # This is needed for the incremental computation of the var @@ -790,9 +789,10 @@ def transform(self, X, copy=None): check_is_fitted(self) copy = copy if copy is not None else self.copy - X = check_array(X, accept_sparse='csr', copy=copy, - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + X = self._validate_data(X, reset=False, + accept_sparse='csr', copy=copy, + estimator=self, dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') if sparse.issparse(X): if self.with_mean: @@ -965,9 +965,11 @@ def partial_fit(self, X, y=None): self : object Transformer instance. 
""" - X = check_array(X, accept_sparse=('csr', 'csc'), - estimator=self, dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + first_pass = not hasattr(self, 'n_samples_seen_') + X = self._validate_data(X, reset=first_pass, + accept_sparse=('csr', 'csc'), estimator=self, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') if sparse.issparse(X): mins, maxs = min_max_axis(X, axis=0, ignore_nan=True) @@ -975,10 +977,8 @@ def partial_fit(self, X, y=None): else: max_abs = np.nanmax(np.abs(X), axis=0) - # First pass - if not hasattr(self, 'n_samples_seen_'): + if first_pass: self.n_samples_seen_ = X.shape[0] - # Next passes else: max_abs = np.maximum(self.max_abs_, max_abs) self.n_samples_seen_ += X.shape[0] @@ -1196,8 +1196,9 @@ def fit(self, X, y=None): """ # at fit, convert sparse matrices to csc for optimized computation of # the quantiles - X = check_array(X, accept_sparse='csc', estimator=self, - dtype=FLOAT_DTYPES, force_all_finite='allow-nan') + X = self._validate_data(X, accept_sparse='csc', estimator=self, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') q_min, q_max = self.quantile_range if not 0 <= q_min <= q_max <= 100: @@ -1505,7 +1506,8 @@ def fit(self, X, y=None): ------- self : instance """ - n_samples, n_features = check_array(X, accept_sparse=True).shape + n_samples, n_features = self._validate_data( + X, accept_sparse=True).shape combinations = self._combinations(n_features, self.degree, self.interaction_only, self.include_bias) @@ -1811,7 +1813,7 @@ def fit(self, X, y=None): ---------- X : array-like """ - check_array(X, accept_sparse='csr') + self._validate_data(X, accept_sparse='csr') return self def transform(self, X, copy=None): @@ -1945,7 +1947,7 @@ def fit(self, X, y=None): ---------- X : array-like """ - check_array(X, accept_sparse='csr') + self._validate_data(X, accept_sparse='csr') return self def transform(self, X, copy=None): @@ -2025,7 +2027,7 @@ def fit(self, K, y=None): self : returns an instance of self. """ - K = check_array(K, dtype=FLOAT_DTYPES) + K = self._validate_data(K, dtype=FLOAT_DTYPES) if K.shape[0] != K.shape[1]: raise ValueError("Kernel matrix must be a square matrix." @@ -2347,7 +2349,7 @@ def fit(self, X, y=None): " and {} samples.".format(self.n_quantiles, self.subsample)) - X = self._check_inputs(X, copy=False) + X = self._check_inputs(X, in_fit=True, copy=False) n_samples = X.shape[0] if self.n_quantiles > n_samples: @@ -2438,11 +2440,22 @@ def _transform_col(self, X_col, quantiles, inverse): return X_col - def _check_inputs(self, X, accept_sparse_negative=False, copy=False): + def _check_inputs(self, X, in_fit, accept_sparse_negative=False, + copy=False): """Check inputs before fit and transform""" - X = check_array(X, accept_sparse='csc', copy=copy, - dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + # In theory reset should be equal to `in_fit`, but there are tests + # checking the input number of feature and they expect a specific + # string, which is not the same one raised by check_n_features. So we + # don't check n_features_in_ here for now (it's done with adhoc code in + # the estimator anyway). + # TODO: set reset=in_fit when addressing reset in + # predict/transform/etc. + reset = True + + X = self._validate_data(X, reset=reset, + accept_sparse='csc', copy=copy, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') # we only accept positive sparse matrix when ignore_implicit_zeros is # false and that we call fit or transform. 
with np.errstate(invalid='ignore'): # hide NaN comparison warnings @@ -2518,7 +2531,7 @@ def transform(self, X): Xt : ndarray or sparse matrix, shape (n_samples, n_features) The projected data. """ - X = self._check_inputs(X, copy=self.copy) + X = self._check_inputs(X, in_fit=False, copy=self.copy) self._check_is_fitted(X) return self._transform(X, inverse=False) @@ -2539,7 +2552,8 @@ def inverse_transform(self, X): Xt : ndarray or sparse matrix, shape (n_samples, n_features) The projected data. """ - X = self._check_inputs(X, accept_sparse_negative=True, copy=self.copy) + X = self._check_inputs(X, in_fit=False, accept_sparse_negative=True, + copy=self.copy) self._check_is_fitted(X) return self._transform(X, inverse=True) @@ -2781,7 +2795,8 @@ def fit_transform(self, X, y=None): return self._fit(X, y, force_transform=True) def _fit(self, X, y=None, force_transform=False): - X = self._check_input(X, check_positive=True, check_method=True) + X = self._check_input(X, in_fit=True, check_positive=True, + check_method=True) if not self.copy and not force_transform: # if call from fit() X = X.copy() # force copy so that fit does not change X inplace @@ -2823,7 +2838,8 @@ def transform(self, X): The transformed data. """ check_is_fitted(self) - X = self._check_input(X, check_positive=True, check_shape=True) + X = self._check_input(X, in_fit=False, check_positive=True, + check_shape=True) transform_function = {'box-cox': boxcox, 'yeo-johnson': self._yeo_johnson_transform @@ -2869,7 +2885,7 @@ def inverse_transform(self, X): The original data """ check_is_fitted(self) - X = self._check_input(X, check_shape=True) + X = self._check_input(X, in_fit=False, check_shape=True) if self.standardize: X = self._scaler.inverse_transform(X) @@ -2974,7 +2990,7 @@ def _neg_log_likelihood(lmbda): # choosing bracket -2, 2 like for boxcox return optimize.brent(_neg_log_likelihood, brack=(-2, 2)) - def _check_input(self, X, check_positive=False, check_shape=False, + def _check_input(self, X, in_fit, check_positive=False, check_shape=False, check_method=False): """Validate the input before fit and transform. @@ -2992,8 +3008,8 @@ def _check_input(self, X, check_positive=False, check_shape=False, check_method : bool If True, check that the transformation method is valid. 
""" - X = check_array(X, ensure_2d=True, dtype=FLOAT_DTYPES, copy=self.copy, - force_all_finite='allow-nan') + X = self._validate_data(X, ensure_2d=True, dtype=FLOAT_DTYPES, + copy=self.copy, force_all_finite='allow-nan') with np.warnings.catch_warnings(): np.warnings.filterwarnings( diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 5a73bf5c7f845..67641601e06f5 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -137,7 +137,7 @@ def fit(self, X, y=None): ------- self """ - X = check_array(X, dtype='numeric') + X = self._validate_data(X, dtype='numeric') valid_encode = ('onehot', 'onehot-dense', 'ordinal') if self.encode not in valid_encode: diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 4aceaa08100f2..85ce3a1f845c1 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -92,7 +92,7 @@ def __init__(self, func=None, inverse_func=None, validate=False, def _check_input(self, X): if self.validate: - return check_array(X, accept_sparse=self.accept_sparse) + return self._validate_data(X, accept_sparse=self.accept_sparse) return X def _check_inverse_transform(self, X): diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 311a254239cf0..d18f3bf846901 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -354,7 +354,7 @@ def fit(self, X, y=None): self """ - X = check_array(X, accept_sparse=['csr', 'csc']) + X = self._validate_data(X, accept_sparse=['csr', 'csc']) n_samples, n_features = X.shape diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index d40b5008db33c..a84a9950aa3ac 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -223,7 +223,7 @@ def fit(self, X, y): ------- self : object """ - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) self.X_ = X check_classification_targets(y) diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 1f81987f78c52..662a4ffa24678 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -147,9 +147,9 @@ def fit(self, X, y, sample_weight=None): if callable(self.kernel): check_consistent_length(X, y) else: - X, y = check_X_y(X, y, dtype=np.float64, - order='C', accept_sparse='csr', - accept_large_sparse=False) + X, y = self._validate_data(X, y, dtype=np.float64, + order='C', accept_sparse='csr', + accept_large_sparse=False) y = self._validate_targets(y) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 1d96900555400..e04b904e294b2 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -217,9 +217,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) - X, y = check_X_y(X, y, accept_sparse='csr', - dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + accept_large_sparse=False) check_classification_targets(y) self.classes_ = np.unique(y) @@ -395,9 +395,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Penalty term must be positive; got (C=%r)" % self.C) - X, y = check_X_y(X, y, accept_sparse='csr', - dtype=np.float64, order="C", - accept_large_sparse=False) + X, y = self._validate_data(X, y, accept_sparse='csr', + dtype=np.float64, order="C", + 
                                   accept_large_sparse=False)

         penalty = 'l2'  # SVR only accepts l2 penalty
         self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
             X, y, self.C, self.fit_intercept, self.intercept_scaling,
diff --git a/sklearn/tests/test_dummy.py b/sklearn/tests/test_dummy.py
index 0d4addb48e64d..38abb0b158fd3 100644
--- a/sklearn/tests/test_dummy.py
+++ b/sklearn/tests/test_dummy.py
@@ -756,6 +756,17 @@ def test_dtype_of_classifier_probas(strategy):
     assert probas.dtype == np.float64


+@pytest.mark.filterwarnings("ignore:The default value of strategy.*")  # 0.24
+@pytest.mark.parametrize('Dummy', (DummyRegressor, DummyClassifier))
+def test_n_features_in_(Dummy):
+    X = [[1, 2]]
+    y = [0]
+    d = Dummy()
+    assert not hasattr(d, 'n_features_in_')
+    d.fit(X, y)
+    assert d.n_features_in_ is None
+
+
 @pytest.mark.parametrize("Dummy", (DummyRegressor, DummyClassifier))
 def test_outputs_2d_deprecation(Dummy):
     X = [[1, 2]]
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index bd9246269f0f4..b9c2e26abac61 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -34,6 +34,8 @@
 from sklearn.datasets import load_iris
 from sklearn.preprocessing import StandardScaler
 from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.experimental import enable_hist_gradient_boosting  # noqa
+from sklearn.ensemble import HistGradientBoostingClassifier

 iris = load_iris()

@@ -1161,6 +1163,49 @@ def test_verbose(est, method, pattern, capsys):
     assert re.match(pattern, capsys.readouterr().out)


+def test_n_features_in_pipeline():
+    # make sure pipelines delegate n_features_in_ to the first step
+
+    X = [[1, 2], [3, 4], [5, 6]]
+    y = [0, 1, 2]
+
+    ss = StandardScaler()
+    gbdt = HistGradientBoostingClassifier()
+    pipe = make_pipeline(ss, gbdt)
+    assert not hasattr(pipe, 'n_features_in_')
+    pipe.fit(X, y)
+    assert pipe.n_features_in_ == ss.n_features_in_ == 2
+
+    # if the first step has the n_features_in_ attribute then the pipeline
+    # also has it, even though the pipeline isn't fitted.
+    ss = StandardScaler()
+    gbdt = HistGradientBoostingClassifier()
+    pipe = make_pipeline(ss, gbdt)
+    ss.fit(X, y)
+    assert pipe.n_features_in_ == ss.n_features_in_ == 2
+    assert not hasattr(gbdt, 'n_features_in_')
+
+
+def test_n_features_in_feature_union():
+    # make sure FeatureUnion delegates n_features_in_ to the first transformer
+
+    X = [[1, 2], [3, 4], [5, 6]]
+    y = [0, 1, 2]
+
+    ss = StandardScaler()
+    fu = make_union(ss)
+    assert not hasattr(fu, 'n_features_in_')
+    fu.fit(X, y)
+    assert fu.n_features_in_ == ss.n_features_in_ == 2
+
+    # if the first transformer has the n_features_in_ attribute then the
+    # FeatureUnion also has it, even though it isn't fitted.
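These tests exercise the properties added to `pipeline.py` above: a Pipeline or FeatureUnion simply reports the `n_features_in_` of its first step. Outside the test suite that looks like this (a sketch, assuming scikit-learn >= 0.23)::

    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 1]

    pipe = make_pipeline(StandardScaler(), LogisticRegression())
    pipe.fit(X, y)
    # delegated to the first step, which saw the raw two-column X
    assert pipe.n_features_in_ == 2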
+    ss = StandardScaler()
+    fu = make_union(ss)
+    ss.fit(X, y)
+    assert fu.n_features_in_ == ss.n_features_in_ == 2
+
+
 def test_feature_union_fit_params():
     # Regression test for issue: #15117
     class Dummy(TransformerMixin, BaseEstimator):
diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index 41d09ead9aec4..09481aefeed41 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -146,7 +146,7 @@ def fit(self, X, y, sample_weight=None, check_input=True,
             raise ValueError("ccp_alpha must be greater than or equal to 0")

         if check_input:
-            X = check_array(X, dtype=DTYPE, accept_sparse="csc")
+            X = self._validate_data(X, dtype=DTYPE, accept_sparse="csc")
             y = check_array(y, ensure_2d=False, dtype=None)
             if issparse(X):
                 X.sort_indices()
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index d376f1edb3097..21060b3462520 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -281,6 +281,8 @@ def _yield_all_checks(name, estimator):
     yield check_dict_unchanged
     yield check_dont_overwrite_parameters
     yield check_fit_idempotent
+    if not tags["no_validation"]:
+        yield check_n_features_in
     if tags["requires_positive_X"]:
         yield check_fit_non_negative

@@ -2906,3 +2908,41 @@ def check_fit_idempotent(name, estimator_orig):
         atol=max(tol, 1e-9), rtol=max(tol, 1e-7),
         err_msg="Idempotency check failed for method {}".format(method)
     )
+
+
+def check_n_features_in(name, estimator_orig):
+    # Make sure that the n_features_in_ attribute doesn't exist until fit is
+    # called, and that its value is correct.
+
+    rng = np.random.RandomState(0)
+
+    estimator = clone(estimator_orig)
+    set_random_state(estimator)
+    if 'warm_start' in estimator.get_params():
+        estimator.set_params(warm_start=False)
+
+    n_samples = 100
+    X = rng.normal(loc=100, size=(n_samples, 2))
+    X = _pairwise_estimator_convert_X(X, estimator)
+    if is_regressor(estimator_orig):
+        y = rng.normal(size=n_samples)
+    else:
+        y = rng.randint(low=0, high=2, size=n_samples)
+    y = _enforce_estimator_tags_y(estimator, y)
+
+    assert not hasattr(estimator, 'n_features_in_')
+    estimator.fit(X, y)
+    if hasattr(estimator, 'n_features_in_'):
+        assert estimator.n_features_in_ == X.shape[1]
+    else:
+        warnings.warn(
+            "As of scikit-learn 0.23, estimators should expose an "
+            "n_features_in_ attribute, unless the 'no_validation' tag is "
+            "True. This attribute should be equal to the number of features "
+            "passed to the fit method. "
+            "An error will be raised from version 0.25 when calling "
+            "check_estimator(). 
" + "See SLEP010: " + "https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html", # noqa + FutureWarning + ) diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py index cc017b6667658..748666884e60e 100644 --- a/sklearn/utils/tests/test_estimator_checks.py +++ b/sklearn/utils/tests/test_estimator_checks.py @@ -60,7 +60,7 @@ def __init__(self, key=0): self.key = key def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self def predict(self, X): @@ -75,7 +75,7 @@ def __init__(self, acceptable_key=0): def fit(self, X, y=None): self.wrong_attribute = 0 - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -85,14 +85,14 @@ def __init__(self, wrong_attribute=0): def fit(self, X, y=None): self.wrong_attribute = 1 - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self class ChangesUnderscoreAttribute(BaseEstimator): def fit(self, X, y=None): self._good_attribute = 1 - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -109,7 +109,7 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -126,7 +126,7 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self @@ -145,19 +145,19 @@ def set_params(self, **kwargs): return super().set_params(**kwargs) def fit(self, X, y=None): - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self class NoCheckinPredict(BaseBadClassifier): def fit(self, X, y): - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) return self class NoSparseClassifier(BaseBadClassifier): def fit(self, X, y): - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc']) if sp.issparse(X): raise ValueError("Nonsensical Error") return self @@ -169,7 +169,7 @@ def predict(self, X): class CorrectNotFittedErrorClassifier(BaseBadClassifier): def fit(self, X, y): - X, y = check_X_y(X, y) + X, y = self._validate_data(X, y) self.coef_ = np.ones(X.shape[1]) return self @@ -182,10 +182,11 @@ def predict(self, X): class NoSampleWeightPandasSeriesType(BaseEstimator): def fit(self, X, y, sample_weight=None): # Convert data - X, y = check_X_y(X, y, - accept_sparse=("csr", "csc"), - multi_output=True, - y_numeric=True) + X, y = self._validate_data( + X, y, + accept_sparse=("csr", "csc"), + multi_output=True, + y_numeric=True) # Function is only called after we verify that pandas is installed from pandas import Series if isinstance(sample_weight, Series): @@ -222,7 +223,7 @@ def fit(self, X, y): class BadTransformerWithoutMixin(BaseEstimator): def fit(self, X, y=None): - X = check_array(X) + X = self._validate_data(X) return self def transform(self, X): @@ -233,10 +234,11 @@ def transform(self, X): class NotInvariantPredict(BaseEstimator): def fit(self, X, y): # Convert data - X, y = check_X_y(X, y, - accept_sparse=("csr", "csc"), - multi_output=True, - y_numeric=True) + X, y = self._validate_data( + X, y, + accept_sparse=("csr", "csc"), + multi_output=True, + y_numeric=True) return self def predict(self, X): @@ -249,11 +251,12 @@ def predict(self, X): class LargeSparseNotSupportedClassifier(BaseEstimator): def fit(self, X, y): - X, y = check_X_y(X, y, - accept_sparse=("csr", "csc", "coo"), - accept_large_sparse=True, - 
multi_output=True, - y_numeric=True) + X, y = self._validate_data( + X, y, + accept_sparse=("csr", "csc", "coo"), + accept_large_sparse=True, + multi_output=True, + y_numeric=True) if sp.issparse(X): if X.getformat() == "coo": if X.row.dtype == "int64" or X.col.dtype == "int64": @@ -268,7 +271,7 @@ def fit(self, X, y): class SparseTransformer(BaseEstimator): def fit(self, X, y=None): - self.X_shape_ = check_array(X).shape + self.X_shape_ = self._validate_data(X).shape return self def fit_transform(self, X, y=None): @@ -320,7 +323,7 @@ def _more_tags(self): class RequiresPositiveYRegressor(LinearRegression): def fit(self, X, y): - X, y = check_X_y(X, y, multi_output=True) + X, y = self._validate_data(X, y, multi_output=True) if (y <= 0).any(): raise ValueError('negative y values not supported!') return super().fit(X, y)
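For third-party estimator authors, the practical upshot of `check_n_features_in` is that replacing `check_X_y`/`check_array` with `self._validate_data`, as every fixture above does, is enough to satisfy the new common check. A minimal conforming classifier might look like this (hypothetical, not from the patch)::

    import numpy as np
    from sklearn.base import BaseEstimator, ClassifierMixin
    from sklearn.utils.validation import check_array, check_is_fitted

    class MostFrequentClassifier(ClassifierMixin, BaseEstimator):
        def fit(self, X, y):
            # _validate_data sets n_features_in_, so check_n_features_in
            # passes for this estimator
            X, y = self._validate_data(X, y)
            self.classes_, counts = np.unique(y, return_counts=True)
            self.most_frequent_ = self.classes_[np.argmax(counts)]
            return self

        def predict(self, X):
            check_is_fitted(self)
            X = check_array(X)
            return np.full(X.shape[0], self.most_frequent_)

    clf = MostFrequentClassifier().fit([[0, 1], [1, 1], [2, 0]], [0, 1, 1])
    assert clf.n_features_in_ == 2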