diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index c27565bc3968d..e043d21e538f6 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -99,6 +99,14 @@ Fixed models
   where the underlying check for an attribute did not work with NumPy arrays.
   :pr:`21145` by :user:`Zahlii <Zahlii>`.
 
+Miscellaneous
+.............
+
+- |Fix| Fitting an estimator on a dataset that has no feature names, that was previously
+  fitted on a dataset with feature names no longer keeps the old feature names stored in
+  the `feature_names_in_` attribute. :pr:`21389` by
+  :user:`Jérémie du Boisberranger <jeremiedbb>`.
+
 .. _changes_1_0:
 
 Version 1.0.0
diff --git a/sklearn/base.py b/sklearn/base.py
index 60fc82eff6088..f3074a7c0f2a5 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -421,6 +421,10 @@ def _check_feature_names(self, X, *, reset):
             feature_names_in = _get_feature_names(X)
             if feature_names_in is not None:
                 self.feature_names_in_ = feature_names_in
+            elif hasattr(self, "feature_names_in_"):
+                # Delete the attribute when the estimator is fitted on a new dataset
+                # that has no feature names.
+                delattr(self, "feature_names_in_")
             return
 
         fitted_feature_names = getattr(self, "feature_names_in_", None)
diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py
index 70b3a5028169b..01799048a7f6b 100644
--- a/sklearn/cluster/_agglomerative.py
+++ b/sklearn/cluster/_agglomerative.py
@@ -915,6 +915,22 @@ def fit(self, X, y=None):
             Returns the fitted instance.
         """
         X = self._validate_data(X, ensure_min_samples=2, estimator=self)
+        return self._fit(X)
+
+    def _fit(self, X):
+        """Fit without validation
+
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
+            Training instances to cluster, or distances between instances if
+            ``affinity='precomputed'``.
+
+        Returns
+        -------
+        self : object
+            Returns the fitted instance.
+        """
         memory = check_memory(self.memory)
 
         if self.n_clusters is not None and self.n_clusters <= 0:
@@ -1218,17 +1234,8 @@ def fit(self, X, y=None):
         self : object
             Returns the transformer.
         """
-        X = self._validate_data(
-            X,
-            accept_sparse=["csr", "csc", "coo"],
-            ensure_min_features=2,
-            estimator=self,
-        )
-        # save n_features_in_ attribute here to reset it after, because it will
-        # be overridden in AgglomerativeClustering since we passed it X.T.
-        n_features_in_ = self.n_features_in_
-        AgglomerativeClustering.fit(self, X.T)
-        self.n_features_in_ = n_features_in_
+        X = self._validate_data(X, ensure_min_features=2, estimator=self)
+        super()._fit(X.T)
         return self
 
     @property
diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py
index 99ecdd0317e89..a723e3451e24f 100644
--- a/sklearn/decomposition/_lda.py
+++ b/sklearn/decomposition/_lda.py
@@ -684,20 +684,6 @@ def _unnormalized_transform(self, X):
         doc_topic_distr : ndarray of shape (n_samples, n_components)
             Document topic distribution for X.
         """
-        check_is_fitted(self)
-
-        # make sure feature size is the same in fitted model and in X
-        X = self._check_non_neg_array(
-            X, reset_n_features=True, whom="LatentDirichletAllocation.transform"
-        )
-        n_samples, n_features = X.shape
-        if n_features != self.components_.shape[1]:
-            raise ValueError(
-                "The provided data has %d dimensions while "
-                "the model was trained with feature size %d."
-                % (n_features, self.components_.shape[1])
-            )
-
         doc_topic_distr, _ = self._e_step(X, cal_sstats=False, random_init=False)
 
         return doc_topic_distr
@@ -851,12 +837,6 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, sub_sampling=False)
         score : float
             Perplexity score.
         """
-        check_is_fitted(self)
-
-        X = self._check_non_neg_array(
-            X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity"
-        )
-
         if doc_topic_distr is None:
             doc_topic_distr = self._unnormalized_transform(X)
         else:
@@ -902,4 +882,8 @@ def perplexity(self, X, sub_sampling=False):
         score : float
             Perplexity score.
         """
+        check_is_fitted(self)
+        X = self._check_non_neg_array(
+            X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity"
+        )
         return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling)
diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py
index 091c5814ed80a..3dfb113c9279c 100644
--- a/sklearn/ensemble/_bagging.py
+++ b/sklearn/ensemble/_bagging.py
@@ -257,6 +257,15 @@ def fit(self, X, y, sample_weight=None):
         self : object
             Fitted estimator.
         """
+        # Convert data (X is required to be 2d and indexable)
+        X, y = self._validate_data(
+            X,
+            y,
+            accept_sparse=["csr", "csc"],
+            dtype=None,
+            force_all_finite=False,
+            multi_output=True,
+        )
         return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
 
     def _parallel_args(self):
@@ -295,15 +304,6 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None):
         """
         random_state = check_random_state(self.random_state)
 
-        # Convert data (X is required to be 2d and indexable)
-        X, y = self._validate_data(
-            X,
-            y,
-            accept_sparse=["csr", "csc"],
-            dtype=None,
-            force_all_finite=False,
-            multi_output=True,
-        )
         if sample_weight is not None:
             sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
 
diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py
index cb85da4dc851e..6b6342301769d 100644
--- a/sklearn/ensemble/_forest.py
+++ b/sklearn/ensemble/_forest.py
@@ -68,6 +68,7 @@ class calls the ``fit`` method of each sub-estimator on random samples
 from ..utils.fixes import _joblib_parallel_args
 from ..utils.multiclass import check_classification_targets, type_of_target
 from ..utils.validation import check_is_fitted, _check_sample_weight
+from ..utils.validation import _num_samples
 
 
 __all__ = [
@@ -2627,14 +2628,8 @@ def fit_transform(self, X, y=None, sample_weight=None):
         X_transformed : sparse matrix of shape (n_samples, n_out)
             Transformed dataset.
         """
-        X = self._validate_data(X, accept_sparse=["csc"])
-        if issparse(X):
-            # Pre-sort indices to avoid that each individual tree of the
-            # ensemble sorts the indices.
-            X.sort_indices()
-
         rnd = check_random_state(self.random_state)
-        y = rnd.uniform(size=X.shape[0])
+        y = rnd.uniform(size=_num_samples(X))
         super().fit(X, y, sample_weight=sample_weight)
 
         self.one_hot_encoder_ = OneHotEncoder(sparse=self.sparse_output)
diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py
index 1dcc81e3b988f..ef7c27dd82ad6 100644
--- a/sklearn/linear_model/_ridge.py
+++ b/sklearn/linear_model/_ridge.py
@@ -700,16 +700,6 @@ def fit(self, X, y, sample_weight=None):
             self.normalize, default=False, estimator_name=self.__class__.__name__
         )
 
-        _dtype = [np.float64, np.float32]
-        _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver)
-        X, y = self._validate_data(
-            X,
-            y,
-            accept_sparse=_accept_sparse,
-            dtype=_dtype,
-            multi_output=True,
-            y_numeric=True,
-        )
         if self.solver == "lbfgs" and not self.positive:
             raise ValueError(
                 "'lbfgs' solver can be used only when positive=True. "
@@ -1008,6 +998,15 @@ def fit(self, X, y, sample_weight=None):
         self : object
             Fitted estimator.
         """
+        _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver)
+        X, y = self._validate_data(
+            X,
+            y,
+            accept_sparse=_accept_sparse,
+            dtype=[np.float64, np.float32],
+            multi_output=True,
+            y_numeric=True,
+        )
         return super().fit(X, y, sample_weight=sample_weight)
 
diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py
index 58265db58a903..ce03fc3697566 100644
--- a/sklearn/linear_model/_stochastic_gradient.py
+++ b/sklearn/linear_model/_stochastic_gradient.py
@@ -648,19 +648,12 @@ def _fit(
     ):
         self._validate_params()
         if hasattr(self, "classes_"):
-            self.classes_ = None
-
-        X, y = self._validate_data(
-            X,
-            y,
-            accept_sparse="csr",
-            dtype=np.float64,
-            order="C",
-            accept_large_sparse=False,
-        )
+            # delete the attribute otherwise _partial_fit thinks it's not the first call
+            delattr(self, "classes_")
 
         # labels can be encoded as float, int, or string literals
         # np.unique sorts in asc order; largest class id is positive class
+        y = self._validate_data(y=y)
         classes = np.unique(y)
 
         if self.warm_start and hasattr(self, "coef_"):
diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py
index a994e080bbb2a..fe06957f5deed 100644
--- a/sklearn/model_selection/tests/test_successive_halving.py
+++ b/sklearn/model_selection/tests/test_successive_halving.py
@@ -665,7 +665,7 @@ def test_groups_support(Est):
     ]
     error_msg = "The 'groups' parameter should not be None."
     for cv in group_cvs:
-        gs = Est(clf, grid, cv=cv)
+        gs = Est(clf, grid, cv=cv, random_state=0)
         with pytest.raises(ValueError, match=error_msg):
             gs.fit(X, y)
         gs.fit(X, y, groups=groups)
diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py
index 52d552a181dac..0f43f39205ac0 100644
--- a/sklearn/naive_bayes.py
+++ b/sklearn/naive_bayes.py
@@ -241,7 +241,7 @@ def fit(self, X, y, sample_weight=None):
         self : object
             Returns the instance itself.
         """
-        X, y = self._validate_data(X, y)
+        y = self._validate_data(y=y)
         return self._partial_fit(
             X, y, np.unique(y), _refit=True, sample_weight=sample_weight
         )
diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py
index 36dc6064e4a46..f3b9c3e44ef38 100644
--- a/sklearn/tests/test_base.py
+++ b/sklearn/tests/test_base.py
@@ -638,6 +638,11 @@ def transform(self, X):
     trans = NoOpTransformer().fit(df)
     assert_array_equal(trans.feature_names_in_, df.columns)
 
+    # fit again but on ndarray does not keep the previous feature names (see #21383)
+    trans.fit(X_np)
+    assert not hasattr(trans, "feature_names_in_")
+
+    trans.fit(df)
     msg = "The feature names should match those that were passed"
     df_bad = pd.DataFrame(X_np, columns=iris.feature_names[::-1])
     with pytest.warns(FutureWarning, match=msg):
@@ -665,7 +670,7 @@ def transform(self, X):
     assert not record
 
     # fit on dataframe with no feature names or all integer feature names
-    # -> do not warn on trainsform
+    # -> do not warn on transform
     Xs = [X_np, df_int_names]
    for X in Xs:
        with pytest.warns(None) as record: