From d19c40df45493b63d6324547ce42d9a20dfd8147 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 21 Oct 2021 16:23:33 +0200 Subject: [PATCH 1/9] delete attr when fit on ndarray --- sklearn/base.py | 4 ++++ sklearn/tests/test_base.py | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index 60fc82eff6088..f3074a7c0f2a5 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -421,6 +421,10 @@ def _check_feature_names(self, X, *, reset): feature_names_in = _get_feature_names(X) if feature_names_in is not None: self.feature_names_in_ = feature_names_in + elif hasattr(self, "feature_names_in_"): + # Delete the attribute when the estimator is fitted on a new dataset + # that has no feature names. + delattr(self, "feature_names_in_") return fitted_feature_names = getattr(self, "feature_names_in_", None) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 36dc6064e4a46..19f2a583a2d2e 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -638,6 +638,11 @@ def transform(self, X): trans = NoOpTransformer().fit(df) assert_array_equal(trans.feature_names_in_, df.columns) + # fit again but on ndarray does not keep the previous feature names (see #21383) + trans.fit(df.to_numpy()) + assert not hasattr(trans, "feature_names_in_") + + trans.fit(df) msg = "The feature names should match those that were passed" df_bad = pd.DataFrame(X_np, columns=iris.feature_names[::-1]) with pytest.warns(FutureWarning, match=msg): @@ -665,7 +670,7 @@ def transform(self, X): assert not record # fit on dataframe with no feature names or all integer feature names - # -> do not warn on trainsform + # -> do not warn on transform Xs = [X_np, df_int_names] for X in Xs: with pytest.warns(None) as record: From 6994cf574da5c37a270bc55829a2beea3a7c99cf Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 21 Oct 2021 16:29:59 +0200 Subject: [PATCH 2/9] black --- 
sklearn/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 19f2a583a2d2e..eb25e9626ceb2 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -641,7 +641,7 @@ def transform(self, X): # fit again but on ndarray does not keep the previous feature names (see #21383) trans.fit(df.to_numpy()) assert not hasattr(trans, "feature_names_in_") - + trans.fit(df) msg = "The feature names should match those that were passed" df_bad = pd.DataFrame(X_np, columns=iris.feature_names[::-1]) From 5c90d548b4d54f9c1a2b0ded462c9e961ee9773f Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 21 Oct 2021 16:37:16 +0200 Subject: [PATCH 3/9] what's new --- doc/whats_new/v1.0.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index c27565bc3968d..e043d21e538f6 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -99,6 +99,14 @@ Fixed models where the underlying check for an attribute did not work with NumPy arrays. :pr:`21145` by :user:`Zahlii <Zahlii>`. +Miscellaneous +............. + +- |Fix| Fitting an estimator on a dataset without feature names, after it was previously + fitted on a dataset with feature names, no longer keeps the old feature names stored in + the `feature_names_in_` attribute. :pr:`21389` by + :user:`Jérémie du Boisberranger <jeremiedbb>`. + .. 
_changes_1_0: Version 1.0.0 From 842a73bafc2a1126c7a96dd6cbc5cc0f6378fd16 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 21 Oct 2021 16:38:17 +0200 Subject: [PATCH 4/9] iter --- sklearn/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index eb25e9626ceb2..f3b9c3e44ef38 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -639,7 +639,7 @@ def transform(self, X): assert_array_equal(trans.feature_names_in_, df.columns) # fit again but on ndarray does not keep the previous feature names (see #21383) - trans.fit(df.to_numpy()) + trans.fit(X_np) assert not hasattr(trans, "feature_names_in_") trans.fit(df) From dc82d4fd7c7a2e0abb84e5e6d820b6bcb8bee62d Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 22 Oct 2021 16:04:35 +0200 Subject: [PATCH 5/9] remove double validation --- sklearn/cluster/_agglomerative.py | 30 +++++++++++++------- sklearn/decomposition/_lda.py | 19 ++----------- sklearn/ensemble/_bagging.py | 18 ++++++------ sklearn/ensemble/_forest.py | 9 ++---- sklearn/linear_model/_ridge.py | 19 ++++++------- sklearn/linear_model/_stochastic_gradient.py | 11 +------ sklearn/naive_bayes.py | 1 - sklearn/utils/estimator_checks.py | 1 - 8 files changed, 43 insertions(+), 65 deletions(-) diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 70b3a5028169b..74141fa6ff447 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -23,6 +23,7 @@ from ..utils.fixes import _astype_copy_false from ..utils.graph import _fix_connected_components from ..utils.validation import check_memory +from ..utils.validation import _num_features # mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast' from . import _hierarchical_fast as _hierarchical # type: ignore @@ -915,6 +916,22 @@ def fit(self, X, y=None): Returns the fitted instance. 
""" X = self._validate_data(X, ensure_min_samples=2, estimator=self) + return self._fit(X) + + def _fit(self, X): + """Fit without validation + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples) + Training instances to cluster, or distances between instances if + ``affinity='precomputed'``. + + Returns + ------- + self : object + Returns the fitted instance. + """ memory = check_memory(self.memory) if self.n_clusters is not None and self.n_clusters <= 0: @@ -1218,17 +1235,8 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - X = self._validate_data( - X, - accept_sparse=["csr", "csc", "coo"], - ensure_min_features=2, - estimator=self, - ) - # save n_features_in_ attribute here to reset it after, because it will - # be overridden in AgglomerativeClustering since we passed it X.T. - n_features_in_ = self.n_features_in_ - AgglomerativeClustering.fit(self, X.T) - self.n_features_in_ = n_features_in_ + X = self._validate_data(X, ensure_min_features=2, estimator=self) + super()._fit(X.T) return self @property diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 99ecdd0317e89..0858024f6b3f4 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -686,18 +686,6 @@ def _unnormalized_transform(self, X): """ check_is_fitted(self) - # make sure feature size is the same in fitted model and in X - X = self._check_non_neg_array( - X, reset_n_features=True, whom="LatentDirichletAllocation.transform" - ) - n_samples, n_features = X.shape - if n_features != self.components_.shape[1]: - raise ValueError( - "The provided data has %d dimensions while " - "the model was trained with feature size %d." 
- % (n_features, self.components_.shape[1]) - ) - doc_topic_distr, _ = self._e_step(X, cal_sstats=False, random_init=False) return doc_topic_distr @@ -853,10 +841,6 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, sub_sampling=False) """ check_is_fitted(self) - X = self._check_non_neg_array( - X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity" - ) - if doc_topic_distr is None: doc_topic_distr = self._unnormalized_transform(X) else: @@ -902,4 +886,7 @@ def perplexity(self, X, sub_sampling=False): score : float Perplexity score. """ + X = self._check_non_neg_array( + X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity" + ) return self._perplexity_precomp_distr(X, sub_sampling=sub_sampling) diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index 091c5814ed80a..3dfb113c9279c 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -257,6 +257,15 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. 
""" + # Convert data (X is required to be 2d and indexable) + X, y = self._validate_data( + X, + y, + accept_sparse=["csr", "csc"], + dtype=None, + force_all_finite=False, + multi_output=True, + ) return self._fit(X, y, self.max_samples, sample_weight=sample_weight) def _parallel_args(self): @@ -295,15 +304,6 @@ def _fit(self, X, y, max_samples=None, max_depth=None, sample_weight=None): """ random_state = check_random_state(self.random_state) - # Convert data (X is required to be 2d and indexable) - X, y = self._validate_data( - X, - y, - accept_sparse=["csr", "csc"], - dtype=None, - force_all_finite=False, - multi_output=True, - ) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=None) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index cb85da4dc851e..6b6342301769d 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -68,6 +68,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..utils.fixes import _joblib_parallel_args from ..utils.multiclass import check_classification_targets, type_of_target from ..utils.validation import check_is_fitted, _check_sample_weight +from ..utils.validation import _num_samples __all__ = [ @@ -2627,14 +2628,8 @@ def fit_transform(self, X, y=None, sample_weight=None): X_transformed : sparse matrix of shape (n_samples, n_out) Transformed dataset. """ - X = self._validate_data(X, accept_sparse=["csc"]) - if issparse(X): - # Pre-sort indices to avoid that each individual tree of the - # ensemble sorts the indices. 
- X.sort_indices() - rnd = check_random_state(self.random_state) - y = rnd.uniform(size=X.shape[0]) + y = rnd.uniform(size=_num_samples(X)) super().fit(X, y, sample_weight=sample_weight) self.one_hot_encoder_ = OneHotEncoder(sparse=self.sparse_output) diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 1dcc81e3b988f..ef7c27dd82ad6 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -700,16 +700,6 @@ def fit(self, X, y, sample_weight=None): self.normalize, default=False, estimator_name=self.__class__.__name__ ) - _dtype = [np.float64, np.float32] - _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) - X, y = self._validate_data( - X, - y, - accept_sparse=_accept_sparse, - dtype=_dtype, - multi_output=True, - y_numeric=True, - ) if self.solver == "lbfgs" and not self.positive: raise ValueError( "'lbfgs' solver can be used only when positive=True. " @@ -1008,6 +998,15 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. 
""" + _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) + X, y = self._validate_data( + X, + y, + accept_sparse=_accept_sparse, + dtype=[np.float64, np.float32], + multi_output=True, + y_numeric=True, + ) return super().fit(X, y, sample_weight=sample_weight) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 58265db58a903..2926a36c8fb11 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -648,16 +648,7 @@ def _fit( ): self._validate_params() if hasattr(self, "classes_"): - self.classes_ = None - - X, y = self._validate_data( - X, - y, - accept_sparse="csr", - dtype=np.float64, - order="C", - accept_large_sparse=False, - ) + delattr(self, "classes_") # labels can be encoded as float, int, or string literals # np.unique sorts in asc order; largest class id is positive class diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 52d552a181dac..b7af6a5c5e059 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -241,7 +241,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - X, y = self._validate_data(X, y) return self._partial_fit( X, y, np.unique(y), _refit=True, sample_weight=sample_weight ) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index ccc6ff23ed8fc..1ba2949e4ee7a 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3732,7 +3732,6 @@ def check_dataframe_column_names_consistency(name, estimator_orig): set_random_state(estimator) X_orig = rng.normal(size=(150, 8)) - # Some picky estimators (e.g. SkewedChi2Sampler) only accept skewed positive data. 
X_orig -= X_orig.min() + 0.5 X_orig = _enforce_estimator_tags_x(estimator, X_orig) From f610d7f84d2d8b669601fa93e0d84d9462802e52 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 22 Oct 2021 16:07:40 +0200 Subject: [PATCH 6/9] flake8 --- sklearn/cluster/_agglomerative.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 74141fa6ff447..01799048a7f6b 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -23,7 +23,6 @@ from ..utils.fixes import _astype_copy_false from ..utils.graph import _fix_connected_components from ..utils.validation import check_memory -from ..utils.validation import _num_features # mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast' from . import _hierarchical_fast as _hierarchical # type: ignore From 9e400c8694688750f456b234a3c20ed581c349a9 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 22 Oct 2021 16:42:41 +0200 Subject: [PATCH 7/9] iter --- sklearn/decomposition/_lda.py | 5 +---- sklearn/utils/estimator_checks.py | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 0858024f6b3f4..a723e3451e24f 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -684,8 +684,6 @@ def _unnormalized_transform(self, X): doc_topic_distr : ndarray of shape (n_samples, n_components) Document topic distribution for X. """ - check_is_fitted(self) - doc_topic_distr, _ = self._e_step(X, cal_sstats=False, random_init=False) return doc_topic_distr @@ -839,8 +837,6 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, sub_sampling=False) score : float Perplexity score. """ - check_is_fitted(self) - if doc_topic_distr is None: doc_topic_distr = self._unnormalized_transform(X) else: @@ -886,6 +882,7 @@ def perplexity(self, X, sub_sampling=False): score : float Perplexity score. 
""" + check_is_fitted(self) X = self._check_non_neg_array( X, reset_n_features=True, whom="LatentDirichletAllocation.perplexity" ) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 1ba2949e4ee7a..ccc6ff23ed8fc 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -3732,6 +3732,7 @@ def check_dataframe_column_names_consistency(name, estimator_orig): set_random_state(estimator) X_orig = rng.normal(size=(150, 8)) + # Some picky estimators (e.g. SkewedChi2Sampler) only accept skewed positive data. X_orig -= X_orig.min() + 0.5 X_orig = _enforce_estimator_tags_x(estimator, X_orig) From 43904ff7e6fbb0be7a3031b5e1c83e3b2626d794 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 22 Oct 2021 17:56:18 +0200 Subject: [PATCH 8/9] iter --- sklearn/linear_model/_stochastic_gradient.py | 2 ++ sklearn/naive_bayes.py | 1 + 2 files changed, 3 insertions(+) diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 2926a36c8fb11..ce03fc3697566 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -648,10 +648,12 @@ def _fit( ): self._validate_params() if hasattr(self, "classes_"): + # delete the attribute otherwise _partial_fit thinks it's not the first call delattr(self, "classes_") # labels can be encoded as float, int, or string literals # np.unique sorts in asc order; largest class id is positive class + y = self._validate_data(y=y) classes = np.unique(y) if self.warm_start and hasattr(self, "coef_"): diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index b7af6a5c5e059..0f43f39205ac0 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -241,6 +241,7 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. 
""" + y = self._validate_data(y=y) return self._partial_fit( X, y, np.unique(y), _refit=True, sample_weight=sample_weight ) From fc443094224eb83fc62b8a2d2cde2fc0c4ba3ca5 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Sat, 23 Oct 2021 11:17:07 +0200 Subject: [PATCH 9/9] Make test_groups_support deterministic --- sklearn/model_selection/tests/test_successive_halving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/model_selection/tests/test_successive_halving.py b/sklearn/model_selection/tests/test_successive_halving.py index a994e080bbb2a..fe06957f5deed 100644 --- a/sklearn/model_selection/tests/test_successive_halving.py +++ b/sklearn/model_selection/tests/test_successive_halving.py @@ -665,7 +665,7 @@ def test_groups_support(Est): ] error_msg = "The 'groups' parameter should not be None." for cv in group_cvs: - gs = Est(clf, grid, cv=cv) + gs = Est(clf, grid, cv=cv, random_state=0) with pytest.raises(ValueError, match=error_msg): gs.fit(X, y) gs.fit(X, y, groups=groups)