diff --git a/doc/whats_new/upcoming_changes/sklearn.decomposition/30224.fix.rst b/doc/whats_new/upcoming_changes/sklearn.decomposition/30224.fix.rst
new file mode 100644
index 0000000000000..e325431c6e88f
--- /dev/null
+++ b/doc/whats_new/upcoming_changes/sklearn.decomposition/30224.fix.rst
@@ -0,0 +1,6 @@
+- :class:`~sklearn.decomposition.IncrementalPCA`
+  will now only raise a ``ValueError`` when the number of samples in the
+  input data to ``partial_fit`` is less than the number of components
+  on the first call to ``partial_fit``. Subsequent calls to ``partial_fit``
+  no longer face this restriction.
+  By :user:`Thomas Gessey-Jones `
diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py
index b2caf81aa9793..8fda4ddd1470f 100644
--- a/sklearn/decomposition/_incremental_pca.py
+++ b/sklearn/decomposition/_incremental_pca.py
@@ -306,11 +306,11 @@ def partial_fit(self, X, y=None, check_input=True):
                 "more rows than columns for IncrementalPCA "
                 "processing" % (self.n_components, n_features)
             )
-        elif not self.n_components <= n_samples:
+        elif self.n_components > n_samples and first_pass:
             raise ValueError(
-                "n_components=%r must be less or equal to "
-                "the batch number of samples "
-                "%d." % (self.n_components, n_samples)
+                f"n_components={self.n_components} must be less or equal to "
+                f"the batch number of samples {n_samples} for the first "
+                "partial_fit call."
             )
         else:
             self.n_components_ = self.n_components
diff --git a/sklearn/decomposition/tests/test_incremental_pca.py b/sklearn/decomposition/tests/test_incremental_pca.py
index 50ddf39b04503..e12be7337cbb3 100644
--- a/sklearn/decomposition/tests/test_incremental_pca.py
+++ b/sklearn/decomposition/tests/test_incremental_pca.py
@@ -139,14 +139,13 @@ def test_incremental_pca_validation():
     ):
         IncrementalPCA(n_components, batch_size=10).fit(X)

-    # Tests that n_components is also <= n_samples.
+    # Test that n_components is also <= n_samples in first call to partial fit.
     n_components = 3
     with pytest.raises(
         ValueError,
         match=(
-            "n_components={} must be"
-            " less or equal to the batch number of"
-            " samples {}".format(n_components, n_samples)
+            f"n_components={n_components} must be less or equal to the batch "
+            f"number of samples {n_samples} for the first partial_fit call."
         ),
     ):
         IncrementalPCA(n_components=n_components).partial_fit(X)
@@ -233,6 +232,27 @@ def test_incremental_pca_batch_signs():
         assert_almost_equal(np.sign(i), np.sign(j), decimal=6)


+def test_incremental_pca_partial_fit_small_batch():
+    # Test that there is no minimum batch size after the first partial_fit
+    # Non-regression test
+    rng = np.random.RandomState(1999)
+    n, p = 50, 3
+    X = rng.randn(n, p)  # spherical data
+    X[:, 1] *= 0.00001  # make middle component relatively small
+    X += [5, 4, 3]  # make a large mean
+
+    n_components = p
+    pipca = IncrementalPCA(n_components=n_components)
+    pipca.partial_fit(X[:n_components])
+    for idx in range(n_components, n):
+        pipca.partial_fit(X[idx : idx + 1])
+
+    pca = PCA(n_components=n_components)
+    pca.fit(X)
+
+    assert_allclose(pca.components_, pipca.components_, atol=1e-3)
+
+
 def test_incremental_pca_batch_values():
     # Test that components_ values are stable over batch sizes.
     rng = np.random.RandomState(1999)
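
For context (not part of the patch): a minimal usage sketch of the behavior this change enables, assuming a scikit-learn build that includes the fix. Only the first ``partial_fit`` batch must contain at least ``n_components`` samples; subsequent batches may be arbitrarily small, e.g. single samples streamed one at a time, mirroring the new non-regression test above.

import numpy as np
from sklearn.decomposition import IncrementalPCA

rng = np.random.RandomState(0)
X = rng.randn(50, 3)

ipca = IncrementalPCA(n_components=3)
ipca.partial_fit(X[:3])  # first call: batch must have at least n_components samples
for row in X[3:]:
    ipca.partial_fit(row[None, :])  # later calls: single-sample batches are accepted
print(ipca.components_.shape)  # (3, 3)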