From 1490969ff271d1d8b7d7d404455a0c4a8f6542bf Mon Sep 17 00:00:00 2001 From: Thomas Gessey-Jones Date: Mon, 4 Nov 2024 17:21:51 +0000 Subject: [PATCH 1/8] Enforce minimum batch size only on first partial_fit call in IncrementalPCA --- sklearn/decomposition/_incremental_pca.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index b2caf81aa9793..bc0fb62836ee7 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -306,11 +306,11 @@ def partial_fit(self, X, y=None, check_input=True): "more rows than columns for IncrementalPCA " "processing" % (self.n_components, n_features) ) - elif not self.n_components <= n_samples: + elif self.n_components > n_samples and first_pass: raise ValueError( - "n_components=%r must be less or equal to " - "the batch number of samples " - "%d." % (self.n_components, n_samples) + "n_components=%r must be less or equal to the batch number " + "of samples %d for the first partial_fit " + "call." % (self.n_components, n_samples) ) else: self.n_components_ = self.n_components From ae39244550d045d117f95d8d4f7676a8ff837321 Mon Sep 17 00:00:00 2001 From: Thomas Gessey-Jones Date: Mon, 4 Nov 2024 17:51:56 +0000 Subject: [PATCH 2/8] Add non-regression test --- .../tests/test_incremental_pca.py | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/tests/test_incremental_pca.py b/sklearn/decomposition/tests/test_incremental_pca.py index 50ddf39b04503..2bd3f8e731076 100644 --- a/sklearn/decomposition/tests/test_incremental_pca.py +++ b/sklearn/decomposition/tests/test_incremental_pca.py @@ -139,14 +139,15 @@ def test_incremental_pca_validation(): ): IncrementalPCA(n_components, batch_size=10).fit(X) - # Tests that n_components is also <= n_samples. 
+ # Tests that n_components is also <= n_samples in first call to partial fit n_components = 3 with pytest.raises( ValueError, match=( "n_components={} must be" " less or equal to the batch number of" - " samples {}".format(n_components, n_samples) + " samples {} for the first partial_fit" + " call.".format(n_components, n_samples) ), ): IncrementalPCA(n_components=n_components).partial_fit(X) @@ -233,6 +234,27 @@ def test_incremental_pca_batch_signs(): assert_almost_equal(np.sign(i), np.sign(j), decimal=6) +def test_incremental_pca_partial_fit_small_batch(): + # Test that there is no minimum batch size after the first partial_fit + # Non-regression test + rng = np.random.RandomState(1999) + n, p = 50, 3 + X = rng.randn(n, p) # spherical data + X[:, 1] *= 0.00001 # make middle component relatively small + X += [5, 4, 3] # make a large mean + + n_components = p + pipca = IncrementalPCA(n_components=n_components) + pipca.partial_fit(X[:n_components]) + for idx in range(n_components, n): + pipca.partial_fit(X[idx : idx + 1]) + + pca = PCA(n_components=n_components) + pca.fit(X) + + assert_almost_equal(pca.components_, pipca.components_, decimal=3) + + def test_incremental_pca_batch_values(): # Test that components_ values are stable over batch sizes. 
rng = np.random.RandomState(1999) From 55ac444d7487747cb08dbd1941b282351205d0e0 Mon Sep 17 00:00:00 2001 From: Thomas Gessey-Jones Date: Tue, 5 Nov 2024 13:59:22 +0000 Subject: [PATCH 3/8] Linting --- sklearn/decomposition/_incremental_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index bc0fb62836ee7..1b86104cc81f2 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -309,7 +309,7 @@ def partial_fit(self, X, y=None, check_input=True): elif self.n_components > n_samples and first_pass: raise ValueError( "n_components=%r must be less or equal to the batch number " - "of samples %d for the first partial_fit " + "of samples %d for the first partial_fit " "call." % (self.n_components, n_samples) ) else: From 66c4a22d29962d3c738812019f2239e3a6aa12d7 Mon Sep 17 00:00:00 2001 From: Thomas Gessey-Jones Date: Tue, 5 Nov 2024 14:24:09 +0000 Subject: [PATCH 4/8] Changelog --- .../upcoming_changes/sklearn.decomposition/30211.fix.rst | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 doc/whats_new/upcoming_changes/sklearn.decomposition/30211.fix.rst diff --git a/doc/whats_new/upcoming_changes/sklearn.decomposition/30211.fix.rst b/doc/whats_new/upcoming_changes/sklearn.decomposition/30211.fix.rst new file mode 100644 index 0000000000000..e325431c6e88f --- /dev/null +++ b/doc/whats_new/upcoming_changes/sklearn.decomposition/30211.fix.rst @@ -0,0 +1,6 @@ +- :class:`~sklearn.decomposition.IncrementalPCA` + will now only raise a ``ValueError`` when the number of samples in the + input data to ``partial_fit`` is less than the number of components + on the first call to ``partial_fit``. Subsequent calls to ``partial_fit`` + no longer face this restriction. 
+ By :user:`Thomas Gessey-Jones <ThomasGesseyJonesPX>` From b52e8f7efb1c60b139eb02c04a85d2894981ef2e Mon Sep 17 00:00:00 2001 From: Thomas Gessey-Jones Date: Tue, 5 Nov 2024 14:27:07 +0000 Subject: [PATCH 5/8] Correct changelog file name --- .../sklearn.decomposition/{30211.fix.rst => 30224.fix.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename doc/whats_new/upcoming_changes/sklearn.decomposition/{30211.fix.rst => 30224.fix.rst} (100%) diff --git a/doc/whats_new/upcoming_changes/sklearn.decomposition/30211.fix.rst b/doc/whats_new/upcoming_changes/sklearn.decomposition/30224.fix.rst similarity index 100% rename from doc/whats_new/upcoming_changes/sklearn.decomposition/30211.fix.rst rename to doc/whats_new/upcoming_changes/sklearn.decomposition/30224.fix.rst From a96dd38497ecf9047aadbd4815bb4735c365e935 Mon Sep 17 00:00:00 2001 From: Thomas Gessey-Jones Date: Tue, 5 Nov 2024 18:03:04 +0000 Subject: [PATCH 6/8] Use assert_allclose instead of assert_almost_equal --- sklearn/decomposition/tests/test_incremental_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_incremental_pca.py b/sklearn/decomposition/tests/test_incremental_pca.py index 2bd3f8e731076..ba04720a2a819 100644 --- a/sklearn/decomposition/tests/test_incremental_pca.py +++ b/sklearn/decomposition/tests/test_incremental_pca.py @@ -252,7 +252,7 @@ def test_incremental_pca_partial_fit_small_batch(): pca = PCA(n_components=n_components) pca.fit(X) - assert_almost_equal(pca.components_, pipca.components_, decimal=3) + assert_allclose(pca.components_, pipca.components_, atol=1e-3) def test_incremental_pca_batch_values(): From 5996d430bd4c5b98f8ec9b407554d806b6413ef7 Mon Sep 17 00:00:00 2001 From: Thomas Gessey-Jones Date: Tue, 5 Nov 2024 18:08:15 +0000 Subject: [PATCH 7/8] Use f strings in error messages --- sklearn/decomposition/_incremental_pca.py | 6 +++--- sklearn/decomposition/tests/test_incremental_pca.py | 6 ++---- 2 files changed, 5 insertions(+), 7 
deletions(-) diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index 1b86104cc81f2..8fda4ddd1470f 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -308,9 +308,9 @@ def partial_fit(self, X, y=None, check_input=True): ) elif self.n_components > n_samples and first_pass: raise ValueError( - "n_components=%r must be less or equal to the batch number " - "of samples %d for the first partial_fit " - "call." % (self.n_components, n_samples) + f"n_components={self.n_components} must be less or equal to " + f"the batch number of samples {n_samples} for the first " + "partial_fit call." ) else: self.n_components_ = self.n_components diff --git a/sklearn/decomposition/tests/test_incremental_pca.py b/sklearn/decomposition/tests/test_incremental_pca.py index ba04720a2a819..429c6ace263e0 100644 --- a/sklearn/decomposition/tests/test_incremental_pca.py +++ b/sklearn/decomposition/tests/test_incremental_pca.py @@ -144,10 +144,8 @@ def test_incremental_pca_validation(): with pytest.raises( ValueError, match=( - "n_components={} must be" - " less or equal to the batch number of" - " samples {} for the first partial_fit" - " call.".format(n_components, n_samples) + f"n_components={n_components} must be less or equal to the batch " + f"number of samples {n_samples} for the first partial_fit call." 
), ): IncrementalPCA(n_components=n_components).partial_fit(X) From c7528373803b482d27751dfe9670ce505326dd4f Mon Sep 17 00:00:00 2001 From: Thomas Gessey-Jones Date: Tue, 5 Nov 2024 18:10:44 +0000 Subject: [PATCH 8/8] Correct grammar in comment --- sklearn/decomposition/tests/test_incremental_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_incremental_pca.py b/sklearn/decomposition/tests/test_incremental_pca.py index 429c6ace263e0..e12be7337cbb3 100644 --- a/sklearn/decomposition/tests/test_incremental_pca.py +++ b/sklearn/decomposition/tests/test_incremental_pca.py @@ -139,7 +139,7 @@ def test_incremental_pca_validation(): ): IncrementalPCA(n_components, batch_size=10).fit(X) - # Tests that n_components is also <= n_samples in first call to partial fit + # Test that n_components is also <= n_samples in first call to partial fit. n_components = 3 with pytest.raises( ValueError,