Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Remove unnecessary restriction on number of samples in IncrementalPCA #30224

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
- :class:`~sklearn.decomposition.IncrementalPCA`
will now only raise a ``ValueError`` when the number of samples in the
input data to ``partial_fit`` is less than the number of components
on the first call to ``partial_fit``. Subsequent calls to ``partial_fit``
no longer face this restriction.
By :user:`Thomas Gessey-Jones <ThomasGesseyJonesPX>`
8 changes: 4 additions & 4 deletions sklearn/decomposition/_incremental_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,11 +306,11 @@ def partial_fit(self, X, y=None, check_input=True):
"more rows than columns for IncrementalPCA "
"processing" % (self.n_components, n_features)
)
elif not self.n_components <= n_samples:
elif self.n_components > n_samples and first_pass:
raise ValueError(
"n_components=%r must be less or equal to "
"the batch number of samples "
"%d." % (self.n_components, n_samples)
f"n_components={self.n_components} must be less or equal to "
f"the batch number of samples {n_samples} for the first "
"partial_fit call."
)
else:
self.n_components_ = self.n_components
Expand Down
28 changes: 24 additions & 4 deletions sklearn/decomposition/tests/test_incremental_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,14 +139,13 @@ def test_incremental_pca_validation():
):
IncrementalPCA(n_components, batch_size=10).fit(X)

# Tests that n_components is also <= n_samples.
# Test that n_components is also <= n_samples in first call to partial fit.
n_components = 3
with pytest.raises(
ValueError,
match=(
"n_components={} must be"
" less or equal to the batch number of"
" samples {}".format(n_components, n_samples)
f"n_components={n_components} must be less or equal to the batch "
f"number of samples {n_samples} for the first partial_fit call."
),
):
IncrementalPCA(n_components=n_components).partial_fit(X)
Expand Down Expand Up @@ -233,6 +232,27 @@ def test_incremental_pca_batch_signs():
assert_almost_equal(np.sign(i), np.sign(j), decimal=6)


def test_incremental_pca_partial_fit_small_batch():
    # Non-regression test: once the first partial_fit call has been made,
    # later partial_fit calls may receive batches smaller than n_components.
    #
    # The incremental model is fed an initial batch of n_components rows,
    # then every remaining row as its own single-sample batch, and its
    # components_ are compared with those of a PCA fitted on the full data.
    random_state = np.random.RandomState(1999)
    n_samples, n_features = 50, 3
    data = random_state.randn(n_samples, n_features)  # spherical data
    data[:, 1] *= 0.00001  # shrink the middle column relative to the others
    data += [5, 4, 3]  # shift the data so its mean is large

    n_components = n_features
    incremental = IncrementalPCA(n_components=n_components)

    # First call: a batch of exactly n_components rows.
    incremental.partial_fit(data[:n_components])

    # Subsequent calls: one sample per batch, kept 2-D with shape (1, n_features).
    for row in data[n_components:]:
        incremental.partial_fit(row.reshape(1, -1))

    # Reference decomposition computed on the whole dataset at once.
    reference = PCA(n_components=n_components)
    reference.fit(data)

    assert_allclose(reference.components_, incremental.components_, atol=1e-3)


def test_incremental_pca_batch_values():
# Test that components_ values are stable over batch sizes.
rng = np.random.RandomState(1999)
Expand Down