diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 86c6f7c26ca44..258dfe19b33cb 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -41,12 +41,16 @@ Bug fixes
 
 Decomposition, manifold learning and clustering
 
+- Fix for uninformative error in :class:`decomposition.IncrementalPCA`:
+  now an error is raised if the number of components is larger than the
+  chosen batch size. The ``n_components=None`` case was adapted accordingly.
+  :issue:`6452`. By :user:`Wally Gauze `.
+
 - Fixed a bug where the ``partial_fit`` method of
   :class:`decomposition.IncrementalPCA` used integer division instead of float
   division on Python 2 versions. :issue:`9492` by
   :user:`James Bourbeau `.
 
-
 
 Version 0.19
 ============
diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py
index f381dd76d64cc..f0604001fab53 100644
--- a/sklearn/decomposition/incremental_pca.py
+++ b/sklearn/decomposition/incremental_pca.py
@@ -211,11 +211,18 @@ def partial_fit(self, X, y=None, check_input=True):
             self.components_ = None
 
         if self.n_components is None:
-            self.n_components_ = n_features
+            if self.components_ is None:
+                self.n_components_ = min(n_samples, n_features)
+            else:
+                self.n_components_ = self.components_.shape[0]
         elif not 1 <= self.n_components <= n_features:
             raise ValueError("n_components=%r invalid for n_features=%d, need "
                              "more rows than columns for IncrementalPCA "
                              "processing" % (self.n_components, n_features))
+        elif not self.n_components <= n_samples:
+            raise ValueError("n_components=%r must be less or equal to "
+                             "the batch number of samples "
+                             "%d." % (self.n_components, n_samples))
         else:
             self.n_components_ = self.n_components
 
diff --git a/sklearn/decomposition/tests/test_incremental_pca.py b/sklearn/decomposition/tests/test_incremental_pca.py
index f9772e84706cc..f6f39db22c944 100644
--- a/sklearn/decomposition/tests/test_incremental_pca.py
+++ b/sklearn/decomposition/tests/test_incremental_pca.py
@@ -4,6 +4,7 @@
 from sklearn.utils.testing import assert_almost_equal
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_raises_regex
 
 from sklearn import datasets
 from sklearn.decomposition import PCA, IncrementalPCA
@@ -73,10 +74,41 @@ def test_incremental_pca_inverse():
 
 def test_incremental_pca_validation():
     # Test that n_components is >=1 and <= n_features.
-    X = [[0, 1], [1, 0]]
-    for n_components in [-1, 0, .99, 3]:
-        assert_raises(ValueError, IncrementalPCA(n_components,
-                                                 batch_size=10).fit, X)
+    X = np.array([[0, 1, 0], [1, 0, 0]])
+    n_samples, n_features = X.shape
+    for n_components in [-1, 0, .99, 4]:
+        assert_raises_regex(ValueError,
+                            "n_components={} invalid for n_features={}, need"
+                            " more rows than columns for IncrementalPCA "
+                            "processing".format(n_components, n_features),
+                            IncrementalPCA(n_components, batch_size=10).fit, X)
+
+    # Tests that n_components is also <= n_samples.
+    n_components = 3
+    assert_raises_regex(ValueError,
+                        "n_components={} must be less or equal to "
+                        "the batch number of samples {}".format(
+                            n_components, n_samples),
+                        IncrementalPCA(
+                            n_components=n_components).partial_fit, X)
+
+
+def test_n_components_none():
+    # Ensures that n_components == None is handled correctly
+    rng = np.random.RandomState(1999)
+    for n_samples, n_features in [(50, 10), (10, 50)]:
+        X = rng.rand(n_samples, n_features)
+        ipca = IncrementalPCA(n_components=None)
+
+        # First partial_fit call, ipca.n_components_ is inferred from
+        # min(X.shape)
+        ipca.partial_fit(X)
+        assert ipca.n_components_ == min(X.shape)
+
+        # Second partial_fit call, ipca.n_components_ is inferred from
+        # ipca.components_ computed from the first partial_fit call
+        ipca.partial_fit(X)
+        assert ipca.n_components_ == ipca.components_.shape[0]
 
 
 def test_incremental_pca_set_params():
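
For reviewers, a minimal usage sketch (not part of the patch) of the behaviour these changes introduce, assuming a scikit-learn build that includes this fix:

    import numpy as np
    from sklearn.decomposition import IncrementalPCA

    X = np.random.RandomState(0).rand(2, 3)  # one batch: 2 samples, 3 features

    # Asking for more components than the batch has samples now fails up front
    # with a descriptive message, instead of the uninformative error reported
    # in issue 6452.
    try:
        IncrementalPCA(n_components=3).partial_fit(X)
    except ValueError as exc:
        print(exc)
        # n_components=3 must be less or equal to the batch number of samples 2.

    # With n_components=None, the first partial_fit call infers the number of
    # components from the batch shape rather than from n_features alone.
    ipca = IncrementalPCA(n_components=None)
    ipca.partial_fit(X)
    print(ipca.n_components_)  # 2, i.e. min(n_samples, n_features)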