diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 9cb6832204280..e336d4b18840a 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -239,6 +239,9 @@ Decomposition, manifold learning and clustering
   ``singular_values_``, like in :class:`decomposition.IncrementalPCA`.
   :issue:`7685` by :user:`Tommy Löfstedt `
 
+- Fixed the implementation of noise_variance_ in :class:`decomposition.PCA`.
+  :issue:`9108` by `Hanmin Qin `_.
+
 - :class:`decomposition.NMF` now faster when ``beta_loss=0``.
   :issue:`9277` by :user:`hongkahjun`.
 
diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py
index de447f1edd6aa..c0f1eb77b5f56 100644
--- a/sklearn/decomposition/pca.py
+++ b/sklearn/decomposition/pca.py
@@ -201,6 +201,9 @@ class PCA(_BasePCA):
     explained_variance_ : array, shape (n_components,)
         The amount of variance explained by each of the selected components.
 
+        Equal to the n_components largest eigenvalues
+        of the covariance matrix of X.
+
         .. versionadded:: 0.18
 
     explained_variance_ratio_ : array, shape (n_components,)
@@ -232,6 +235,9 @@ class PCA(_BasePCA):
         http://www.miketipping.com/papers/met-mppca.pdf. It is required to
         compute the estimated data covariance and score samples.
 
+        Equal to the average of (min(n_features, n_samples) - n_components)
+        smallest eigenvalues of the covariance matrix of X.
+
     References
     ----------
     For n_components == 'mle', this class uses the method of `Thomas P. Minka:
@@ -494,9 +500,10 @@ def _fit_truncated(self, X, n_components, svd_solver):
         self.explained_variance_ratio_ = \
             self.explained_variance_ / total_var.sum()
         self.singular_values_ = S.copy()  # Store the singular values.
-        if self.n_components_ < n_features:
+        if self.n_components_ < min(n_features, n_samples):
             self.noise_variance_ = (total_var.sum() -
                                     self.explained_variance_.sum())
+            self.noise_variance_ /= min(n_features, n_samples) - n_components
         else:
             self.noise_variance_ = 0.
 
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 34b63c0674335..6795013b0790a 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -529,6 +529,50 @@ def test_pca_score3():
     assert_true(ll.argmax() == 1)
 
 
+def test_pca_score_with_different_solvers():
+    digits = datasets.load_digits()
+    X_digits = digits.data
+
+    pca_dict = {svd_solver: PCA(n_components=30, svd_solver=svd_solver,
+                                random_state=0)
+                for svd_solver in solver_list}
+
+    for pca in pca_dict.values():
+        pca.fit(X_digits)
+        # Sanity check for the noise_variance_. For more details see
+        # https://github.com/scikit-learn/scikit-learn/issues/7568
+        # https://github.com/scikit-learn/scikit-learn/issues/8541
+        # https://github.com/scikit-learn/scikit-learn/issues/8544
+        assert np.all((pca.explained_variance_ - pca.noise_variance_) >= 0)
+
+    # Compare scores with different svd_solvers
+    score_dict = {svd_solver: pca.score(X_digits)
+                  for svd_solver, pca in pca_dict.items()}
+    assert_almost_equal(score_dict['full'], score_dict['arpack'])
+    assert_almost_equal(score_dict['full'], score_dict['randomized'],
+                        decimal=3)
+
+
+def test_pca_zero_noise_variance_edge_cases():
+    # Ensure that noise_variance_ is 0 in the edge case
+    # n_components == min(n_samples, n_features).
+    n, p = 100, 3
+
+    rng = np.random.RandomState(0)
+    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
+    # arpack raises ValueError for n_components == min(n_samples,
+    # n_features)
+    svd_solvers = ['full', 'randomized']
+
+    for svd_solver in svd_solvers:
+        pca = PCA(svd_solver=svd_solver, n_components=p)
+        pca.fit(X)
+        assert pca.noise_variance_ == 0
+
+        pca.fit(X.T)
+        assert pca.noise_variance_ == 0
+
+
 def test_svd_solver_auto():
     rng = np.random.RandomState(0)
     X = rng.uniform(size=(1000, 50))
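
To make the behaviour being fixed concrete, here is a minimal sketch (not part of the patch; the random data, the sizes, and the use of np.cov/np.linalg.eigvalsh are illustrative assumptions) of the invariant the new test enforces: noise_variance_ should be the average, not the sum, of the eigenvalues discarded by the truncation, matching the Tipping & Bishop (1999) probabilistic PCA model that PCA.score relies on.

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
n_samples, n_features, n_components = 100, 20, 5
X = rng.randn(n_samples, n_features)

pca = PCA(n_components=n_components, svd_solver='full').fit(X)

# Eigenvalues of the empirical covariance matrix, sorted largest first.
# np.cov uses the same 1 / (n_samples - 1) scaling as PCA.
eigenvalues = np.sort(np.linalg.eigvalsh(np.cov(X, rowvar=False)))[::-1]

# explained_variance_ holds the n_components largest eigenvalues ...
assert np.allclose(pca.explained_variance_, eigenvalues[:n_components])
# ... and noise_variance_ is the mean of the remaining
# min(n_features, n_samples) - n_components eigenvalues.
assert np.isclose(pca.noise_variance_, eigenvalues[n_components:].mean())

Before this patch, _fit_truncated stored the sum of the discarded eigenvalues rather than their mean, so the 'arpack' and 'randomized' solvers disagreed with 'full' on noise_variance_ and hence on score; the added division by min(n_features, n_samples) - n_components restores the property checked above for all solvers.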