diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index 24cb1649c5fee..9cf646bf28c9f 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -152,7 +152,7 @@ class PCA(_BasePCA):
         number of components such that the amount of variance that needs to be
         explained is greater than the percentage specified by n_components.
 
-        If ``svd_solver == 'arpack'``, the number of components must be
+        If ``svd_solver in {'arpack', 'lobpcg'}``, the number of components must be
         strictly less than the minimum of n_features and n_samples.
 
         Hence, the None case results in::
@@ -174,7 +174,7 @@ class PCA(_BasePCA):
         improve the predictive accuracy of the downstream estimators by
         making their data respect some hard-wired assumptions.
 
-    svd_solver : {'auto', 'full', 'covariance_eigh', 'arpack', 'randomized'},\
+    svd_solver : {'auto', 'full', 'covariance_eigh', 'arpack', 'lobpcg', 'randomized'},\
             default='auto'
         "auto" :
             The solver is selected by a default 'auto' policy is based on `X.shape` and
@@ -200,7 +200,8 @@ class PCA(_BasePCA):
             therefore less numerical stable (e.g. on input data with a large
             range of singular values).
         "arpack" :
-            Run SVD truncated to `n_components` calling ARPACK solver via
+        "lobpcg" :
+            Run SVD truncated to `n_components` calling ARPACK or LOBPCG solver via
             `scipy.sparse.linalg.svds`. It requires strictly
             `0 < n_components < min(X.shape)`
         "randomized" :
@@ -332,7 +333,7 @@ class PCA(_BasePCA):
     <http://www.miketipping.com/papers/met-mppca.pdf>`_
     via the score and score_samples methods.
 
-    For svd_solver == 'arpack', refer to `scipy.sparse.linalg.svds`.
+    For ``svd_solver in {'arpack', 'lobpcg'}``, refer to `scipy.sparse.linalg.svds`.
 
     For svd_solver == 'randomized', see:
     :doi:`Halko, N., Martinsson, P. G., and Tropp, J. A. (2011).
@@ -386,7 +387,9 @@ class PCA(_BasePCA):
         "copy": ["boolean"],
         "whiten": ["boolean"],
         "svd_solver": [
-            StrOptions({"auto", "full", "covariance_eigh", "arpack", "randomized"})
+            StrOptions(
+                {"auto", "full", "covariance_eigh", "arpack", "lobpcg", "randomized"}
+            )
         ],
         "tol": [Interval(Real, 0, None, closed="left")],
         "iterated_power": [
@@ -485,15 +488,20 @@ def _fit(self, X):
         xp, is_array_api_compliant = get_namespace(X)
 
         # Raise an error for sparse input and unsupported svd_solver
-        if issparse(X) and self.svd_solver not in ["auto", "arpack", "covariance_eigh"]:
+        if issparse(X) and self.svd_solver not in {
+            "auto",
+            "arpack",
+            "lobpcg",
+            "covariance_eigh",
+        }:
             raise TypeError(
-                'PCA only support sparse inputs with the "arpack" and'
+                'PCA only support sparse inputs with the "arpack", "lobpcg", and'
                 f' "covariance_eigh" solvers, while "{self.svd_solver}" was passed. See'
                 " TruncatedSVD for a possible alternative."
             )
-        if self.svd_solver == "arpack" and is_array_api_compliant:
+        if self.svd_solver in {"arpack", "lobpcg"} and is_array_api_compliant:
             raise ValueError(
-                "PCA with svd_solver='arpack' is not supported for Array API inputs."
+                f"PCA with {self.svd_solver=} is not supported for Array API inputs."
             )
 
         # Validate the data, without ever forcing a copy as any solver that
@@ -516,7 +524,7 @@ def _fit(self, X):
             self._fit_svd_solver = "arpack"
 
         if self.n_components is None:
-            if self._fit_svd_solver != "arpack":
+            if self._fit_svd_solver not in {"arpack", "lobpcg"}:
                 n_components = min(X.shape)
             else:
                 n_components = min(X.shape) - 1
@@ -540,7 +548,7 @@ def _fit(self, X):
         # Call different fits for either full or truncated SVD
         if self._fit_svd_solver in ("full", "covariance_eigh"):
             return self._fit_full(X, n_components, xp, is_array_api_compliant)
-        elif self._fit_svd_solver in ["arpack", "randomized"]:
+        elif self._fit_svd_solver in {"arpack", "lobpcg", "randomized"}:
             return self._fit_truncated(X, n_components, xp)
 
     def _fit_full(self, X, n_components, xp, is_array_api_compliant):
@@ -704,7 +712,7 @@ def _fit_full(self, X, n_components, xp, is_array_api_compliant):
         return U, S, Vt, X, x_is_centered, xp
 
     def _fit_truncated(self, X, n_components, xp):
-        """Fit the model by computing truncated SVD (by ARPACK or randomized)
+        """Fit the model by computing truncated SVD (by ARPACK, LOBPCG or randomized)
         on X.
         """
         n_samples, n_features = X.shape
@@ -722,7 +730,9 @@ def _fit_truncated(self, X, n_components, xp):
                 "svd_solver='%s'"
                 % (n_components, min(n_samples, n_features), svd_solver)
             )
-        elif svd_solver == "arpack" and n_components == min(n_samples, n_features):
+        elif svd_solver in {"arpack", "lobpcg"} and n_components == min(
+            n_samples, n_features
+        ):
             raise ValueError(
                 "n_components=%r must be strictly less than "
                 "min(n_samples, n_features)=%r with "
@@ -745,9 +755,11 @@ def _fit_truncated(self, X, n_components, xp):
             X_centered -= self.mean_
             x_is_centered = not self.copy
 
-        if svd_solver == "arpack":
+        if svd_solver in {"arpack", "lobpcg"}:
             v0 = _init_arpack_v0(min(X.shape), random_state)
-            U, S, Vt = svds(X_centered, k=n_components, tol=self.tol, v0=v0)
+            U, S, Vt = svds(
+                X_centered, k=n_components, tol=self.tol, v0=v0, solver=svd_solver
+            )
             # svds doesn't abide by scipy.linalg.svd/randomized_svd
             # conventions, so reverse its outputs.
             S = S[::-1]
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 52f769bfb9001..6d64ad3cb9701 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -25,7 +25,7 @@
 from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS
 
 iris = datasets.load_iris()
-PCA_SOLVERS = ["full", "covariance_eigh", "arpack", "randomized", "auto"]
+PCA_SOLVERS = ["full", "covariance_eigh", "arpack", "lobpcg", "randomized", "auto"]
 
 # `SPARSE_M` and `SPARSE_N` could be larger, but be aware:
 # * SciPy's generation of random sparse matrix can be costly
@@ -73,7 +73,7 @@ def test_pca(svd_solver, n_components):
 @pytest.mark.parametrize("density", [0.01, 0.1, 0.30])
 @pytest.mark.parametrize("n_components", [1, 2, 10])
 @pytest.mark.parametrize("sparse_container", CSR_CONTAINERS + CSC_CONTAINERS)
-@pytest.mark.parametrize("svd_solver", ["arpack", "covariance_eigh"])
+@pytest.mark.parametrize("svd_solver", ["arpack", "lobpcg", "covariance_eigh"])
 @pytest.mark.parametrize("scale", [1, 10, 100])
 def test_pca_sparse(
     global_random_seed, svd_solver, sparse_container, n_components, density, scale
@@ -429,7 +429,7 @@ def test_pca_explained_variance_empirical(X, svd_solver):
     assert_allclose(pca.explained_variance_, expected_result, rtol=5e-3)
 
 
-@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"])
+@pytest.mark.parametrize("svd_solver", ["arpack", "lobpcg", "randomized"])
 def test_pca_singular_values_consistency(svd_solver):
     rng = np.random.RandomState(0)
     n_samples, n_features = 100, 80
@@ -564,6 +564,7 @@ def test_pca_validation(svd_solver, data, n_components, err_msg):
     [
         ("full", min(iris.data.shape)),
         ("arpack", min(iris.data.shape) - 1),
+        ("lobpcg", min(iris.data.shape) - 1),
         ("randomized", min(iris.data.shape)),
     ],
 )
@@ -719,7 +720,7 @@ def test_pca_sanity_noise_variance(svd_solver):
     assert np.all((pca.explained_variance_ - pca.noise_variance_) >= 0)
 
 
-@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"])
+@pytest.mark.parametrize("svd_solver", ["arpack", "lobpcg", "randomized"])
 def test_pca_score_consistency_solvers(svd_solver):
     # Check the consistency of score between solvers
     X, _ = datasets.load_digits(return_X_y=True)
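For reviewers, a minimal usage sketch (not part of the patch): it assumes the diff above is applied to a scikit-learn development build. The matrix shape, density, and tolerance are illustrative, and the equivalence check mirrors what the parametrized tests exercise.

```python
import numpy as np
from scipy import sparse

from sklearn.decomposition import PCA

rng = np.random.RandomState(0)

# Random sparse input, similar in spirit to test_pca_sparse above.
X = sparse.random(300, 80, density=0.1, format="csr", random_state=rng)

# With this patch, "lobpcg" is routed through
# scipy.sparse.linalg.svds(..., solver="lobpcg"), exactly like "arpack",
# so n_components must stay strictly below min(X.shape).
pca_lobpcg = PCA(n_components=5, svd_solver="lobpcg", random_state=0).fit(X)
pca_arpack = PCA(n_components=5, svd_solver="arpack", random_state=0).fit(X)

# Both truncated solvers should agree on the leading singular values,
# up to the (illustrative) tolerance below.
np.testing.assert_allclose(
    pca_lobpcg.singular_values_, pca_arpack.singular_values_, rtol=1e-3
)
```

Since both solvers share the same `svds` code path, the `v0` seeding and the `0 < n_components < min(X.shape)` restriction apply to "lobpcg" unchanged.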