diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index e730b546049f7..4f5e13e7860a5 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -70,7 +70,7 @@ Bug fixes
 
 Decomposition, manifold learning and clustering
 
-- Fix for uninformative error in :class:`decomposition.incremental_pca`:
+- Fix for uninformative error in :class:`decomposition.IncrementalPCA`:
   now an error is raised if the number of components is larger than the
   chosen batch size. The ``n_components=None`` case was adapted accordingly.
   :issue:`6452`. By :user:`Wally Gauze <wallygauze>`.
@@ -87,6 +87,11 @@ Decomposition, manifold learning and clustering
   where all samples had equal similarity.
   :issue:`9612`. By :user:`Jonatan Samoocha <jsamoocha>`.
 
+- In :class:`decomposition.PCA` selecting an ``n_components`` parameter
+  greater than the number of samples now raises an error.
+  Similarly, the ``n_components=None`` case now selects the minimum of
+  n_samples and n_features. :issue:`8484`. By :user:`Wally Gauze <wallygauze>`.
+
 API changes summary
 -------------------
 
diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py
index 2ba3d37f8b81d..16b8619ac9019 100644
--- a/sklearn/decomposition/pca.py
+++ b/sklearn/decomposition/pca.py
@@ -134,8 +134,12 @@ class PCA(_BasePCA):
         to guess the dimension
         if ``0 < n_components < 1`` and svd_solver == 'full', select the number
         of components such that the amount of variance that needs to be
-        explained is greater than the percentage specified by n_components
-        n_components cannot be equal to n_features for svd_solver == 'arpack'.
+        explained is greater than the percentage specified by n_components.
+        If svd_solver == 'arpack', the number of components must be strictly
+        less than the minimum of n_features and n_samples.
+        Hence, the None case results in::
+
+            n_components == min(n_samples, n_features) - 1
 
     copy : bool (default True)
         If False, data passed to fit are overwritten and running
@@ -166,7 +170,7 @@ class PCA(_BasePCA):
         arpack :
             run SVD truncated to n_components calling ARPACK solver via
             `scipy.sparse.linalg.svds`. It requires strictly
-            0 < n_components < X.shape[1]
+            0 < n_components < min(X.shape)
         randomized :
             run randomized SVD by the method of Halko et al.
 
@@ -210,7 +214,7 @@ class PCA(_BasePCA):
         Percentage of variance explained by each of the selected components.
 
         If ``n_components`` is not set then all components are stored and the
-        sum of explained variances is equal to 1.0.
+        sum of the ratios is equal to 1.0.
 
     singular_values_ : array, shape (n_components,)
         The singular values corresponding to each of the selected components.
@@ -226,7 +230,8 @@ class PCA(_BasePCA):
         The estimated number of components. When n_components is set
         to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this
         number is estimated from input data. Otherwise it equals the parameter
-        n_components, or n_features if n_components is None.
+        n_components, or the lesser value of n_features and n_samples
+        if n_components is None.
 
     noise_variance_ : float
         The estimated noise covariance following the Probabilistic PCA model
@@ -371,7 +376,10 @@ def _fit(self, X):
 
         # Handle n_components==None
         if self.n_components is None:
-            n_components = X.shape[1]
+            if self.svd_solver != 'arpack':
+                n_components = min(X.shape)
+            else:
+                n_components = min(X.shape) - 1
         else:
             n_components = self.n_components
 
@@ -404,10 +412,11 @@ def _fit_full(self, X, n_components):
             if n_samples < n_features:
                 raise ValueError("n_components='mle' is only supported "
                                  "if n_samples >= n_features")
-        elif not 0 <= n_components <= n_features:
+        elif not 0 <= n_components <= min(n_samples, n_features):
             raise ValueError("n_components=%r must be between 0 and "
-                             "n_features=%r with svd_solver='full'"
-                             % (n_components, n_features))
+                             "min(n_samples, n_features)=%r with "
+                             "svd_solver='full'"
+                             % (n_components, min(n_samples, n_features)))
 
         # Center data
         self.mean_ = np.mean(X, axis=0)
@@ -462,14 +471,19 @@ def _fit_truncated(self, X, n_components, svd_solver):
             raise ValueError("n_components=%r cannot be a string "
                              "with svd_solver='%s'"
                              % (n_components, svd_solver))
-        elif not 1 <= n_components <= n_features:
+        elif not 1 <= n_components <= min(n_samples, n_features):
             raise ValueError("n_components=%r must be between 1 and "
-                             "n_features=%r with svd_solver='%s'"
-                             % (n_components, n_features, svd_solver))
-        elif svd_solver == 'arpack' and n_components == n_features:
-            raise ValueError("n_components=%r must be stricly less than "
-                             "n_features=%r with svd_solver='%s'"
-                             % (n_components, n_features, svd_solver))
+                             "min(n_samples, n_features)=%r with "
+                             "svd_solver='%s'"
+                             % (n_components, min(n_samples, n_features),
+                                svd_solver))
+        elif svd_solver == 'arpack' and n_components == min(n_samples,
+                                                            n_features):
+            raise ValueError("n_components=%r must be strictly less than "
+                             "min(n_samples, n_features)=%r with "
+                             "svd_solver='%s'"
+                             % (n_components, min(n_samples, n_features),
+                                svd_solver))
 
         random_state = check_random_state(self.random_state)
 
@@ -504,6 +518,7 @@ def _fit_truncated(self, X, n_components, svd_solver):
         self.explained_variance_ratio_ = \
             self.explained_variance_ / total_var.sum()
         self.singular_values_ = S.copy()  # Store the singular values.
+
         if self.n_components_ < min(n_features, n_samples):
             self.noise_variance_ = (total_var.sum() -
                                     self.explained_variance_.sum())
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 6795013b0790a..aa67189407296 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -8,6 +8,7 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_raises_regex
 from sklearn.utils.testing import assert_no_warnings
 from sklearn.utils.testing import assert_warns_message
 from sklearn.utils.testing import ignore_warnings
@@ -349,11 +350,58 @@ def test_pca_inverse():
 
 
 def test_pca_validation():
-    X = [[0, 1], [1, 0]]
+    # Ensures that solver-specific extreme inputs for the n_components
+    # parameter raise errors
+    X = np.array([[0, 1, 0], [1, 0, 0]])
+    smallest_d = 2  # The smallest dimension
+    lower_limit = {'randomized': 1, 'arpack': 1, 'full': 0, 'auto': 0}
+
     for solver in solver_list:
-        for n_components in [-1, 3]:
-            assert_raises(ValueError,
-                          PCA(n_components, svd_solver=solver).fit, X)
+        # We conduct the same test on X.T so that it is invariant to axis.
+        for data in [X, X.T]:
+            for n_components in [-1, 3]:
+
+                if solver == 'auto':
+                    solver_reported = 'full'
+                else:
+                    solver_reported = solver
+
+                assert_raises_regex(ValueError,
+                                    "n_components={}L? must be between "
+                                    "{}L? and min\(n_samples, n_features\)="
+                                    "{}L? with svd_solver=\'{}\'"
+                                    .format(n_components,
+                                            lower_limit[solver],
+                                            smallest_d,
+                                            solver_reported),
+                                    PCA(n_components,
+                                        svd_solver=solver).fit, data)
+            if solver == 'arpack':
+
+                n_components = smallest_d
+
+                assert_raises_regex(ValueError,
+                                    "n_components={}L? must be "
+                                    "strictly less than "
+                                    "min\(n_samples, n_features\)={}L?"
+                                    " with svd_solver=\'arpack\'"
+                                    .format(n_components, smallest_d),
+                                    PCA(n_components, svd_solver=solver)
+                                    .fit, data)
+
+
+def test_n_components_none():
+    # Ensures that n_components == None is handled correctly
+    X = iris.data
+    # We conduct the same test on X.T so that it is invariant to axis.
+    for data in [X, X.T]:
+        for solver in solver_list:
+            pca = PCA(svd_solver=solver)
+            pca.fit(data)
+            if solver == 'arpack':
+                assert_equal(pca.n_components_, min(data.shape) - 1)
+            else:
+                assert_equal(pca.n_components_, min(data.shape))
 
 
 def test_randomized_pca_check_projection():
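
For reference, a minimal sanity check of the behaviour this patch introduces, runnable against the patched code. It is not part of the patch; the random wide matrix, its shape, and the expected component counts are illustrative assumptions derived from the new rules.

# Illustrative only: exercises the patched n_components handling on a
# hypothetical wide matrix (n_samples < n_features), not data from the patch.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.rand(10, 50)  # n_samples=10, n_features=50, so min(X.shape) == 10

# n_components=None now selects min(n_samples, n_features)...
assert PCA(svd_solver='full').fit(X).n_components_ == 10

# ...except for 'arpack', which must stay strictly below that minimum.
assert PCA(svd_solver='arpack').fit(X).n_components_ == 9

# Requesting more components than min(n_samples, n_features) now raises an
# informative ValueError; previously only n_features (here 50) was checked.
try:
    PCA(n_components=20, svd_solver='full').fit(X)
except ValueError as exc:
    print(exc)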