[MRG+2] Limiting n_components by both n_features and n_samples instead of just n_features (Recreated PR) #8742


Merged: 26 commits, Sep 9, 2017
967d792
fixed issue 8484
Mar 1, 2017
8ffff6f
dealt with indentation issues flagged by flake8
Mar 6, 2017
cbdffc4
code to handle n_components==None with arpack was missing
Mar 8, 2017
279184c
added non-regression tests for my previous changes in pca
Mar 8, 2017
4d093ab
minor change: reverted iterator name in test_pca
Apr 4, 2017
0f38101
changed AssertRaises to regex variant in test, and minor writing chan…
Apr 7, 2017
4ee548c
corrected pca.py fix
May 27, 2017
b72ffe4
improved test_pca_validation()'s scope
May 27, 2017
fcc0139
Merge branch 'master' into pca_8484_2
wallygauze Aug 15, 2017
110cd18
added an entry to whats_new.rst
Aug 15, 2017
c9049f9
add requested code for axis-invariance check
wallygauze Aug 15, 2017
c89ef02
Clarified doc change
wallygauze Aug 15, 2017
b91fa3b
rephrased whats_new entry
Aug 15, 2017
d449868
fixed flake8
Aug 15, 2017
07c1e1d
refactored test code
Aug 15, 2017
724e612
corrected whats_new entry typo
Aug 18, 2017
2251ae5
arpack case was missing from test; improved overall test
Aug 18, 2017
f9af4d6
flake8 corrections
Aug 18, 2017
fe7047f
arpack case was still missing + fixed my test bug + more refactoring
Aug 19, 2017
1e7cd10
corrected typo
Aug 19, 2017
a528512
allow type long?
Aug 20, 2017
f25bd9c
accidentally left useless piece of code
Aug 22, 2017
9408366
Merge branch 'master' into pca_8484_2
wallygauze Sep 6, 2017
bd1f151
reverted changes in doc/whats_new.rst
Sep 8, 2017
e3ecd12
added entry in whats_new/v0.20.rst
Sep 8, 2017
a7f3020
Merge branch 'master' into pca_8484_2
wallygauze Sep 8, 2017
7 changes: 6 additions & 1 deletion doc/whats_new/v0.20.rst
@@ -70,7 +70,7 @@ Bug fixes

Decomposition, manifold learning and clustering

- Fix for uninformative error in :class:`decomposition.incremental_pca`:
- Fix for uninformative error in :class:`decomposition.IncrementalPCA`:
now an error is raised if the number of components is larger than the
chosen batch size. The ``n_components=None`` case was adapted accordingly.
:issue:`6452`. By :user:`Wally Gauze <wallygauze>`.
@@ -87,6 +87,11 @@ Decomposition, manifold learning and clustering
where all samples had equal similarity.
:issue:`9612`. By :user:`Jonatan Samoocha <jsamoocha>`.

- In :class:`decomposition.PCA` selecting a n_components parameter greater than
the number of samples now raises an error.
Similarly, the ``n_components=None`` case now selects the minimum of
n_samples and n_features. :issue:`8484`. By :user:`Wally Gauze <wallygauze>`.

API changes summary
-------------------

47 changes: 31 additions & 16 deletions sklearn/decomposition/pca.py
@@ -134,8 +134,12 @@ class PCA(_BasePCA):
to guess the dimension
if ``0 < n_components < 1`` and svd_solver == 'full', select the number
of components such that the amount of variance that needs to be
explained is greater than the percentage specified by n_components
n_components cannot be equal to n_features for svd_solver == 'arpack'.
explained is greater than the percentage specified by n_components.
If svd_solver == 'arpack', the number of components must be strictly
less than the minimum of n_features and n_samples.
Hence, the None case results in:

n_components == min(n_samples, n_features) - 1

copy : bool (default True)
If False, data passed to fit are overwritten and running
@@ -166,7 +170,7 @@ class PCA(_BasePCA):
arpack :
run SVD truncated to n_components calling ARPACK solver via
`scipy.sparse.linalg.svds`. It requires strictly
0 < n_components < X.shape[1]
0 < n_components < min(X.shape)
randomized :
run randomized SVD by the method of Halko et al.

@@ -210,7 +214,7 @@ class PCA(_BasePCA):
Percentage of variance explained by each of the selected components.

If ``n_components`` is not set then all components are stored and the
sum of explained variances is equal to 1.0.
sum of the ratios is equal to 1.0.

singular_values_ : array, shape (n_components,)
The singular values corresponding to each of the selected components.
@@ -226,7 +230,8 @@
The estimated number of components. When n_components is set
to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this
number is estimated from input data. Otherwise it equals the parameter
n_components, or n_features if n_components is None.
n_components, or the lesser value of n_features and n_samples
if n_components is None.

noise_variance_ : float
The estimated noise covariance following the Probabilistic PCA model
@@ -371,7 +376,10 @@ def _fit(self, X):

# Handle n_components==None
if self.n_components is None:
n_components = X.shape[1]
if self.svd_solver != 'arpack':
n_components = min(X.shape)
else:
n_components = min(X.shape) - 1
Member:

What are the exact conditions on the arpack solver?

Contributor Author:

the solver uses scipy.sparse.linalg.svds, and according to the docs
the k parameter (n_components) must be such that 1 <= k < min(A.shape).

Member:

hm then this seems fine. alright.
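The constraint quoted in the exchange above can be checked directly against scipy.sparse.linalg.svds. A minimal sketch, not part of the PR, with an arbitrarily chosen array shape:

```python
# Demonstrates the ARPACK-backed constraint discussed above:
# scipy.sparse.linalg.svds requires 1 <= k < min(A.shape).
import numpy as np
from scipy.sparse.linalg import svds

A = np.random.RandomState(0).rand(5, 3)  # min(A.shape) == 3

U, S, Vt = svds(A, k=2)   # 1 <= 2 < 3: accepted
print(S.shape)            # (2,)

try:
    svds(A, k=3)          # k == min(A.shape): rejected
except ValueError:
    print("k=3 rejected")
```

This is exactly why the None case for arpack lands on min(X.shape) - 1 rather than min(X.shape).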

else:
n_components = self.n_components

@@ -404,10 +412,11 @@ def _fit_full(self, X, n_components):
if n_samples < n_features:
raise ValueError("n_components='mle' is only supported "
"if n_samples >= n_features")
elif not 0 <= n_components <= n_features:
elif not 0 <= n_components <= min(n_samples, n_features):
raise ValueError("n_components=%r must be between 0 and "
"n_features=%r with svd_solver='full'"
% (n_components, n_features))
"min(n_samples, n_features)=%r with "
"svd_solver='full'"
% (n_components, min(n_samples, n_features)))

# Center data
self.mean_ = np.mean(X, axis=0)
@@ -462,14 +471,19 @@ def _fit_truncated(self, X, n_components, svd_solver):
raise ValueError("n_components=%r cannot be a string "
"with svd_solver='%s'"
% (n_components, svd_solver))
elif not 1 <= n_components <= n_features:
elif not 1 <= n_components <= min(n_samples, n_features):
raise ValueError("n_components=%r must be between 1 and "
"n_features=%r with svd_solver='%s'"
% (n_components, n_features, svd_solver))
elif svd_solver == 'arpack' and n_components == n_features:
raise ValueError("n_components=%r must be stricly less than "
"n_features=%r with svd_solver='%s'"
% (n_components, n_features, svd_solver))
"min(n_samples, n_features)=%r with "
"svd_solver='%s'"
% (n_components, min(n_samples, n_features),
svd_solver))
elif svd_solver == 'arpack' and n_components == min(n_samples,
n_features):
raise ValueError("n_components=%r must be strictly less than "
"min(n_samples, n_features)=%r with "
"svd_solver='%s'"
% (n_components, min(n_samples, n_features),
svd_solver))

random_state = check_random_state(self.random_state)

@@ -504,6 +518,7 @@ def _fit_truncated(self, X, n_components, svd_solver):
self.explained_variance_ratio_ = \
self.explained_variance_ / total_var.sum()
self.singular_values_ = S.copy() # Store the singular values.

if self.n_components_ < min(n_features, n_samples):
self.noise_variance_ = (total_var.sum() -
self.explained_variance_.sum())
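Taken together, the pca.py changes above give the following behaviour. A quick sketch, assuming scikit-learn >= 0.20 (the release this PR targets) and an arbitrarily chosen data shape:

```python
# n_samples < n_features here, so min(X.shape) == n_samples == 5.
import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).rand(5, 20)

# n_components=None now selects min(n_samples, n_features) ...
pca = PCA(n_components=None, svd_solver='full').fit(X)
print(pca.n_components_)   # 5

# ... except with arpack, which needs strictly fewer components.
pca = PCA(n_components=None, svd_solver='arpack').fit(X)
print(pca.n_components_)   # 4

# Requesting more components than min(n_samples, n_features) now fails.
try:
    PCA(n_components=10, svd_solver='full').fit(X)
except ValueError as e:
    print(e)
```

Before this PR, the first call would have tried to use n_components == n_features == 20 and the third would have been accepted, despite only 5 samples being available.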
56 changes: 52 additions & 4 deletions sklearn/decomposition/tests/test_pca.py
@@ -8,6 +8,7 @@
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.testing import assert_no_warnings
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import ignore_warnings
@@ -349,11 +350,58 @@ def test_pca_inverse():


def test_pca_validation():
X = [[0, 1], [1, 0]]
# Ensures that solver-specific extreme inputs for the n_components
# parameter raise errors
X = np.array([[0, 1, 0], [1, 0, 0]])
smallest_d = 2 # The smallest dimension
lower_limit = {'randomized': 1, 'arpack': 1, 'full': 0, 'auto': 0}

for solver in solver_list:
for n_components in [-1, 3]:
assert_raises(ValueError,
PCA(n_components, svd_solver=solver).fit, X)
# We conduct the same test on X.T so that it is invariant to axis.
for data in [X, X.T]:
for n_components in [-1, 3]:

if solver == 'auto':
solver_reported = 'full'
else:
solver_reported = solver

assert_raises_regex(ValueError,
"n_components={}L? must be between "
"{}L? and min\(n_samples, n_features\)="
"{}L? with svd_solver=\'{}\'"
.format(n_components,
lower_limit[solver],
smallest_d,
solver_reported),
PCA(n_components,
svd_solver=solver).fit, data)
if solver == 'arpack':

n_components = smallest_d

assert_raises_regex(ValueError,
"n_components={}L? must be "
"strictly less than "
"min\(n_samples, n_features\)={}L?"
" with svd_solver=\'arpack\'"
.format(n_components, smallest_d),
PCA(n_components, svd_solver=solver)
.fit, data)


def test_n_components_none():
# Ensures that n_components == None is handled correctly
X = iris.data
# We conduct the same test on X.T so that it is invariant to axis.
for data in [X, X.T]:
for solver in solver_list:
pca = PCA(svd_solver=solver)
pca.fit(data)
if solver == 'arpack':
assert_equal(pca.n_components_, min(data.shape) - 1)
else:
assert_equal(pca.n_components_, min(data.shape))


def test_randomized_pca_check_projection():
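A note on the "L?" occurrences in the test regexes above: they are not typos. On Python 2, n_components could be of type long (see the "allow type long?" commit), and the repr of a long carries an "L" suffix that ends up in the error message. A minimal sketch of how the pattern tolerates both reprs:

```python
# The optional "L" in the pattern matches both the Python 3 int repr ("3")
# and the Python 2 long repr ("3L") inside the formatted error message.
import re

pattern = (r"n_components=3L? must be between 1 and "
           r"min\(n_samples, n_features\)=2")
msg_int = "n_components=3 must be between 1 and min(n_samples, n_features)=2"
msg_long = "n_components=3L must be between 1 and min(n_samples, n_features)=2"

print(bool(re.search(pattern, msg_int)))   # True
print(bool(re.search(pattern, msg_long)))  # True
```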