From 967d792eab04173ac1a1b7743f215465f1cdb369 Mon Sep 17 00:00:00 2001 From: Wally Date: Wed, 1 Mar 2017 20:57:37 +0000 Subject: [PATCH 01/23] fixed issue 8484 --- sklearn/decomposition/pca.py | 42 ++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index eb11d9b032106..e8cf5f8737c93 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -134,8 +134,11 @@ class PCA(_BasePCA): to guess the dimension if ``0 < n_components < 1`` and svd_solver == 'full', select the number of components such that the amount of variance that needs to be - explained is greater than the percentage specified by n_components - n_components cannot be equal to n_features for svd_solver == 'arpack'. + explained is greater than the percentage specified by n_components. + if svd_solver == 'arpack', the number of components must be strictly + less than the minimum of n_features and n_samples: + + n_components == min(n_samples, n_features) copy : bool (default True) If False, data passed to fit are overwritten and running @@ -166,7 +169,7 @@ class PCA(_BasePCA): arpack : run SVD truncated to n_components calling ARPACK solver via `scipy.sparse.linalg.svds`. It requires strictly - 0 < n_components < X.shape[1] + 0 < n_components < min(X.shape) randomized : run randomized SVD by the method of Halko et al. @@ -207,7 +210,7 @@ class PCA(_BasePCA): Percentage of variance explained by each of the selected components. If ``n_components`` is not set then all components are stored and the - sum of explained variances is equal to 1.0. + sum of the ratios is equal to 1.0. singular_values_ : array, shape (n_components,) The singular values corresponding to each of the selected components. @@ -223,7 +226,8 @@ class PCA(_BasePCA): The estimated number of components. 
When n_components is set to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this number is estimated from input data. Otherwise it equals the parameter - n_components, or n_features if n_components is None. + n_components, or the lesser value of n_features and n_samples + if n_components is None. noise_variance_ : float The estimated noise covariance following the Probabilistic PCA model @@ -367,7 +371,7 @@ def _fit(self, X): # Handle n_components==None if self.n_components is None: - n_components = X.shape[1] + n_components = min(X.shape) else: n_components = self.n_components @@ -400,10 +404,11 @@ def _fit_full(self, X, n_components): if n_samples < n_features: raise ValueError("n_components='mle' is only supported " "if n_samples >= n_features") - elif not 0 <= n_components <= n_features: + elif not 0 <= n_components <= min(n_samples, n_features): raise ValueError("n_components=%r must be between 0 and " - "n_features=%r with svd_solver='full'" - % (n_components, n_features)) + "min(n_samples, n_features)=%r with " + "svd_solver='full'" + % (n_components, min(n_samples, n_features))) # Center data self.mean_ = np.mean(X, axis=0) @@ -458,14 +463,19 @@ def _fit_truncated(self, X, n_components, svd_solver): raise ValueError("n_components=%r cannot be a string " "with svd_solver='%s'" % (n_components, svd_solver)) - elif not 1 <= n_components <= n_features: + elif not 1 <= n_components <= min(n_samples, n_features): raise ValueError("n_components=%r must be between 1 and " - "n_features=%r with svd_solver='%s'" - % (n_components, n_features, svd_solver)) - elif svd_solver == 'arpack' and n_components == n_features: + "min(n_samples, n_features)=%r with " + "svd_solver='%s'" + % (n_components, min(n_samples, n_features), + svd_solver)) + elif svd_solver == 'arpack' and n_components == min(n_samples, + n_features): raise ValueError("n_components=%r must be stricly less than " - "n_features=%r with svd_solver='%s'" - % (n_components, n_features, 
svd_solver)) + "min(n_samples, n_features)=%r with " + "svd_solver='%s'" + % (n_components, min(n_samples, n_features), + svd_solver)) random_state = check_random_state(self.random_state) @@ -500,7 +510,7 @@ def _fit_truncated(self, X, n_components, svd_solver): self.explained_variance_ratio_ = \ self.explained_variance_ / total_var.sum() self.singular_values_ = S.copy() # Store the singular values. - if self.n_components_ < n_features: + if self.n_components_ < min(n_samples, n_features): self.noise_variance_ = (total_var.sum() - self.explained_variance_.sum()) else: From 8ffff6fe0048d3b1ba0dc7b6c90aaf646dc94051 Mon Sep 17 00:00:00 2001 From: Wally Date: Mon, 6 Mar 2017 15:20:54 +0000 Subject: [PATCH 02/23] dealt with indentation issues flagged by flake8 --- sklearn/decomposition/pca.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index e8cf5f8737c93..0a672bf02e3ba 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -468,14 +468,14 @@ def _fit_truncated(self, X, n_components, svd_solver): "min(n_samples, n_features)=%r with " "svd_solver='%s'" % (n_components, min(n_samples, n_features), - svd_solver)) + svd_solver)) elif svd_solver == 'arpack' and n_components == min(n_samples, - n_features): + n_features): raise ValueError("n_components=%r must be stricly less than " "min(n_samples, n_features)=%r with " "svd_solver='%s'" % (n_components, min(n_samples, n_features), - svd_solver)) + svd_solver)) random_state = check_random_state(self.random_state) From cbdffc4927f11755d2b01e20d3838ee102130d83 Mon Sep 17 00:00:00 2001 From: Wally Date: Wed, 8 Mar 2017 07:35:33 +0000 Subject: [PATCH 03/23] code to handle n_components==None with arpack was missing --- sklearn/decomposition/pca.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 0a672bf02e3ba..0aee4c721a6e1 100644 --- 
a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -371,7 +371,10 @@ def _fit(self, X): # Handle n_components==None if self.n_components is None: - n_components = min(X.shape) + if self.svd_solver is not 'arpack': + n_components = min(X.shape) + else: + n_components = min(X.shape) - 1 else: n_components = self.n_components From 279184c47e414bec05bf4a0e98342088f84d4cef Mon Sep 17 00:00:00 2001 From: Wally Date: Wed, 8 Mar 2017 07:37:05 +0000 Subject: [PATCH 04/23] added non-regression tests for my previous changes in pca --- sklearn/decomposition/tests/test_pca.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index ea321089d719c..44befd421b2a4 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -340,11 +340,26 @@ def test_pca_inverse(): def test_pca_validation(): - X = [[0, 1], [1, 0]] + # Ensures that extreme inputs for n_components common to all solvers + # (less than 0 or more than the lesser dimension of the input + # matrix X) raise errors. 
+ X = np.array([[0, 1, 0], [1, 0, 0]]) for solver in solver_list: - for n_components in [-1, 3]: + for n_comp in [-1, 3]: assert_raises(ValueError, - PCA(n_components, svd_solver=solver).fit, X) + PCA(n_components=n_comp, svd_solver=solver).fit, X) + + +def test_n_components_none(): + # Ensures that n_components == None is handled correctly + X = iris.data + for solver in solver_list: + pca = PCA(svd_solver=solver) + pca.fit(X) + if solver == 'arpack': + assert_equal(pca.n_components_, min(X.shape)-1) + else: + assert_equal(pca.n_components_, min(X.shape)) def test_randomized_pca_check_projection(): From 4d093ab3b89284ac1b399e46772dc838b28cadb7 Mon Sep 17 00:00:00 2001 From: Wally Date: Tue, 4 Apr 2017 21:15:32 +0100 Subject: [PATCH 05/23] minor change: reverted iterator name in test_pca --- sklearn/decomposition/tests/test_pca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 44befd421b2a4..d88ce339baa1c 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -345,9 +345,9 @@ def test_pca_validation(): # matrix X) raise errors. 
X = np.array([[0, 1, 0], [1, 0, 0]]) for solver in solver_list: - for n_comp in [-1, 3]: + for n_components in [-1, 3]: assert_raises(ValueError, - PCA(n_components=n_comp, svd_solver=solver).fit, X) + PCA(n_components, svd_solver=solver).fit, X) def test_n_components_none(): From 0f3810164533320a9d83811385f39659d53a4a38 Mon Sep 17 00:00:00 2001 From: Wally Date: Fri, 7 Apr 2017 14:46:25 +0100 Subject: [PATCH 06/23] changed AssertRaises to regex variant in test, and minor writing change in docs --- sklearn/decomposition/pca.py | 2 +- sklearn/decomposition/tests/test_pca.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 0aee4c721a6e1..30e77ceff81de 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -135,7 +135,7 @@ class PCA(_BasePCA): if ``0 < n_components < 1`` and svd_solver == 'full', select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. 
- if svd_solver == 'arpack', the number of components must be strictly + If svd_solver == 'arpack', the number of components must be strictly less than the minimum of n_features and n_samples: n_components == min(n_samples, n_features) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index d88ce339baa1c..b7f8b942aaace 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -8,6 +8,7 @@ from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_no_warnings from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import ignore_warnings @@ -346,8 +347,11 @@ def test_pca_validation(): X = np.array([[0, 1, 0], [1, 0, 0]]) for solver in solver_list: for n_components in [-1, 3]: - assert_raises(ValueError, - PCA(n_components, svd_solver=solver).fit, X) + assert_raises_regex(ValueError, + "n_components\=.* must be between .* and min\(" + "n_samples, n_features\)\=.* with svd_solver" + "\=\'(?:full|arpack|randomized|auto)\'$", + PCA(n_components, svd_solver=solver).fit, X) def test_n_components_none(): From 4ee548c75e2f0384961144f0bcc989853aab6257 Mon Sep 17 00:00:00 2001 From: Wally Date: Sat, 27 May 2017 21:05:09 +0100 Subject: [PATCH 07/23] corrected pca.py fix --- sklearn/decomposition/pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 30e77ceff81de..db0af4e855a2b 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -371,7 +371,7 @@ def _fit(self, X): # Handle n_components==None if self.n_components is None: - if self.svd_solver is not 'arpack': + if self.svd_solver != 'arpack': n_components = min(X.shape) else: n_components = min(X.shape) - 1 From 
b72ffe41a5809692be1d423bec33288be08e0461 Mon Sep 17 00:00:00 2001 From: Wally Date: Sat, 27 May 2017 21:07:19 +0100 Subject: [PATCH 08/23] improved test_pca_validation()'s scope --- sklearn/decomposition/tests/test_pca.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index b7f8b942aaace..66a2d1a2fb1e8 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -352,6 +352,12 @@ def test_pca_validation(): "n_samples, n_features\)\=.* with svd_solver" "\=\'(?:full|arpack|randomized|auto)\'$", PCA(n_components, svd_solver=solver).fit, X) + # We conduct the same test on X.T so that it is invariant to axis. + assert_raises_regex(ValueError, + "n_components\=.* must be between .* and min\(" + "n_samples, n_features\)\=.* with svd_solver" + "\=\'(?:full|arpack|randomized|auto)\'$", + PCA(n_components, svd_solver=solver).fit, X.T) def test_n_components_none(): @@ -361,7 +367,7 @@ def test_n_components_none(): pca = PCA(svd_solver=solver) pca.fit(X) if solver == 'arpack': - assert_equal(pca.n_components_, min(X.shape)-1) + assert_equal(pca.n_components_, min(X.shape) - 1) else: assert_equal(pca.n_components_, min(X.shape)) From 110cd1860c26e7190f86bfcc7945a76106896129 Mon Sep 17 00:00:00 2001 From: Wally Date: Tue, 15 Aug 2017 16:23:18 +0100 Subject: [PATCH 09/23] added an entry to whats_new.rst --- doc/whats_new.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 258dfe19b33cb..c10b4f86111e2 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -51,6 +51,11 @@ Decomposition, manifold learning and clustering division on Python 2 versions. :issue:`9492` by :user:`James Bourbeau `. +- In :class:`decomposition.pca` selecting a n_components parameter greater than + the number of samples failed to raise an error. 
+ Similarly, the ``n_components=None`` case now selects the minimum of + n_samples and n_features. :issue:`6452`. By :user:`Wally Gauze `. + Version 0.19 ============ From c9049f9a2e942aa049914ec81ce931bdcb6bbb98 Mon Sep 17 00:00:00 2001 From: wallygauze Date: Tue, 15 Aug 2017 20:42:19 +0100 Subject: [PATCH 10/23] add requested code for axis-invariance check --- sklearn/decomposition/tests/test_pca.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 6b0836f9e15aa..1c52e2cc1edce 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -379,6 +379,16 @@ def test_n_components_none(): assert_equal(pca.n_components_, min(X.shape) - 1) else: assert_equal(pca.n_components_, min(X.shape)) + + # We conduct the same test on X.T so that it is invariant to axis. + X_2 = X.T + for solver in solver_list: + pca = PCA(svd_solver=solver) + pca.fit(X_2) + if solver == 'arpack': + assert_equal(pca.n_components_, min(X_2.shape) - 1) + else: + assert_equal(pca.n_components_, min(X_2.shape)) def test_randomized_pca_check_projection(): From c89ef02e727307f331881e0334bc674902cf7e8b Mon Sep 17 00:00:00 2001 From: wallygauze Date: Tue, 15 Aug 2017 20:48:57 +0100 Subject: [PATCH 11/23] Clarified doc change --- sklearn/decomposition/pca.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 1b043ff2b3f2b..337396212bb87 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -136,9 +136,10 @@ class PCA(_BasePCA): of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. 
If svd_solver == 'arpack', the number of components must be strictly - less than the minimum of n_features and n_samples: - - n_components == min(n_samples, n_features) + less than the minimum of n_features and n_samples. + Hence, the None case results in: + + n_components == min(n_samples, n_features) - 1 copy : bool (default True) If False, data passed to fit are overwritten and running From b91fa3b61a3817bcc062f4791528d93478d0b763 Mon Sep 17 00:00:00 2001 From: Wally Date: Tue, 15 Aug 2017 22:42:09 +0100 Subject: [PATCH 12/23] rephrased whats_new entry --- doc/whats_new.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index c10b4f86111e2..4e7305888bc99 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -52,9 +52,9 @@ Decomposition, manifold learning and clustering :user:`James Bourbeau `. - In :class:`decomposition.pca` selecting a n_components parameter greater than - the number of samples failed to raise an error. + the number of samples now raises an error. Similarly, the ``n_components=None`` case now selects the minimum of - n_samples and n_features. :issue:`6452`. By :user:`Wally Gauze `. + n_samples and n_features. :issue:`8484`. By :user:`Wally Gauze `. Version 0.19 ============ From d44986827fb9e2049e5882dcd84d6fb26c2aa08b Mon Sep 17 00:00:00 2001 From: Wally Date: Tue, 15 Aug 2017 22:42:50 +0100 Subject: [PATCH 13/23] fixed flake8 --- sklearn/decomposition/pca.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 337396212bb87..1178f8e540552 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -136,9 +136,9 @@ class PCA(_BasePCA): of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components. 
If svd_solver == 'arpack', the number of components must be strictly - less than the minimum of n_features and n_samples. + less than the minimum of n_features and n_samples. Hence, the None case results in: - + n_components == min(n_samples, n_features) - 1 copy : bool (default True) From 07c1e1d8945aa195855b4b375ed23610f5cd8b0d Mon Sep 17 00:00:00 2001 From: Wally Date: Tue, 15 Aug 2017 22:43:16 +0100 Subject: [PATCH 14/23] refactored test code --- sklearn/decomposition/tests/test_pca.py | 48 ++++++++++--------------- 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 1c52e2cc1edce..c4073330a48fb 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -354,41 +354,31 @@ def test_pca_validation(): # (less than 0 or more than the lesser dimension of the input # matrix X) raise errors. X = np.array([[0, 1, 0], [1, 0, 0]]) - for solver in solver_list: - for n_components in [-1, 3]: - assert_raises_regex(ValueError, - "n_components\=.* must be between .* and min\(" - "n_samples, n_features\)\=.* with svd_solver" - "\=\'(?:full|arpack|randomized|auto)\'$", - PCA(n_components, svd_solver=solver).fit, X) - # We conduct the same test on X.T so that it is invariant to axis. - assert_raises_regex(ValueError, - "n_components\=.* must be between .* and min\(" - "n_samples, n_features\)\=.* with svd_solver" - "\=\'(?:full|arpack|randomized|auto)\'$", - PCA(n_components, svd_solver=solver).fit, X.T) + # We conduct the same test on X.T so that it is invariant to axis. 
+ for data in [X, X.T]: + for solver in solver_list: + for n_components in [-1, 3]: + assert_raises_regex(ValueError, + "n_components\=.* must be between .* and " + "min\(n_samples, n_features\)\=.* with " + "svd_solver\=" + "\'(?:full|arpack|randomized|auto)\'$", + PCA(n_components, svd_solver=solver).fit, + data) def test_n_components_none(): # Ensures that n_components == None is handled correctly X = iris.data - for solver in solver_list: - pca = PCA(svd_solver=solver) - pca.fit(X) - if solver == 'arpack': - assert_equal(pca.n_components_, min(X.shape) - 1) - else: - assert_equal(pca.n_components_, min(X.shape)) - # We conduct the same test on X.T so that it is invariant to axis. - X_2 = X.T - for solver in solver_list: - pca = PCA(svd_solver=solver) - pca.fit(X_2) - if solver == 'arpack': - assert_equal(pca.n_components_, min(X_2.shape) - 1) - else: - assert_equal(pca.n_components_, min(X_2.shape)) + for data in [X, X.T]: + for solver in solver_list: + pca = PCA(svd_solver=solver) + pca.fit(data) + if solver == 'arpack': + assert_equal(pca.n_components_, min(data.shape) - 1) + else: + assert_equal(pca.n_components_, min(data.shape)) def test_randomized_pca_check_projection(): From 724e612f9910155db0be27f983e49bf9281a3ec2 Mon Sep 17 00:00:00 2001 From: Wally Date: Fri, 18 Aug 2017 20:00:22 +0100 Subject: [PATCH 15/23] corrected whats_new entry typo --- doc/whats_new.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 4e7305888bc99..cfe3b1f475d24 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -41,7 +41,7 @@ Bug fixes Decomposition, manifold learning and clustering -- Fix for uninformative error in :class:`decomposition.incremental_pca`: +- Fix for uninformative error in :class:`decomposition.IncrementalPCA`: now an error is raised if the number of components is larger than the chosen batch size. The ``n_components=None`` case was adapted accordingly. :issue:`6452`. 
By :user:`Wally Gauze `. @@ -51,7 +51,7 @@ Decomposition, manifold learning and clustering division on Python 2 versions. :issue:`9492` by :user:`James Bourbeau `. -- In :class:`decomposition.pca` selecting a n_components parameter greater than +- In :class:`decomposition.PCA` selecting a n_components parameter greater than the number of samples now raises an error. Similarly, the ``n_components=None`` case now selects the minimum of n_samples and n_features. :issue:`8484`. By :user:`Wally Gauze `. From 2251ae5046495523d385a5f294da846bdeb36ffc Mon Sep 17 00:00:00 2001 From: Wally Date: Fri, 18 Aug 2017 20:01:55 +0100 Subject: [PATCH 16/23] arpack case was missing from test; improved overall test --- sklearn/decomposition/tests/test_pca.py | 27 ++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index c4073330a48fb..47c0b403af687 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -354,17 +354,30 @@ def test_pca_validation(): # (less than 0 or more than the lesser dimension of the input # matrix X) raise errors. X = np.array([[0, 1, 0], [1, 0, 0]]) + minimum = 2 # The smallest dimension + lower_limit = {'randomized':1,'full':0,'auto':0} # We conduct the same test on X.T so that it is invariant to axis. 
for data in [X, X.T]: for solver in solver_list: for n_components in [-1, 3]: - assert_raises_regex(ValueError, - "n_components\=.* must be between .* and " - "min\(n_samples, n_features\)\=.* with " - "svd_solver\=" - "\'(?:full|arpack|randomized|auto)\'$", - PCA(n_components, svd_solver=solver).fit, - data) + if solver == 'arpack': + assert_raises_regex(ValueError, + "n_components={} must be stricly less " + "than min(n_samples, n_features)={} " + "with svd_solver='arpack'" + .format(n_components, minimum), + PCA(n_components, svd_solver=solver) + .fit, data) + else: + assert_raises_regex(ValueError, + "n_components={} must be between {} " + "and min(n_samples, n_features)={} with " + "svd_solver='{}'" + .format(n_components, + lower_limit[solver], minimum, + solver), + PCA(n_components, + svd_solver=solver).fit, data) def test_n_components_none(): From f9af4d63ff3caacfeb8c315c43f3e6ee060f18ce Mon Sep 17 00:00:00 2001 From: Wally Date: Fri, 18 Aug 2017 21:15:15 +0100 Subject: [PATCH 17/23] flake8 corrections --- sklearn/decomposition/tests/test_pca.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 47c0b403af687..7d6787b72ad49 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -354,8 +354,8 @@ def test_pca_validation(): # (less than 0 or more than the lesser dimension of the input # matrix X) raise errors. X = np.array([[0, 1, 0], [1, 0, 0]]) - minimum = 2 # The smallest dimension - lower_limit = {'randomized':1,'full':0,'auto':0} + minimum = 2 # The smallest dimension + lower_limit = {'randomized': 1, 'full': 0, 'auto': 0} # We conduct the same test on X.T so that it is invariant to axis. 
for data in [X, X.T]: for solver in solver_list: @@ -371,8 +371,8 @@ def test_pca_validation(): else: assert_raises_regex(ValueError, "n_components={} must be between {} " - "and min(n_samples, n_features)={} with " - "svd_solver='{}'" + "and min(n_samples, n_features)={} " + "with svd_solver='{}'" .format(n_components, lower_limit[solver], minimum, solver), From fe7047fd853421d358034dfdcdba9690a2d06bc8 Mon Sep 17 00:00:00 2001 From: Wally Date: Sat, 19 Aug 2017 21:06:15 +0100 Subject: [PATCH 18/23] arpack case was still missing + fixed my test bug + more refactoring --- sklearn/decomposition/tests/test_pca.py | 60 ++++++++++++++----------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 7d6787b72ad49..c94712d0b18cb 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -350,34 +350,44 @@ def test_pca_inverse(): def test_pca_validation(): - # Ensures that extreme inputs for n_components common to all solvers - # (less than 0 or more than the lesser dimension of the input - # matrix X) raise errors. + # Ensures that solver-specific extreme inputs for the n_components + # parameter raise errors X = np.array([[0, 1, 0], [1, 0, 0]]) - minimum = 2 # The smallest dimension - lower_limit = {'randomized': 1, 'full': 0, 'auto': 0} - # We conduct the same test on X.T so that it is invariant to axis. - for data in [X, X.T]: - for solver in solver_list: + smallest_d = 2 # The smallest dimension + lower_limit = {'randomized': 1, 'arpack': 1, 'full': 0, 'auto': 0} + + for solver in solver_list: + # We conduct the same test on X.T so that it is invariant to axis. 
+ for data in [X, X.T]: for n_components in [-1, 3]: - if solver == 'arpack': - assert_raises_regex(ValueError, - "n_components={} must be stricly less " - "than min(n_samples, n_features)={} " - "with svd_solver='arpack'" - .format(n_components, minimum), - PCA(n_components, svd_solver=solver) - .fit, data) + + if solver == 'auto': + solver_reported = 'full' else: - assert_raises_regex(ValueError, - "n_components={} must be between {} " - "and min(n_samples, n_features)={} " - "with svd_solver='{}'" - .format(n_components, - lower_limit[solver], minimum, - solver), - PCA(n_components, - svd_solver=solver).fit, data) + solver_reported = solver + + assert_raises_regex(ValueError, + "n_components={} must be between " + "{} and min\(n_samples, n_features\)=" + "{} with svd_solver=\'{}\'" + .format(n_components, + lower_limit[solver], + smallest_d, + solver_reported), + PCA(n_components, + svd_solver=solver).fit, data) + if solver == 'arpack': + + n_components = smallest_d + + assert_raises_regex(ValueError, + "n_components={} must be " + "strictly less than " + "min\(n_samples, n_features\)={}" + " with svd_solver=\'arpack\'" + .format(n_components, smallest_d), + PCA(n_components, svd_solver=solver) + .fit, data) def test_n_components_none(): From 1e7cd1047201c52ee7694729d9887ad9e190a578 Mon Sep 17 00:00:00 2001 From: Wally Date: Sat, 19 Aug 2017 21:06:40 +0100 Subject: [PATCH 19/23] corrected typo --- sklearn/decomposition/pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/pca.py b/sklearn/decomposition/pca.py index 1178f8e540552..9cd108710ad3f 100644 --- a/sklearn/decomposition/pca.py +++ b/sklearn/decomposition/pca.py @@ -475,7 +475,7 @@ def _fit_truncated(self, X, n_components, svd_solver): svd_solver)) elif svd_solver == 'arpack' and n_components == min(n_samples, n_features): - raise ValueError("n_components=%r must be stricly less than " + raise ValueError("n_components=%r must be strictly less than " "min(n_samples, 
n_features)=%r with " "svd_solver='%s'" % (n_components, min(n_samples, n_features), From a52851294954ffac94e474583d95e4bddcf3417b Mon Sep 17 00:00:00 2001 From: Wally Date: Sun, 20 Aug 2017 13:13:38 +0100 Subject: [PATCH 20/23] allow type long? --- sklearn/decomposition/tests/test_pca.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index c94712d0b18cb..6c0ccb2a1d813 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -353,7 +353,7 @@ def test_pca_validation(): # Ensures that solver-specific extreme inputs for the n_components # parameter raise errors X = np.array([[0, 1, 0], [1, 0, 0]]) - smallest_d = 2 # The smallest dimension + smallest_d = int(2) # The smallest dimension lower_limit = {'randomized': 1, 'arpack': 1, 'full': 0, 'auto': 0} for solver in solver_list: @@ -367,9 +367,9 @@ def test_pca_validation(): solver_reported = solver assert_raises_regex(ValueError, - "n_components={} must be between " - "{} and min\(n_samples, n_features\)=" - "{} with svd_solver=\'{}\'" + "n_components={}L? must be between " + "{}L? and min\(n_samples, n_features\)=" + "{}L? with svd_solver=\'{}\'" .format(n_components, lower_limit[solver], smallest_d, @@ -381,9 +381,9 @@ def test_pca_validation(): n_components = smallest_d assert_raises_regex(ValueError, - "n_components={} must be " + "n_components={}L? must be " "strictly less than " - "min\(n_samples, n_features\)={}" + "min\(n_samples, n_features\)={}L?" 
" with svd_solver=\'arpack\'" .format(n_components, smallest_d), PCA(n_components, svd_solver=solver) From f25bd9ccd35e2bc0fe124848a90cee2894df004f Mon Sep 17 00:00:00 2001 From: Wally Date: Tue, 22 Aug 2017 15:44:49 +0100 Subject: [PATCH 21/23] accidentally left useless piece of code --- sklearn/decomposition/tests/test_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py index 6c0ccb2a1d813..aa67189407296 100644 --- a/sklearn/decomposition/tests/test_pca.py +++ b/sklearn/decomposition/tests/test_pca.py @@ -353,7 +353,7 @@ def test_pca_validation(): # Ensures that solver-specific extreme inputs for the n_components # parameter raise errors X = np.array([[0, 1, 0], [1, 0, 0]]) - smallest_d = int(2) # The smallest dimension + smallest_d = 2 # The smallest dimension lower_limit = {'randomized': 1, 'arpack': 1, 'full': 0, 'auto': 0} for solver in solver_list: From bd1f151fe5778b55ef7654d756c5951d07905d79 Mon Sep 17 00:00:00 2001 From: Wally Date: Fri, 8 Sep 2017 20:51:29 +0100 Subject: [PATCH 22/23] reverted changes in doc/whats_new.rst --- doc/whats_new.rst | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index fe8e0e007f932..5de27d3251787 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -65,7 +65,7 @@ Bug fixes Decomposition, manifold learning and clustering -- Fix for uninformative error in :class:`decomposition.IncrementalPCA`: +- Fix for uninformative error in :class:`decomposition.incremental_pca`: now an error is raised if the number of components is larger than the chosen batch size. The ``n_components=None`` case was adapted accordingly. :issue:`6452`. By :user:`Wally Gauze `. @@ -82,11 +82,6 @@ Decomposition, manifold learning and clustering where all samples had equal similarity. :issue:`9612`. By :user:`Jonatan Samoocha `. 
-- In :class:`decomposition.PCA` selecting a n_components parameter greater than - the number of samples now raises an error. - Similarly, the ``n_components=None`` case now selects the minimum of - n_samples and n_features. :issue:`8484`. By :user:`Wally Gauze `. - Version 0.19 ============ From e3ecd12b45b905ed7973ea743308e55f41812d34 Mon Sep 17 00:00:00 2001 From: Wally Date: Fri, 8 Sep 2017 20:57:56 +0100 Subject: [PATCH 23/23] added entry in whats_new/v0.20.rst --- doc/whats_new/v0.20.rst | 102 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 doc/whats_new/v0.20.rst diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst new file mode 100644 index 0000000000000..4f5e13e7860a5 --- /dev/null +++ b/doc/whats_new/v0.20.rst @@ -0,0 +1,102 @@ +.. include:: _contributors.rst + +.. currentmodule:: sklearn + +.. _changes_0_20: + +Version 0.20 (under development) +================================ + +Changed models +-------------- + +The following estimators and functions, when fit with the same data and +parameters, may produce different models from the previous version. This often +occurs due to changes in the modelling logic (bug fixes or enhancements), or in +random sampling procedures. + +- :class:`decomposition.IncrementalPCA` in Python 2 (bug fix) + +Details are listed in the changelog below. + +(While we are trying to better inform users by providing this information, we +cannot assure that this list is complete.) + +Changelog +--------- + +New features +............ + +Classifiers and regressors + +- :class:`ensemble.GradientBoostingClassifier` and + :class:`ensemble.GradientBoostingRegressor` now support early stopping + via ``n_iter_no_change``, ``validation_fraction`` and ``tol``. :issue:`7071` + by `Raghav RV`_ + +- Added :class:`naive_bayes.ComplementNB`, which implements the Complement + Naive Bayes classifier described in Rennie et al. (2003). + By :user:`Michael A. Alcorn `. 
+ +Enhancements +............ + +Classifiers and regressors + +- In :class:`gaussian_process.GaussianProcessRegressor`, method ``predict`` + is faster when using ``return_std=True``, particularly when called + several times in a row. :issue:`9234` by :user:`andrewww <andrewww>` + and :user:`Minghui Liu <minghui-liu>`. + +- Add `named_estimators_` parameter in + :class:`sklearn.ensemble.voting_classifier` to access fitted + estimators. :issue:`9157` by :user:`Herilalaina Rakotoarison <herilalaina>`. + + +Model evaluation and meta-estimators + +- A scorer based on :func:`metrics.brier_score_loss` is also available. + :issue:`9521` by :user:`Hanmin Qin <qinhanmin2014>`. + +Linear, kernelized and related models + +- Deprecate ``random_state`` parameter in :class:`svm.OneClassSVM` as the + underlying implementation is not random. + :issue:`9497` by :user:`Albert Thomas <albertcthomas>`. + +Bug fixes +......... + +Decomposition, manifold learning and clustering + +- Fix for uninformative error in :class:`decomposition.IncrementalPCA`: + now an error is raised if the number of components is larger than the + chosen batch size. The ``n_components=None`` case was adapted accordingly. + :issue:`6452`. By :user:`Wally Gauze <wallygauze>`. + +- Fixed a bug where the ``partial_fit`` method of + :class:`decomposition.IncrementalPCA` used integer division instead of float + division on Python 2 versions. :issue:`9492` by + :user:`James Bourbeau <jrbourbeau>`. + +- Fixed a bug where the ``fit`` method of + :class:`cluster.affinity_propagation_.AffinityPropagation` stored cluster + centers as 3d array instead of 2d array in case of non-convergence. For the + same class, fixed undefined and arbitrary behavior in case of training data + where all samples had equal similarity. + :issue:`9612`. By :user:`Jonatan Samoocha <jsamoocha>`. + +- In :class:`decomposition.PCA` selecting an n_components parameter greater than + the number of samples now raises an error. + Similarly, the ``n_components=None`` case now selects the minimum of + n_samples and n_features. :issue:`8484`.
By :user:`Wally Gauze <wallygauze>`. + +API changes summary +------------------- + +Linear, kernelized and related models + +- Deprecate ``random_state`` parameter in :class:`svm.OneClassSVM` as the + underlying implementation is not random. + :issue:`9497` by :user:`Albert Thomas <albertcthomas>`.