From 7a47d1d6662d80a2982bbec5beab77e3333b0ba3 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Thu, 3 Aug 2017 22:49:21 -0500 Subject: [PATCH 1/6] Ensures that partial_fit uses float division --- sklearn/decomposition/incremental_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py index c7b09c93dace9..dc0d8504b74a2 100644 --- a/sklearn/decomposition/incremental_pca.py +++ b/sklearn/decomposition/incremental_pca.py @@ -246,7 +246,7 @@ def partial_fit(self, X, y=None, check_input=True): X -= col_batch_mean # Build matrix of combined previous basis and new data mean_correction = \ - np.sqrt((self.n_samples_seen_ * n_samples) / + np.sqrt(float(self.n_samples_seen_ * n_samples) / n_total_samples) * (self.mean_ - col_batch_mean) X = np.vstack((self.singular_values_.reshape((-1, 1)) * self.components_, X, mean_correction)) From 8c0b3b9038d4eb529bfd58b600a9a443a576a1b9 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Tue, 8 Aug 2017 21:27:46 -0500 Subject: [PATCH 2/6] Switches to using future division for float division --- sklearn/decomposition/incremental_pca.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py index dc0d8504b74a2..f381dd76d64cc 100644 --- a/sklearn/decomposition/incremental_pca.py +++ b/sklearn/decomposition/incremental_pca.py @@ -4,6 +4,7 @@ # Giorgio Patrini # License: BSD 3 clause +from __future__ import division import numpy as np from scipy import linalg @@ -246,7 +247,7 @@ def partial_fit(self, X, y=None, check_input=True): X -= col_batch_mean # Build matrix of combined previous basis and new data mean_correction = \ - np.sqrt(float(self.n_samples_seen_ * n_samples) / + np.sqrt((self.n_samples_seen_ * n_samples) / n_total_samples) * (self.mean_ - col_batch_mean) X = np.vstack((self.singular_values_.reshape((-1, 1)) * self.components_, X, 
mean_correction)) From 99ea6d3a6082e02d938eff421bc866dad1e9d333 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 9 Aug 2017 13:24:39 -0500 Subject: [PATCH 3/6] Adds non-regression test for issue #9489 --- .../decomposition/tests/test_incremental_pca.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sklearn/decomposition/tests/test_incremental_pca.py b/sklearn/decomposition/tests/test_incremental_pca.py index 87e7f9d7683e1..49dd4d0d8920a 100644 --- a/sklearn/decomposition/tests/test_incremental_pca.py +++ b/sklearn/decomposition/tests/test_incremental_pca.py @@ -273,3 +273,17 @@ def test_whitening(): assert_almost_equal(X, Xinv_ipca, decimal=prec) assert_almost_equal(X, Xinv_pca, decimal=prec) assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec) + + +def test_partial_fit_correct_answer(): + # Non-regression test for issue #9489 + + A = np.array([[6, 7, 3], [5, 2, 1], [3, 5, 6]]) + B = np.array([[1, 2, 4], [5, 3, 6]]) + C = np.array([[3, 2, 1]]) + + ipca = IncrementalPCA(n_components=2) + ipca.partial_fit(A) + ipca.partial_fit(B) + # Know answer is [[-1.48864923, -3.15618645]], want to ensure + np.testing.assert_allclose(ipca.transform(C), [[-1.48864923, -3.15618645]]) From 301639347ccc335b4773ca271e16680af73edbe4 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Thu, 10 Aug 2017 08:52:47 -0500 Subject: [PATCH 4/6] Updates test to remove dependence on a "known answer" --- .../tests/test_incremental_pca.py | 34 ++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/sklearn/decomposition/tests/test_incremental_pca.py b/sklearn/decomposition/tests/test_incremental_pca.py index 49dd4d0d8920a..f9772e84706cc 100644 --- a/sklearn/decomposition/tests/test_incremental_pca.py +++ b/sklearn/decomposition/tests/test_incremental_pca.py @@ -275,15 +275,25 @@ def test_whitening(): assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec) -def test_partial_fit_correct_answer(): - # Non-regression test for issue #9489 - - A = 
np.array([[6, 7, 3], [5, 2, 1], [3, 5, 6]]) - B = np.array([[1, 2, 4], [5, 3, 6]]) - C = np.array([[3, 2, 1]]) - - ipca = IncrementalPCA(n_components=2) - ipca.partial_fit(A) - ipca.partial_fit(B) - # Know answer is [[-1.48864923, -3.15618645]], want to ensure - np.testing.assert_allclose(ipca.transform(C), [[-1.48864923, -3.15618645]]) +def test_incremental_pca_partial_fit_float_division(): + # Test to ensure float division is used in all versions of Python + # (non-regression test for issue #9489) + + rng = np.random.RandomState(0) + A = rng.randn(5, 3) + 2 + B = rng.randn(7, 3) + 5 + + pca = IncrementalPCA(n_components=2) + pca.partial_fit(A) + # Set n_samples_seen_ to be a floating point number instead of an int + pca.n_samples_seen_ = float(pca.n_samples_seen_) + pca.partial_fit(B) + singular_vals_float_samples_seen = pca.singular_values_ + + pca2 = IncrementalPCA(n_components=2) + pca2.partial_fit(A) + pca2.partial_fit(B) + singular_vals_int_samples_seen = pca2.singular_values_ + + np.testing.assert_allclose(singular_vals_float_samples_seen, + singular_vals_int_samples_seen) From 9aa2e74bbf0fc8d2b8e60ceb48d1f88f68842ba2 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Thu, 10 Aug 2017 21:03:36 -0500 Subject: [PATCH 5/6] Updates doc/whats_new.rst with entry for this PR --- doc/whats_new.rst | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 23a3c7a6f3505..9f4a8a2af1ee2 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -11,6 +11,18 @@ Version 0.20 (under development) Changed models -------------- +The following estimators and functions, when fit with the same data and +parameters, may produce different models from the previous version. This often +occurs due to changes in the modelling logic (bug fixes or enhancements), or in +random sampling procedures. + +- :class:`decomposition.IncrementalPCA` (bug fix) + +Details are listed in the changelog below. 
+ +(While we are trying to better inform users by providing this information, we +cannot assure that this list is complete.) + Changelog --------- @@ -24,6 +36,16 @@ Classifiers and regressors via ``n_iter_no_change``, ``validation_fraction`` and ``tol``. :issue:`7071` by `Raghav RV`_ +Bug fixes +......... + +Decomposition, manifold learning and clustering + +- Fixed a bug where the ``partial_fit`` method of + :class:`decomposition.IncrementalPCA` used integer division instead of float + division on Python 2 versions. :issue:`9492` by + :user:`James Bourbeau <jrbourbeau>`. + Version 0.19 ============ @@ -160,7 +182,7 @@ Model selection and evaluation :issue:`8120` by `Neeraj Gangwar`_. - Added a scorer based on :class:`metrics.explained_variance_score`. - :issue:`9259` by `Hanmin Qin <https://github.com/qinhanmin2014>`_. + :issue:`9259` by `Hanmin Qin <https://github.com/qinhanmin2014>`_. Miscellaneous @@ -197,7 +219,7 @@ Trees and ensembles places. :issue:`8698` by :user:`Guillaume Lemaitre <glemaitre>`. - :func:`tree.export_graphviz` now shows configurable number of decimal places. :issue:`8698` by :user:`Guillaume Lemaitre <glemaitre>`. - + - Added ``flatten_transform`` parameter to :class:`ensemble.VotingClassifier` to change output shape of `transform` method to 2 dimensional. :issue:`7794` by :user:`Ibraim Ganiev <olologin>` and @@ -524,7 +546,7 @@ Decomposition, manifold learning and clustering in :class:`decomposition.PCA`, :class:`decomposition.RandomizedPCA` and :class:`decomposition.IncrementalPCA`. - :issue:`9105` by `Hanmin Qin <https://github.com/qinhanmin2014>`_. + :issue:`9105` by `Hanmin Qin <https://github.com/qinhanmin2014>`_. - Fixed the implementation of noise_variance_ in :class:`decomposition.PCA`. :issue:`9108` by `Hanmin Qin <https://github.com/qinhanmin2014>`_. 
From 567139c5044a6f5011f2a7a1daec68722f877060 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Sun, 13 Aug 2017 22:36:42 -0500 Subject: [PATCH 6/6] Specifies bug fix is for Python 2 versions in doc/whats_new.rst --- doc/whats_new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 38cc800421a7c..86c6f7c26ca44 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -16,7 +16,7 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. -- :class:`decomposition.IncrementalPCA` (bug fix) +- :class:`decomposition.IncrementalPCA` in Python 2 (bug fix) Details are listed in the changelog below.