From aeada3145b0c2f4e79febcd864142847dd045f59 Mon Sep 17 00:00:00 2001
From: Micky774
Date: Sun, 13 Feb 2022 20:43:02 -0500
Subject: [PATCH 1/7] Initial in-place variance calculation implementation

---
 sklearn/decomposition/_pca.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index d74f7755d10df..7ab6c3ae1bfe6 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -617,14 +617,25 @@ def _fit_truncated(self, X, n_components, svd_solver):
         self.components_ = Vt
         self.n_components_ = n_components
 
+        def in_place_var(X):
+            N = X.shape[0] - 1
+            X -= X.mean(axis=0)
+            np.square(X, out=X)
+            np.sum(X, axis=0, out=X[0])
+            X = X[0]
+            X /= N
+            out = np.sum(X)
+            del X
+            return out
+
         # Get variance explained by singular values
         self.explained_variance_ = (S ** 2) / (n_samples - 1)
-        total_var = np.var(X, ddof=1, axis=0)
-        self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()
+        total_var = in_place_var(X) if self.copy else np.var(X, ddof=1, axis=0).sum()
+        self.explained_variance_ratio_ = self.explained_variance_ / total_var
         self.singular_values_ = S.copy()  # Store the singular values.
 
         if self.n_components_ < min(n_features, n_samples):
-            self.noise_variance_ = total_var.sum() - self.explained_variance_.sum()
+            self.noise_variance_ = total_var - self.explained_variance_.sum()
             self.noise_variance_ /= min(n_features, n_samples) - n_components
         else:
             self.noise_variance_ = 0.0

From 8a5a8f2e17e08532690b19c0621efefdfd2c3422 Mon Sep 17 00:00:00 2001
From: Micky774
Date: Sat, 19 Feb 2022 20:00:38 -0500
Subject: [PATCH 2/7] Implemented iterative variance calculation

---
 sklearn/decomposition/_pca.py | 36 +++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index 7ab6c3ae1bfe6..0084b768ea212 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -10,7 +10,7 @@
 #
 # License: BSD 3 clause
 
-from math import log, sqrt
+from math import log, sqrt, ceil
 import numbers
 
 import numpy as np
@@ -23,7 +23,7 @@
 from ..utils import check_random_state, check_scalar
 from ..utils._arpack import _init_arpack_v0
 from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
-from ..utils.extmath import stable_cumsum
+from ..utils.extmath import stable_cumsum, _incremental_mean_and_var
 from ..utils.validation import check_is_fitted
 
@@ -617,20 +617,28 @@ def _fit_truncated(self, X, n_components, svd_solver):
         self.components_ = Vt
         self.n_components_ = n_components
 
-        def in_place_var(X):
-            N = X.shape[0] - 1
-            X -= X.mean(axis=0)
-            np.square(X, out=X)
-            np.sum(X, axis=0, out=X[0])
-            X = X[0]
-            X /= N
-            out = np.sum(X)
-            del X
-            return out
-
         # Get variance explained by singular values
         self.explained_variance_ = (S ** 2) / (n_samples - 1)
-        total_var = in_place_var(X) if self.copy else np.var(X, ddof=1, axis=0).sum()
+        if X.shape[0] > 1000:
+            chunk = ceil(X.shape[0] / 4)
+            last_mean, last_variance, last_sample_count = np.zeros((3, X.shape[1]))
+
+            for i in range(3):
+                last_mean, last_variance, last_sample_count = _incremental_mean_and_var(
+                    X[i * chunk : (i + 1) * chunk, :],
+                    last_mean,
+                    last_variance,
+                    last_sample_count,
+                )
+            last_mean, last_variance, last_sample_count = _incremental_mean_and_var(
+                X[3 * chunk :, :], last_mean, last_variance, last_sample_count
+            )
+
+            del last_mean, last_sample_count
+            total_var = last_variance.sum()
+        else:
+            total_var = np.var(X, ddof=1, axis=0).sum()
+
         self.explained_variance_ratio_ = self.explained_variance_ / total_var
         self.singular_values_ = S.copy()  # Store the singular values.

From 04a34c1c8faa6f6d57e8b58dfd1c2194962c1804 Mon Sep 17 00:00:00 2001
From: Micky774
Date: Thu, 24 Feb 2022 21:04:24 -0500
Subject: [PATCH 3/7] Added new test, introduced in-place var calculation

- Created a new method in `extmath.py` for calculating feature-wise
  variance in place
- PCA now uses the in-place feature variance calculation to avoid a
  memory spike
- Added a new test to check the correctness of the internal `total_var`
  calculation
- Fixed the existing whitening test by changing `X_` -> `X_.copy()` to
  account for the `copy=False` semantics of accepting in-place changes
  to `X`
---
 sklearn/decomposition/_pca.py           | 24 +++--------------------
 sklearn/decomposition/tests/test_pca.py | 12 +++++++++++-
 sklearn/utils/extmath.py                | 26 +++++++++++++++++++++++++
 3 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index 262e3690110a3..6a7227ca6f2ca 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -10,7 +10,7 @@
 #
 # License: BSD 3 clause
 
-from math import log, sqrt, ceil
+from math import log, sqrt
 import numbers
 
 import numpy as np
@@ -23,7 +23,7 @@
 from ..utils import check_random_state, check_scalar
 from ..utils._arpack import _init_arpack_v0
 from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
-from ..utils.extmath import stable_cumsum, _incremental_mean_and_var
+from ..utils.extmath import stable_cumsum, in_place_feature_var
 from ..utils.validation import check_is_fitted
 
@@ -619,25 +619,7 @@ def _fit_truncated(self, X, n_components, svd_solver):
 
         # Get variance explained by singular values
         self.explained_variance_ = (S**2) / (n_samples - 1)
-        if X.shape[0] > 1000:
-            chunk = ceil(X.shape[0] / 4)
-            last_mean, last_variance, last_sample_count = np.zeros((3, X.shape[1]))
-
-            for i in range(3):
-                last_mean, last_variance, last_sample_count = _incremental_mean_and_var(
-                    X[i * chunk : (i + 1) * chunk, :],
-                    last_mean,
-                    last_variance,
-                    last_sample_count,
-                )
-            last_mean, last_variance, last_sample_count = _incremental_mean_and_var(
-                X[3 * chunk :, :], last_mean, last_variance, last_sample_count
-            )
-
-            del last_mean, last_sample_count
-            total_var = last_variance.sum()
-        else:
-            total_var = np.var(X, ddof=1, axis=0).sum()
+        total_var = in_place_feature_var(X).sum()
 
         self.explained_variance_ratio_ = self.explained_variance_ / total_var
         self.singular_values_ = S.copy()  # Store the singular values.
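
The chunked strategy that the hunk above removes can be sanity-checked in isolation. Below is a minimal sketch, not part of the patch series: it assumes `_incremental_mean_and_var` returns the running ddof=0 variance (as its use in `StandardScaler.partial_fit` suggests), and the `n_chunks` name is illustrative only.

    import numpy as np
    from math import ceil
    from sklearn.utils.extmath import _incremental_mean_and_var

    rng = np.random.RandomState(0)
    X = rng.randn(2000, 50)

    # Accumulate running feature-wise statistics one chunk at a time so no
    # centered copy of the full (n_samples, n_features) array is allocated.
    n_chunks = 4
    chunk = ceil(X.shape[0] / n_chunks)
    mean, var, count = np.zeros((3, X.shape[1]))
    for i in range(n_chunks):
        mean, var, count = _incremental_mean_and_var(
            X[i * chunk : (i + 1) * chunk], mean, var, count
        )

    # The running variance is normalized by n_samples (ddof=0); rescale it
    # before comparing with the ddof=1 total variance PCA reports.
    n = X.shape[0]
    assert np.isclose(var.sum() * n / (n - 1), np.var(X, ddof=1, axis=0).sum())

If the ddof=0 assumption holds, summing `last_variance` directly, as PATCH 2 does, understates the ddof=1 total variance by a factor of (n - 1) / n, which may be one reason the approach was replaced in PATCH 3.
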
diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index b8c95deaee5a5..7638250939aab 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -95,7 +95,7 @@ def test_whitening(solver, copy):
         X_ = X.copy()
         pca = PCA(
             n_components=n_components, whiten=False, copy=copy, svd_solver=solver
-        ).fit(X_)
+        ).fit(X_.copy())
 
         X_unwhitened = pca.transform(X_)
         assert X_unwhitened.shape == (n_samples, n_components)
@@ -720,3 +720,13 @@ def test_feature_names_out():
 
     names = pca.get_feature_names_out()
     assert_array_equal([f"pca{i}" for i in range(2)], names)
+
+
+def test_variance_correctness():
+    """Check the accuracy of PCA's internal variance calculation"""
+    rng = np.random.RandomState(0)
+    X = rng.randn(1000, 200)
+    pca = PCA().fit(X)
+    pca_var = pca.explained_variance_ / pca.explained_variance_ratio_
+    true_var = np.var(X, ddof=1, axis=0).sum()
+    np.testing.assert_allclose(pca_var, true_var)
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index a8ca0f7be5caa..673212b040bfd 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -898,6 +898,32 @@ def _safe_accumulator_op(op, x, *args, **kwargs):
     return result
 
 
+def in_place_feature_var(X):
+    """
+    Calculate the feature-wise variance of a 2D array in place.
+
+    This function provides an in-place alternative to `np.var` for
+    calculating the feature-wise sample variance of a 2D array. It is
+    equivalent to `np.var(X, ddof=1, axis=0)` but mutates `X`.
+
+    Parameters
+    ----------
+    X : array-like of shape (n_samples, n_features)
+        Data to use for variance calculation; overwritten in the process.
+
+    Returns
+    -------
+    var : ndarray of shape (n_features,)
+        The feature-wise variance.
+    """
+    N = X.shape[0] - 1
+    X -= X.mean(axis=0)
+    np.square(X, out=X)
+    np.sum(X, axis=0, out=X[0])
+    var = X[0] / N
+    return var
+
+
 def _incremental_mean_and_var(
     X, last_mean, last_variance, last_sample_count, sample_weight=None
 ):

From b4ee107f0163da193a4d178518fed279a7f1ef5e Mon Sep 17 00:00:00 2001
From: Micky774
Date: Thu, 24 Feb 2022 21:21:12 -0500
Subject: [PATCH 4/7] Added changelog entry

---
 doc/whats_new/v1.1.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 2c73262d3ab4d..6867a8db04ed7 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -289,6 +289,10 @@ Changelog
   :pr:`22300` by :user:`Meekail Zain <Micky774>` and :pr:`15948` by
   :user:`sysuresh`.
 
+- |Fix| Greatly reduced peak memory usage in :class:`decomposition.PCA` when
+  calling `fit` or `fit_transform`.
+  :pr:`22553` by :user:`Meekail Zain <Micky774>`.
+
 :mod:`sklearn.discriminant_analysis`
 ....................................

From 98daf6b61a65c20b5e5ab36bdc8fa8e848ffc0e6 Mon Sep 17 00:00:00 2001
From: Micky774
Date: Thu, 24 Feb 2022 21:21:58 -0500
Subject: [PATCH 5/7] Removed empty line

---
 sklearn/decomposition/_pca.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index 6a7227ca6f2ca..cb3f1a3583fc1 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -620,7 +620,6 @@ def _fit_truncated(self, X, n_components, svd_solver):
         # Get variance explained by singular values
         self.explained_variance_ = (S**2) / (n_samples - 1)
         total_var = in_place_feature_var(X).sum()
-
         self.explained_variance_ratio_ = self.explained_variance_ / total_var
         self.singular_values_ = S.copy()  # Store the singular values.
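
For reference, the helper introduced in PATCH 3 (and inlined again in PATCH 7 below) can be exercised on its own. A minimal sketch, not part of the patches; `demo_in_place_var` is a hypothetical local mirror of `in_place_feature_var`:

    import numpy as np

    def demo_in_place_var(X):
        # Center, square, and reduce into the first row so that no second
        # (n_samples, n_features) buffer is ever allocated.
        N = X.shape[0] - 1
        X -= X.mean(axis=0)
        np.square(X, out=X)
        np.sum(X, axis=0, out=X[0])
        return X[0] / N

    rng = np.random.RandomState(0)
    X = rng.randn(1000, 200)
    expected = np.var(X, ddof=1, axis=0)  # computed before X is clobbered

    np.testing.assert_allclose(demo_in_place_var(X), expected)

After the call, `X` holds squared deviations with its first row overwritten by their sum, which is why the whitening test in PATCH 3 now fits on `X_.copy()` when `copy=False`.
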
From 59c7eee45b52a8c2f3c60fbe0aaa33c097c5359d Mon Sep 17 00:00:00 2001
From: Micky774
Date: Thu, 24 Feb 2022 21:24:48 -0500
Subject: [PATCH 6/7] Added test parameterization for `copy`

---
 sklearn/decomposition/tests/test_pca.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/decomposition/tests/test_pca.py b/sklearn/decomposition/tests/test_pca.py
index 7638250939aab..44b97a5fc1ce7 100644
--- a/sklearn/decomposition/tests/test_pca.py
+++ b/sklearn/decomposition/tests/test_pca.py
@@ -722,7 +722,8 @@ def test_feature_names_out():
     assert_array_equal([f"pca{i}" for i in range(2)], names)
 
 
-def test_variance_correctness():
+@pytest.mark.parametrize("copy", [True, False])
+def test_variance_correctness(copy):
     """Check the accuracy of PCA's internal variance calculation"""
     rng = np.random.RandomState(0)
     X = rng.randn(1000, 200)

From eb1233124ecc2271792943084169664562932152 Mon Sep 17 00:00:00 2001
From: Micky774
Date: Sat, 26 Feb 2022 21:15:26 -0500
Subject: [PATCH 7/7] Moved calculation out of `extmath.py` directly into
 `_pca.py`

---
 sklearn/decomposition/_pca.py | 11 +++++++++--
 sklearn/utils/extmath.py      | 26 --------------------------
 2 files changed, 9 insertions(+), 28 deletions(-)

diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py
index cb3f1a3583fc1..01cdf5f8ad279 100644
--- a/sklearn/decomposition/_pca.py
+++ b/sklearn/decomposition/_pca.py
@@ -23,7 +23,7 @@
 from ..utils import check_random_state, check_scalar
 from ..utils._arpack import _init_arpack_v0
 from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
-from ..utils.extmath import stable_cumsum, in_place_feature_var
+from ..utils.extmath import stable_cumsum
 from ..utils.validation import check_is_fitted
 
@@ -619,7 +619,14 @@ def _fit_truncated(self, X, n_components, svd_solver):
 
         # Get variance explained by singular values
         self.explained_variance_ = (S**2) / (n_samples - 1)
-        total_var = in_place_feature_var(X).sum()
+
+        # Work around numpy's lack of an in-place variance calculation (at
+        # the time of writing) by reusing X's buffer for the computation.
+        N = X.shape[0] - 1
+        np.square(X, out=X)
+        np.sum(X, axis=0, out=X[0])
+        total_var = (X[0] / N).sum()
+
         self.explained_variance_ratio_ = self.explained_variance_ / total_var
         self.singular_values_ = S.copy()  # Store the singular values.
diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py
index 673212b040bfd..a8ca0f7be5caa 100644
--- a/sklearn/utils/extmath.py
+++ b/sklearn/utils/extmath.py
@@ -898,32 +898,6 @@ def _safe_accumulator_op(op, x, *args, **kwargs):
     return result
 
 
-def in_place_feature_var(X):
-    """
-    Calculate the feature-wise variance of a 2D array in place.
-
-    This function provides an in-place alternative to `np.var` for
-    calculating the feature-wise sample variance of a 2D array. It is
-    equivalent to `np.var(X, ddof=1, axis=0)` but mutates `X`.
-
-    Parameters
-    ----------
-    X : array-like of shape (n_samples, n_features)
-        Data to use for variance calculation; overwritten in the process.
-
-    Returns
-    -------
-    var : ndarray of shape (n_features,)
-        The feature-wise variance.
-    """
-    N = X.shape[0] - 1
-    X -= X.mean(axis=0)
-    np.square(X, out=X)
-    np.sum(X, axis=0, out=X[0])
-    var = X[0] / N
-    return var
-
-
 def _incremental_mean_and_var(
     X, last_mean, last_variance, last_sample_count, sample_weight=None
 ):
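
Taken together, the series leaves `_fit_truncated` computing the total variance with no data-sized temporary. A minimal end-to-end sketch of the PATCH 7 logic, assuming, as appears to hold in `_fit_truncated`, that `X` was already centered before the SVD, which is why the centering step from the deleted helper does not reappear:

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.randn(1000, 200)
    expected = np.var(X, ddof=1, axis=0).sum()

    X -= X.mean(axis=0)  # stands in for the centering PCA performs before the SVD

    # The PATCH 7 workaround: square in place and reduce into the first row,
    # so peak memory stays at a single (n_samples, n_features) buffer.
    N = X.shape[0] - 1
    np.square(X, out=X)
    np.sum(X, axis=0, out=X[0])
    total_var = (X[0] / N).sum()

    np.testing.assert_allclose(total_var, expected)
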