From d55e9af30d7f4a616b1b16cbafab30f337605f65 Mon Sep 17 00:00:00 2001
From: Pierre Ablin
Date: Mon, 20 Aug 2018 17:08:59 +0200
Subject: [PATCH 1/4] update pca

---
 sklearn/decomposition/fastica_.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py
index 7bc78e4e31b8d..6d8fdf51b036d 100644
--- a/sklearn/decomposition/fastica_.py
+++ b/sklearn/decomposition/fastica_.py
@@ -309,10 +309,13 @@ def g(x, fun_args):
         X -= X_mean[:, np.newaxis]
 
         # Whitening and preprocessing by PCA
-        u, d, _ = linalg.svd(X, full_matrices=False)
+        d, u = linalg.eigh(X.dot(X.T))
+
+        eps = np.finfo(float).eps  # For numerical precision
+        d[d < eps] = eps
+
+        K = (u / np.sqrt(d)).T[:n_components]  # see (6.33) p.140
 
-        del _
-        K = (u / d).T[:n_components]  # see (6.33) p.140
         del u, d
         X1 = np.dot(K, X)
         # see (13.6) p.267 Here X1 is white and data

From ba1cec153e2062f8219acd871ebea7d22efbc833 Mon Sep 17 00:00:00 2001
From: Pierre Ablin
Date: Tue, 21 Aug 2018 11:24:09 +0200
Subject: [PATCH 2/4] change algorithm depending on n

---
 sklearn/decomposition/fastica_.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py
index 6d8fdf51b036d..c5967012f2017 100644
--- a/sklearn/decomposition/fastica_.py
+++ b/sklearn/decomposition/fastica_.py
@@ -309,13 +309,15 @@ def g(x, fun_args):
         X -= X_mean[:, np.newaxis]
 
         # Whitening and preprocessing by PCA
-        d, u = linalg.eigh(X.dot(X.T))
-
-        eps = np.finfo(float).eps  # For numerical precision
-        d[d < eps] = eps
-
-        K = (u / np.sqrt(d)).T[:n_components]  # see (6.33) p.140
-
+        if n > p:
+            u, d, _ = linalg.svd(X, full_matrices=False)
+        else:
+            D, u = linalg.eigh(X.dot(X.T))  # Faster when n < p
+            eps = np.finfo(np.double).eps
+            D[D < eps] = eps  # For numerical issues
+            d = np.sqrt(D)
+            del D
+        K = (u / d).T[:n_components]  # see (6.33) p.140
         del u, d
         X1 = np.dot(K, X)
         # see (13.6) p.267 Here X1 is white and data
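For context (not part of the patches): both whitening routes above build a matrix K such that K.dot(X) has identity sample covariance, so replacing svd(X) with eigh(X.dot(X.T)) changes the cost, not the result. A minimal standalone sketch with NumPy/SciPy on a random centered X; the variable names mirror the diff, but none of this is scikit-learn code:

    import numpy as np
    from scipy import linalg

    rng = np.random.RandomState(0)
    n, p = 4, 1000                       # n features (rows), p samples (columns), n < p
    X = rng.randn(n, p)
    X -= X.mean(axis=1)[:, np.newaxis]   # center each row, as fastica_ does

    # Route 1: SVD of X (the path removed in PATCH 1, restored for n > p in PATCH 2)
    u, d, _ = linalg.svd(X, full_matrices=False)
    K_svd = (u / d).T

    # Route 2: eigendecomposition of the small n-by-n matrix X X^T (the new path)
    D, u2 = linalg.eigh(X.dot(X.T))      # eigenvalues D are the squared singular values
    K_eig = (u2 / np.sqrt(D)).T

    for K in (K_svd, K_eig):
        X1 = K.dot(X)
        assert np.allclose(X1.dot(X1.T), np.eye(n), atol=1e-10)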
From 7903b6c45fb11ae9282c6f598ff2b51a0840056a Mon Sep 17 00:00:00 2001
From: Pierre Ablin
Date: Wed, 22 Aug 2018 14:49:24 +0200
Subject: [PATCH 3/4] added a choice between solvers for svd

---
 sklearn/decomposition/fastica_.py           | 31 ++++++++++++++++-----
 sklearn/decomposition/tests/test_fastica.py |  6 ++--
 2 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py
index c5967012f2017..06763688a2c5b 100644
--- a/sklearn/decomposition/fastica_.py
+++ b/sklearn/decomposition/fastica_.py
@@ -151,7 +151,7 @@ def _cube(x, fun_args):
 def fastica(X, n_components=None, algorithm="parallel", whiten=True,
             fun="logcosh", fun_args=None, max_iter=200, tol=1e-04, w_init=None,
             random_state=None, return_X_mean=False, compute_sources=True,
-            return_n_iter=False):
+            return_n_iter=False, svd_solver='svd'):
     """Perform Fast Independent Component Analysis.
 
     Read more in the :ref:`User Guide <ICA>`.
@@ -203,6 +203,11 @@ def my_g(x):
         Initial un-mixing array of dimension (n.comp,n.comp).
         If None (default) then an array of normal r.v.'s is used.
 
+    svd_solver : str, optional
+        The solver to use for whitening. Can either be 'svd' or 'eigh'.
+        'svd' is more stable numerically if the problem is degenerate.
+        'eigh' is generally faster.
+
     random_state : int, RandomState instance or None, optional (default=None)
         If int, random_state is the seed used by the random number generator;
         If RandomState instance, random_state is the random number generator;
@@ -309,14 +314,19 @@ def g(x, fun_args):
         X -= X_mean[:, np.newaxis]
 
         # Whitening and preprocessing by PCA
-        if n > p:
-            u, d, _ = linalg.svd(X, full_matrices=False)
-        else:
+        if svd_solver == 'eigh' and n < p:
             D, u = linalg.eigh(X.dot(X.T))  # Faster when n < p
             eps = np.finfo(np.double).eps
-            D[D < eps] = eps  # For numerical issues
+            degenerate_idx = D < eps
+            if np.any(degenerate_idx):
+                warnings.warn('There are some small singular values, using '
+                              'svd_solver = \'svd\' might lead to more '
+                              'accurate results.')
+            D[degenerate_idx] = eps  # For numerical issues
             d = np.sqrt(D)
             del D
+        else:
+            u, d, _ = linalg.svd(X, full_matrices=False)
         K = (u / d).T[:n_components]  # see (6.33) p.140
         del u, d
         X1 = np.dot(K, X)
@@ -428,6 +438,11 @@ def my_g(x):
     w_init : None of an (n_components, n_components) ndarray
         The mixing matrix to be used to initialize the algorithm.
 
+    svd_solver : str, optional
+        The solver to use for whitening. Can either be 'svd' or 'eigh'.
+        'svd' is more stable numerically if the problem is degenerate.
+        'eigh' is generally faster.
+
     random_state : int, RandomState instance or None, optional (default=None)
         If int, random_state is the seed used by the random number generator;
         If RandomState instance, random_state is the random number generator;
@@ -457,7 +472,7 @@ def my_g(x):
     """
     def __init__(self, n_components=None, algorithm='parallel', whiten=True,
                  fun='logcosh', fun_args=None, max_iter=200, tol=1e-4,
-                 w_init=None, random_state=None):
+                 w_init=None, svd_solver='svd', random_state=None):
         super(FastICA, self).__init__()
         self.n_components = n_components
         self.algorithm = algorithm
@@ -468,6 +483,7 @@ def __init__(self, n_components=None, algorithm='parallel', whiten=True,
         self.tol = tol
         self.w_init = w_init
         self.random_state = random_state
+        self.svd_solver = svd_solver
 
     def _fit(self, X, compute_sources=False):
         """Fit the model
@@ -492,7 +508,8 @@ def _fit(self, X, compute_sources=False):
             whiten=self.whiten, fun=self.fun, fun_args=fun_args,
             max_iter=self.max_iter, tol=self.tol, w_init=self.w_init,
             random_state=self.random_state, return_X_mean=True,
-            compute_sources=compute_sources, return_n_iter=True)
+            compute_sources=compute_sources, return_n_iter=True,
+            svd_solver=self.svd_solver)
 
         if self.whiten:
             self.components_ = np.dot(unmixing, whitening)
diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py
index 591c4a7615b22..013c24a7743f9 100644
--- a/sklearn/decomposition/tests/test_fastica.py
+++ b/sklearn/decomposition/tests/test_fastica.py
@@ -115,8 +115,10 @@ def g_test(x):
     assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1)
 
     # Test FastICA class
-    _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo, random_state=0)
-    ica = FastICA(fun=nl, algorithm=algo, random_state=0)
+    _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo, random_state=0,
+                                svd_solver='eigh')
+    ica = FastICA(fun=nl, algorithm=algo, random_state=0,
+                  svd_solver='eigh')
     sources = ica.fit_transform(m.T)
     assert_equal(ica.components_.shape, (2, 2))
     assert_equal(sources.shape, (1000, 2))
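For illustration only (this assumes a build with the patches above applied; `svd_solver` is not part of a released scikit-learn at this point), the new keyword is passed like any other FastICA parameter. With 2 features and 1000 samples we have n < p, so 'eigh' actually takes the eigendecomposition path:

    import numpy as np
    from sklearn.decomposition import FastICA

    rng = np.random.RandomState(42)
    S = rng.laplace(size=(1000, 2))          # two independent non-Gaussian sources
    A = np.array([[1.0, 0.5], [0.5, 1.0]])   # mixing matrix
    X = S.dot(A.T)                           # observed mixtures, shape (n_samples, n_features)

    ica = FastICA(n_components=2, svd_solver='eigh', random_state=0)
    S_est = ica.fit_transform(X)             # whitening uses eigh since n_features < n_samples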
From c5272ece3574ea48eee75d43c749fc8f6d697849 Mon Sep 17 00:00:00 2001
From: Pierre Ablin
Date: Wed, 22 Aug 2018 15:40:51 +0200
Subject: [PATCH 4/4] fix docstring

---
 sklearn/decomposition/fastica_.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py
index 06763688a2c5b..fdd3c82205e82 100644
--- a/sklearn/decomposition/fastica_.py
+++ b/sklearn/decomposition/fastica_.py
@@ -150,8 +150,8 @@ def _cube(x, fun_args):
 
 def fastica(X, n_components=None, algorithm="parallel", whiten=True,
             fun="logcosh", fun_args=None, max_iter=200, tol=1e-04, w_init=None,
-            random_state=None, return_X_mean=False, compute_sources=True,
-            return_n_iter=False, svd_solver='svd'):
+            svd_solver='svd', random_state=None, return_X_mean=False,
+            compute_sources=True, return_n_iter=False):
     """Perform Fast Independent Component Analysis.
 
     Read more in the :ref:`User Guide <ICA>`.
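To back the docstring claim that 'eigh' is generally faster, a rough timing sketch for the case with many more samples than features (numbers depend on the BLAS/LAPACK build; this compares only the two decompositions, not a full FastICA fit):

    import time
    import numpy as np
    from scipy import linalg

    rng = np.random.RandomState(0)
    X = rng.randn(64, 100000)           # 64 features, 100000 samples (n << p)

    t0 = time.time()
    linalg.svd(X, full_matrices=False)
    print('svd  of X     : %.2f s' % (time.time() - t0))

    t0 = time.time()
    linalg.eigh(X.dot(X.T))             # decomposes a 64 x 64 matrix instead
    print('eigh of X X^T : %.2f s' % (time.time() - t0))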