From d55e9af30d7f4a616b1b16cbafab30f337605f65 Mon Sep 17 00:00:00 2001 From: Pierre Ablin Date: Mon, 20 Aug 2018 17:08:59 +0200 Subject: [PATCH 01/55] update pca --- sklearn/decomposition/fastica_.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index 7bc78e4e31b8d..6d8fdf51b036d 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -309,10 +309,13 @@ def g(x, fun_args): X -= X_mean[:, np.newaxis] # Whitening and preprocessing by PCA - u, d, _ = linalg.svd(X, full_matrices=False) + d, u = linalg.eigh(X.dot(X.T)) + + eps = np.finfo(float).eps # For numerical precision + d[d < eps] = eps + + K = (u / np.sqrt(d)).T[:n_components] # see (6.33) p.140 - del _ - K = (u / d).T[:n_components] # see (6.33) p.140 del u, d X1 = np.dot(K, X) # see (13.6) p.267 Here X1 is white and data From ba1cec153e2062f8219acd871ebea7d22efbc833 Mon Sep 17 00:00:00 2001 From: Pierre Ablin Date: Tue, 21 Aug 2018 11:24:09 +0200 Subject: [PATCH 02/55] change algorithm depending on n --- sklearn/decomposition/fastica_.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index 6d8fdf51b036d..c5967012f2017 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -309,13 +309,15 @@ def g(x, fun_args): X -= X_mean[:, np.newaxis] # Whitening and preprocessing by PCA - d, u = linalg.eigh(X.dot(X.T)) - - eps = np.finfo(float).eps # For numerical precision - d[d < eps] = eps - - K = (u / np.sqrt(d)).T[:n_components] # see (6.33) p.140 - + if n > p: + u, d, _ = linalg.svd(X, full_matrices=False) + else: + D, u = linalg.eigh(X.dot(X.T)) # Faster when n < p + eps = np.finfo(np.double).eps + D[D < eps] = eps # For numerical issues + d = np.sqrt(D) + del D + K = (u / d).T[:n_components] # see (6.33) p.140 del u, d X1 = np.dot(K, X) # see (13.6) p.267 Here X1 is white and data From 7903b6c45fb11ae9282c6f598ff2b51a0840056a Mon Sep 17 00:00:00 2001 From: Pierre Ablin Date: Wed, 22 Aug 2018 14:49:24 +0200 Subject: [PATCH 03/55] added a choice between solvers for svd --- sklearn/decomposition/fastica_.py | 31 ++++++++++++++++----- sklearn/decomposition/tests/test_fastica.py | 6 ++-- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index c5967012f2017..06763688a2c5b 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -151,7 +151,7 @@ def _cube(x, fun_args): def fastica(X, n_components=None, algorithm="parallel", whiten=True, fun="logcosh", fun_args=None, max_iter=200, tol=1e-04, w_init=None, random_state=None, return_X_mean=False, compute_sources=True, - return_n_iter=False): + return_n_iter=False, svd_solver='svd'): """Perform Fast Independent Component Analysis. Read more in the :ref:`User Guide `. @@ -203,6 +203,11 @@ def my_g(x): Initial un-mixing array of dimension (n.comp,n.comp). If None (default) then an array of normal r.v.'s is used. + svd_solver : str, optional + The solver to use for whitening. Can either be 'svd' or 'eigh'. + 'svd' is more stable numerically if the problem is degenerate. + 'eigh' is generally faster. + random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -309,14 +314,19 @@ def g(x, fun_args): X -= X_mean[:, np.newaxis] # Whitening and preprocessing by PCA - if n > p: - u, d, _ = linalg.svd(X, full_matrices=False) - else: + if svd_solver == 'eigh' and n < p: D, u = linalg.eigh(X.dot(X.T)) # Faster when n < p eps = np.finfo(np.double).eps - D[D < eps] = eps # For numerical issues + degenerate_idx = D < eps + if np.any(degenerate_idx): + warnings.warn('There are some small singular values, using ' + 'svd_solver = \'svd\' might lead to more ' + 'accurate results.') + D[degenerate_idx] = eps # For numerical issues d = np.sqrt(D) del D + else: + u, d, _ = linalg.svd(X, full_matrices=False) K = (u / d).T[:n_components] # see (6.33) p.140 del u, d X1 = np.dot(K, X) @@ -428,6 +438,11 @@ def my_g(x): w_init : None of an (n_components, n_components) ndarray The mixing matrix to be used to initialize the algorithm. + svd_solver : str, optional + The solver to use for whitening. Can either be 'svd' or 'eigh'. + 'svd' is more stable numerically if the problem is degenerate. + 'eigh' is generally faster. + random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; @@ -457,7 +472,7 @@ def my_g(x): """ def __init__(self, n_components=None, algorithm='parallel', whiten=True, fun='logcosh', fun_args=None, max_iter=200, tol=1e-4, - w_init=None, random_state=None): + w_init=None, svd_solver='svd', random_state=None): super(FastICA, self).__init__() self.n_components = n_components self.algorithm = algorithm @@ -468,6 +483,7 @@ def __init__(self, n_components=None, algorithm='parallel', whiten=True, self.tol = tol self.w_init = w_init self.random_state = random_state + self.svd_solver = svd_solver def _fit(self, X, compute_sources=False): """Fit the model @@ -492,7 +508,8 @@ def _fit(self, X, compute_sources=False): whiten=self.whiten, fun=self.fun, fun_args=fun_args, max_iter=self.max_iter, tol=self.tol, w_init=self.w_init, random_state=self.random_state, return_X_mean=True, - compute_sources=compute_sources, return_n_iter=True) + compute_sources=compute_sources, return_n_iter=True, + svd_solver=self.svd_solver) if self.whiten: self.components_ = np.dot(unmixing, whitening) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 591c4a7615b22..013c24a7743f9 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -115,8 +115,10 @@ def g_test(x): assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1) # Test FastICA class - _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo, random_state=0) - ica = FastICA(fun=nl, algorithm=algo, random_state=0) + _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo, random_state=0, + svd_solver='eigh') + ica = FastICA(fun=nl, algorithm=algo, random_state=0, + svd_solver='eigh') sources = ica.fit_transform(m.T) assert_equal(ica.components_.shape, (2, 2)) assert_equal(sources.shape, (1000, 2)) From c5272ece3574ea48eee75d43c749fc8f6d697849 Mon Sep 17 00:00:00 2001 From: Pierre Ablin Date: Wed, 22 Aug 2018 15:40:51 +0200 Subject: [PATCH 04/55] fix docstring --- sklearn/decomposition/fastica_.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/fastica_.py b/sklearn/decomposition/fastica_.py index 06763688a2c5b..fdd3c82205e82 100644 --- a/sklearn/decomposition/fastica_.py +++ b/sklearn/decomposition/fastica_.py @@ -150,8 +150,8 @@ def _cube(x, fun_args): def fastica(X, n_components=None, algorithm="parallel", whiten=True, fun="logcosh", fun_args=None, max_iter=200, tol=1e-04, w_init=None, - random_state=None, return_X_mean=False, compute_sources=True, - return_n_iter=False, svd_solver='svd'): + svd_solver='svd', random_state=None, return_X_mean=False, + compute_sources=True, return_n_iter=False): """Perform Fast Independent Component Analysis. Read more in the :ref:`User Guide `. From ff167f9eabef2d1728bdae498be535aff17bf44a Mon Sep 17 00:00:00 2001 From: Micky774 Date: Thu, 17 Feb 2022 19:28:18 -0500 Subject: [PATCH 05/55] Added debugging statements --- sklearn/decomposition/_fastica.py | 5 ++-- sklearn/decomposition/tests/test_fastica.py | 33 +++++++++++---------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 2c1ba93f795ab..9d08dab760e64 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -575,12 +575,13 @@ def g(x, fun_args): d = np.sqrt(D) del D else: - u, d, _ = linalg.svd(X, full_matrices=False) + u, d, _ = linalg.svd(X, full_matrices=False, check_finite=False) del _ K = (u / d).T[:n_components] # see (6.33) p.140 del u, d - X1 = np.dot(K, XT) + print(f"DEBUG *** {X.shape=}|{XT.shape=}|{K.shape=}|{self.svd_solver=}") + X1 = np.matmul(K, XT) # see (13.6) p.267 Here X1 is white and data # in X has been projected onto a subspace by PCA X1 *= np.sqrt(n_samples) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 368770f1b3344..627e1510cfb72 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -119,26 +119,27 @@ def g_test(x): assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1) # Test FastICA class - _, _, sources_fun = fastica( - m.T, fun=nl, algorithm=algo, random_state=0, svd_solver="eigh" - ) - ica = FastICA(fun=nl, algorithm=algo, random_state=0, svd_solver="eigh") - sources = ica.fit_transform(m.T) - assert ica.components_.shape == (2, 2) - assert sources.shape == (1000, 2) + for solver in ("eigh", "svd"): + _, _, sources_fun = fastica( + m.T, fun=nl, algorithm=algo, random_state=0, svd_solver=solver + ) + ica = FastICA(fun=nl, algorithm=algo, random_state=0, svd_solver=solver) + sources = ica.fit_transform(m.T) + assert ica.components_.shape == (2, 2) + assert sources.shape == (1000, 2) - assert_array_almost_equal(sources_fun, sources) - assert_array_almost_equal(sources, ica.transform(m.T)) + assert_array_almost_equal(sources_fun, sources) + assert_array_almost_equal(sources, ica.transform(m.T)) - assert ica.mixing_.shape == (2, 2) + assert ica.mixing_.shape == (2, 2) - for fn in [np.tanh, "exp(-.5(x^2))"]: - ica = FastICA(fun=fn, algorithm=algo) - with pytest.raises(ValueError): - ica.fit(m.T) + for fn in [np.tanh, "exp(-.5(x^2))"]: + ica = FastICA(fun=fn, algorithm=algo) + with pytest.raises(ValueError): + ica.fit(m.T) - with pytest.raises(TypeError): - FastICA(fun=range(10)).fit(m.T) + with pytest.raises(TypeError): + FastICA(fun=range(10)).fit(m.T) def test_fastica_nowhiten(): From 8f923a22a8295f6f7ff5eb49e1eb7d5bae0f3280 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Thu, 17 Feb 2022 20:36:23 -0500 Subject: [PATCH 06/55] Slightly improved test coverage and corrected implementation - Implementation was changed to account for the fact that now `X` is transposed and accessed via `XT` --- sklearn/decomposition/_fastica.py | 12 +++++++----- sklearn/decomposition/tests/test_fastica.py | 7 +++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 9d08dab760e64..dfbc9306e6abe 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -562,7 +562,9 @@ def g(x, fun_args): # Whitening and preprocessing by PCA if self.svd_solver == "eigh": - D, u = linalg.eigh(X.dot(X.T)) # Faster when n < p + D, u = linalg.eigh(X.T.dot(X)) # Faster when n < p + idx = D.argsort()[::-1] + D, u = D[idx], u[idx] eps = np.finfo(np.double).eps degenerate_idx = D < eps if np.any(degenerate_idx): @@ -575,13 +577,13 @@ def g(x, fun_args): d = np.sqrt(D) del D else: - u, d, _ = linalg.svd(X, full_matrices=False, check_finite=False) + u, d = linalg.svd(XT, full_matrices=False, check_finite=False)[:2] + idx = d.argsort()[::-1] + d, u = d[idx], u[idx] - del _ K = (u / d).T[:n_components] # see (6.33) p.140 del u, d - print(f"DEBUG *** {X.shape=}|{XT.shape=}|{K.shape=}|{self.svd_solver=}") - X1 = np.matmul(K, XT) + X1 = np.dot(K, XT) # see (13.6) p.267 Here X1 is white and data # in X has been projected onto a subspace by PCA X1 *= np.sqrt(n_samples) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 627e1510cfb72..2a615ff8b5f2a 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -119,12 +119,14 @@ def g_test(x): assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1) # Test FastICA class + outs = {} for solver in ("eigh", "svd"): _, _, sources_fun = fastica( m.T, fun=nl, algorithm=algo, random_state=0, svd_solver=solver ) ica = FastICA(fun=nl, algorithm=algo, random_state=0, svd_solver=solver) sources = ica.fit_transform(m.T) + outs[solver] = sources assert ica.components_.shape == (2, 2) assert sources.shape == (1000, 2) @@ -134,12 +136,13 @@ def g_test(x): assert ica.mixing_.shape == (2, 2) for fn in [np.tanh, "exp(-.5(x^2))"]: - ica = FastICA(fun=fn, algorithm=algo) + ica = FastICA(fun=fn, algorithm=algo, svd_solver=solver) with pytest.raises(ValueError): ica.fit(m.T) with pytest.raises(TypeError): - FastICA(fun=range(10)).fit(m.T) + FastICA(fun=range(10), svd_solver=solver).fit(m.T) + # assert_array_almost_equal(outs["eigh"], outs["svd"]) def test_fastica_nowhiten(): From 2cd481a3e2b2ed435cc1a463fe9d918852ea8141 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Thu, 17 Feb 2022 21:05:24 -0500 Subject: [PATCH 07/55] Added to changelong as re-enabled failing test --- doc/whats_new/v1.1.rst | 7 +++++++ sklearn/decomposition/tests/test_fastica.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 91aa2a2859fd8..cfd30541c53c0 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -279,6 +279,13 @@ Changelog :pr:`22300` by :user:`Meekail Zain ` and :pr:`15948` by :user:`sysuresh`. +- |Enhancement| :class:`decomposition.FastICA` now allows the user to select + how whitening is performed through the new `svd_solver` parameter, which + supports `svd` and `eigh`. `svd_solver` defaults to `svd` although `eigh` may + be faster in cases where `num_features > num_samples` :pr:`11860` by + :user:`Pierre Ablin ` and :pr:`22527` by + :user:`Meekail Zain `. + :mod:`sklearn.discriminant_analysis` .................................... diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 2a615ff8b5f2a..34222a220ee05 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -142,7 +142,7 @@ def g_test(x): with pytest.raises(TypeError): FastICA(fun=range(10), svd_solver=solver).fit(m.T) - # assert_array_almost_equal(outs["eigh"], outs["svd"]) + assert_array_almost_equal(outs["eigh"], outs["svd"]) def test_fastica_nowhiten(): From a8ceb9e20a6b9277de79c15b50f7f1678496ce89 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Thu, 17 Feb 2022 21:13:57 -0500 Subject: [PATCH 08/55] Removed old debugging code --- sklearn/decomposition/_fastica.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index dfbc9306e6abe..f9353df2e6b65 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -563,8 +563,6 @@ def g(x, fun_args): # Whitening and preprocessing by PCA if self.svd_solver == "eigh": D, u = linalg.eigh(X.T.dot(X)) # Faster when n < p - idx = D.argsort()[::-1] - D, u = D[idx], u[idx] eps = np.finfo(np.double).eps degenerate_idx = D < eps if np.any(degenerate_idx): @@ -578,8 +576,6 @@ def g(x, fun_args): del D else: u, d = linalg.svd(XT, full_matrices=False, check_finite=False)[:2] - idx = d.argsort()[::-1] - d, u = d[idx], u[idx] K = (u / d).T[:n_components] # see (6.33) p.140 del u, d From 0bc73433c0d02f3f7db519dcb17f49d518bb0392 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 18 Feb 2022 00:44:18 -0500 Subject: [PATCH 09/55] Added temporary benchmark file --- bench.py | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ saved_df | Bin 0 -> 6382 bytes 2 files changed, 80 insertions(+) create mode 100644 bench.py create mode 100644 saved_df diff --git a/bench.py b/bench.py new file mode 100644 index 0000000000000..ed637a4c42940 --- /dev/null +++ b/bench.py @@ -0,0 +1,80 @@ +import time +from sklearn.decomposition import FastICA +import numpy as np +import pandas as pd +import streamlit as st +import altair as alt +import argparse + +parser = argparse.ArgumentParser( + description="Determine whether to save/load the dataframe." +) +parser.add_argument( + "--save", + type=str, + default="", + metavar="s", + help="Saves the dataframe to the path if provided", +) +parser.add_argument( + "--load", + type=str, + default="", + metavar="l", + help="Loads the dataframe at the path if provided", +) +args = parser.parse_args() + + +def main() -> None: + + df = None + if args.load: + df = on_load(args.load) + else: + X_shapes = [] + for i in range(10): + X_shapes.extend( + (10 * int(10 ** (i / 3)), 10 * int(10 ** (j / 3))) for j in range(10) + ) + + solvers = ("svd", "eigh") + transformers = { + s: FastICA(n_components=7, random_state=0, svd_solver=s) for s in solvers + } + data = [] + for shape in X_shapes: + X = np.random.rand(*shape) + for s in transformers: + start = time.time() + transformers[s].fit_transform(X) + data.append( + {"shape": str(shape), "solver": s, "time": time.time() - start} + ) + df = pd.DataFrame(data) + + if args.save and not args.load: + df.to_pickle(args.save) + print(f"Dataframe saved to {args.save}") + + chart = ( + alt.Chart(df, width=300) + .mark_bar() + .encode(x="solver", y="time", column="shape") + .properties(title="time by shape") + ).resolve_scale(y="independent") + + st.altair_chart(chart) + + pt = pd.pivot_table(df, values="time", index=["shape"], columns=["solver"]) + ratio_df = pd.DataFrame(pt["eigh"] / pt["svd"], columns=["eigh/svd time"]) + st.table(ratio_df.sort_values(by="eigh/svd time")) + + +def on_load(pth: str) -> pd.DataFrame: + print(f"Dataframe loaded from {args.load}") + return pd.read_pickle(pth) + + +if __name__ == "__main__": + main() diff --git a/saved_df b/saved_df new file mode 100644 index 0000000000000000000000000000000000000000..e2c6196ac3fc3a0595171ed928272b707e3894f1 GIT binary patch literal 6382 zcmeHMdvH`&8NYdwcTJ>$#kz1bu>FgK%_D-WGZ_hJ(P&t7 z&Wg!#EhLA*Uz3~357Wia(ulIthd>POH#eCswQA{Z(^2Ljlt$_jSgjJ^o~8rxF*FT~E1-DOvcNia)UiY{zK>t+ZQ1t9QlFy7yVQgZhVe zksVaYjxgR!>rBlL)&qX@*F>Q6!F!U zX#5AuC_Y!`F#0Rz{0pu7F`WPKpDrxMc+_(*g8xix-J^)5?_xeu&Gk2M{N%rBz1V@5 z5MQ~U_@vDgcWo>A^`oTU_{6g~PTEiU#pXVTcz7@A=jMIc)0#s6rDYgz>W;<_5sz#{ z{_#uiQGCO%BY*eT>nOes&eJ$|7RL{r{U*&jPI2jb4%8@@RLc6db{#W%g}RlxHuH&T45P94YTJI|nvF-Ob z|A+7W2CUxy<;!5*Z@og-$Jg$h0-U;b6XQ!=+k6f1xr=8gzR7c>-_WU(;FomX7jq@) zRn2*i9j7?neU0>JyGVL`f6Exo`{P{lzwdho@x=4L0IQ#V`EQ5^w@e^@eyZ3l{He8v zuLF)PX`2nWW62Xs!0M+TSO9i(UBMQxlKURg?TzEKZ~7;wZpODBL*D8mPoo`6(eLco)MpFsbrEBCDi z{P;&V>cC!q^=cc~{`2eU{@2EJePRpM^*HP3>U$2?OD*~l+Lq7$60Ns}_HFDdSa15m zGZat$LzkdmKep>1Xit0#jBHwV_!x4-)V_}c~^ZvyNd--tS@Wp8f;JaqZzpAt3| zUh|C@;Kld!ZgrbR5k7B(4QhQc1s-GhovX~l1Bc)Nxz;aOhSIgC^^dnE5Z9VPo;zy=0 z9P%gX<#=4~6C^c0U#zd!myEJ zj4hlZ20u8u^-Gmr2t;kN`r0^Gd=oHcCA4TZNjwa^}Ue1}O0b41vrQ24{NWwYeRL(iw zF0O2{C&H z>Cy%nQ46R=j0!hBCt?Y>FB3WH8!0iuiZmz0m<3B+A|}4bbF?{k9OG=W4(Mkcn1^(j z34{mt6nip&r33a@Iv@~Yz$8(-6r&T_nPXojvUOle*1={W9WtP(+Lp8qGuoUx`kigo z0sX84&!uc<0w1L60Q_x z2I_1q?#SXfV{4NGupHEr0azRAVQnk|$}tzR)L=ra7PuN93QHR&s<0zuF>F$j%HqKr zr!^RyB3ldE+E~OGZKZ>mv`JF~7B_1lqODC1z;aMe24HQdhqbW?fE%s?;&viS8zy9J zn1t)biJp*HR~hp|?AZ=QynpT%udIIc zK93|`?f=(XyV|HWJKm~S;ajx&8OPhRx<*|Kufl4p^F??AzT_4pb_B(tmlEny&F*D+ zlGI%Xe{Q#KYjC>`{;#eD=Y1tloCnzVl|v@{+MVb85g&>Km0l%L@0SzecX*i(&dN7{ zPyDFvULgv@D6$})-3P^azB%(YQy}g+bekzksIoZEFp3k=@D6cmZWL-E@g)Fa5-0BP zQNWvolZoz?Kt4ZLEO9vkhXQ4rz&cj%UN5G2#ZiYQ&i=(uexpzvAjC}17&H{o$U|lY z=*3qCan>Myq(Q*e#|U4KqRE)SHgRGiYnu4XAU*^X`#QsN7dGe0;c#?k!WT@&#CeLW RDZa4mSHuU0fke_@|6k|j5qtmu literal 0 HcmV?d00001 From 43c1e86347a8af926ba8e95bac24681ebe103319 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Fri, 18 Feb 2022 18:41:45 -0500 Subject: [PATCH 10/55] Update sklearn/decomposition/_fastica.py Co-authored-by: Thomas J. Fan --- sklearn/decomposition/_fastica.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index f9353df2e6b65..85bb2fd05ffef 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -375,7 +375,7 @@ def my_g(x): w_init : ndarray of shape (n_components, n_components), default=None The mixing matrix to be used to initialize the algorithm. - svd_solver : str, default='svd' + whiten_solver : str, default='svd' The solver to use for whitening. Can either be 'svd' or 'eigh'. 'svd' is more stable numerically if the problem is degenerate. 'eigh' is generally faster. From fcd25428c86bebd7a0b661767ae61528ca5f4472 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Fri, 18 Feb 2022 18:44:27 -0500 Subject: [PATCH 11/55] Updated benchmark file to use csv instead of pickle --- bench.py | 4 ++-- sklearn/decomposition/_fastica.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bench.py b/bench.py index ed637a4c42940..5c6b89b52b26e 100644 --- a/bench.py +++ b/bench.py @@ -54,7 +54,7 @@ def main() -> None: df = pd.DataFrame(data) if args.save and not args.load: - df.to_pickle(args.save) + df.to_csv(args.save) print(f"Dataframe saved to {args.save}") chart = ( @@ -73,7 +73,7 @@ def main() -> None: def on_load(pth: str) -> pd.DataFrame: print(f"Dataframe loaded from {args.load}") - return pd.read_pickle(pth) + return pd.read_csv(pth) if __name__ == "__main__": diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 85bb2fd05ffef..2f7103c9ef7fd 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -562,7 +562,8 @@ def g(x, fun_args): # Whitening and preprocessing by PCA if self.svd_solver == "eigh": - D, u = linalg.eigh(X.T.dot(X)) # Faster when n < p + # Faster when num_samples >> n_features + D, u = linalg.eigh(X.T.dot(X)) eps = np.finfo(np.double).eps degenerate_idx = D < eps if np.any(degenerate_idx): From 1616deb4292133d02afd34235884c541274acb8d Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 19 Feb 2022 18:43:31 -0500 Subject: [PATCH 12/55] Minor benchmark generator update --- bench.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bench.py b/bench.py index ed637a4c42940..2de228730b4f2 100644 --- a/bench.py +++ b/bench.py @@ -42,19 +42,23 @@ def main() -> None: transformers = { s: FastICA(n_components=7, random_state=0, svd_solver=s) for s in solvers } + total_reps = len(solvers) * len(X_shapes) + count = 0 data = [] for shape in X_shapes: X = np.random.rand(*shape) for s in transformers: + count += 1 start = time.time() transformers[s].fit_transform(X) + print(f"Progress: {count}/{total_reps}") data.append( - {"shape": str(shape), "solver": s, "time": time.time() - start} + {"shape": str(shape), "solver": str(s), "time": time.time() - start} ) df = pd.DataFrame(data) if args.save and not args.load: - df.to_pickle(args.save) + df.to_csv(args.save) print(f"Dataframe saved to {args.save}") chart = ( @@ -73,7 +77,7 @@ def main() -> None: def on_load(pth: str) -> pd.DataFrame: print(f"Dataframe loaded from {args.load}") - return pd.read_pickle(pth) + return pd.read_csv(pth) if __name__ == "__main__": From 93e137590b8b265fa9f407bd698eeaeffdf8be37 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 20 Feb 2022 17:37:52 -0500 Subject: [PATCH 13/55] Updated benchmark file and added csv --- bench.py | 24 +++--- benchmark_df.csv | 201 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 216 insertions(+), 9 deletions(-) create mode 100644 benchmark_df.csv diff --git a/bench.py b/bench.py index 2de228730b4f2..636429ff72fea 100644 --- a/bench.py +++ b/bench.py @@ -47,18 +47,24 @@ def main() -> None: data = [] for shape in X_shapes: X = np.random.rand(*shape) - for s in transformers: - count += 1 - start = time.time() - transformers[s].fit_transform(X) - print(f"Progress: {count}/{total_reps}") - data.append( - {"shape": str(shape), "solver": str(s), "time": time.time() - start} - ) + + count += 1 + start = time.time() + transformers["svd"].fit_transform(X) + print(f"Progress: {count}/{total_reps}") + svd_time = time.time() - start + + count += 1 + start = time.time() + transformers["eigh"].fit_transform(X) + print(f"Progress: {count}/{total_reps}") + eigh_time = time.time() - start + + data.append({"shape": str(shape), "svd": svd_time, "eigh": eigh_time}) df = pd.DataFrame(data) if args.save and not args.load: - df.to_csv(args.save) + df.to_csv(args.save, index=False) print(f"Dataframe saved to {args.save}") chart = ( diff --git a/benchmark_df.csv b/benchmark_df.csv new file mode 100644 index 0000000000000..5e4c69e72cb09 --- /dev/null +++ b/benchmark_df.csv @@ -0,0 +1,201 @@ +,shape,solver,time +0,"(10, 10)",svd,0.016014575958251953 +1,"(10, 10)",eigh,0.0070073604583740234 +2,"(10, 20)",svd,0.007004976272583008 +3,"(10, 20)",eigh,0.0020051002502441406 +4,"(10, 40)",svd,0.0070037841796875 +5,"(10, 40)",eigh,0.0020029544830322266 +6,"(10, 100)",svd,0.006005048751831055 +7,"(10, 100)",eigh,0.003001689910888672 +8,"(10, 210)",svd,0.009008407592773438 +9,"(10, 210)",eigh,0.008006811141967773 +10,"(10, 460)",svd,0.0050046443939208984 +11,"(10, 460)",eigh,0.021019697189331055 +12,"(10, 1000)",svd,0.0070056915283203125 +13,"(10, 1000)",eigh,0.1511387825012207 +14,"(10, 2150)",svd,0.05505013465881348 +15,"(10, 2150)",eigh,1.114013671875 +16,"(10, 4640)",svd,0.03903484344482422 +17,"(10, 4640)",eigh,9.652104139328003 +18,"(10, 10000)",svd,0.019017934799194336 +19,"(10, 10000)",eigh,89.20785236358643 +20,"(20, 10)",svd,0.0570528507232666 +21,"(20, 10)",eigh,0.09708738327026367 +22,"(20, 20)",svd,0.059058189392089844 +23,"(20, 20)",eigh,0.057051897048950195 +24,"(20, 40)",svd,0.03963637351989746 +25,"(20, 40)",eigh,0.002001523971557617 +26,"(20, 100)",svd,0.05606436729431152 +27,"(20, 100)",eigh,0.0019884109497070312 +28,"(20, 210)",svd,0.05614113807678223 +29,"(20, 210)",eigh,0.004004240036010742 +30,"(20, 460)",svd,0.061258554458618164 +31,"(20, 460)",eigh,0.018904447555541992 +32,"(20, 1000)",svd,0.018016576766967773 +33,"(20, 1000)",eigh,0.12298727035522461 +34,"(20, 2150)",svd,0.014985322952270508 +35,"(20, 2150)",eigh,0.9478616714477539 +36,"(20, 4640)",svd,0.05555558204650879 +37,"(20, 4640)",eigh,10.271839380264282 +38,"(20, 10000)",svd,0.022020816802978516 +39,"(20, 10000)",eigh,98.05129623413086 +40,"(40, 10)",svd,0.031028032302856445 +41,"(40, 10)",eigh,0.10209417343139648 +42,"(40, 20)",svd,0.01901531219482422 +43,"(40, 20)",eigh,0.06105518341064453 +44,"(40, 40)",svd,0.0620570182800293 +45,"(40, 40)",eigh,0.06505942344665527 +46,"(40, 100)",svd,0.00700688362121582 +47,"(40, 100)",eigh,0.003003358840942383 +48,"(40, 210)",svd,0.06605911254882812 +49,"(40, 210)",eigh,0.005003929138183594 +50,"(40, 460)",svd,0.06605982780456543 +51,"(40, 460)",eigh,0.020018339157104492 +52,"(40, 1000)",svd,0.06406140327453613 +53,"(40, 1000)",eigh,0.13011837005615234 +54,"(40, 2150)",svd,0.018017053604125977 +55,"(40, 2150)",eigh,1.122772216796875 +56,"(40, 4640)",svd,0.04704141616821289 +57,"(40, 4640)",eigh,9.632064819335938 +58,"(40, 10000)",svd,0.07506847381591797 +59,"(40, 10000)",eigh,95.49031686782837 +60,"(100, 10)",svd,0.029026269912719727 +61,"(100, 10)",eigh,0.047041893005371094 +62,"(100, 20)",svd,0.05805349349975586 +63,"(100, 20)",eigh,0.06005430221557617 +64,"(100, 40)",svd,0.016014575958251953 +65,"(100, 40)",eigh,0.0570521354675293 +66,"(100, 100)",svd,0.06505918502807617 +67,"(100, 100)",eigh,0.017016172409057617 +68,"(100, 210)",svd,0.030025959014892578 +69,"(100, 210)",eigh,0.00600743293762207 +70,"(100, 460)",svd,0.06806063652038574 +71,"(100, 460)",eigh,0.021019697189331055 +72,"(100, 1000)",svd,0.07506847381591797 +73,"(100, 1000)",eigh,0.12311196327209473 +74,"(100, 2150)",svd,0.06906342506408691 +75,"(100, 2150)",eigh,1.0169248580932617 +76,"(100, 4640)",svd,0.0770723819732666 +77,"(100, 4640)",eigh,9.727842092514038 +78,"(100, 10000)",svd,0.10009002685546875 +79,"(100, 10000)",eigh,90.43684029579163 +80,"(210, 10)",svd,0.006005525588989258 +81,"(210, 10)",eigh,0.09508752822875977 +82,"(210, 20)",svd,0.07206559181213379 +83,"(210, 20)",eigh,0.06305646896362305 +84,"(210, 40)",svd,0.06505990028381348 +85,"(210, 40)",eigh,0.06806135177612305 +86,"(210, 100)",svd,0.06706047058105469 +87,"(210, 100)",eigh,0.06405878067016602 +88,"(210, 210)",svd,0.07506799697875977 +89,"(210, 210)",eigh,0.06906247138977051 +90,"(210, 460)",svd,0.0790719985961914 +91,"(210, 460)",eigh,0.02302098274230957 +92,"(210, 1000)",svd,0.06306076049804688 +93,"(210, 1000)",eigh,0.14012384414672852 +94,"(210, 2150)",svd,0.1141045093536377 +95,"(210, 2150)",eigh,1.043950080871582 +96,"(210, 4640)",svd,0.12111163139343262 +97,"(210, 4640)",eigh,9.615741729736328 +98,"(210, 10000)",svd,0.16915392875671387 +99,"(210, 10000)",eigh,90.32373642921448 +100,"(460, 10)",svd,0.008007287979125977 +101,"(460, 10)",eigh,0.03803515434265137 +102,"(460, 20)",svd,0.008007526397705078 +103,"(460, 20)",eigh,0.07006335258483887 +104,"(460, 40)",svd,0.054048776626586914 +105,"(460, 40)",eigh,0.08107376098632812 +106,"(460, 100)",svd,0.08207535743713379 +107,"(460, 100)",eigh,0.07806992530822754 +108,"(460, 210)",svd,0.07907223701477051 +109,"(460, 210)",eigh,0.022019386291503906 +110,"(460, 460)",svd,0.12711548805236816 +111,"(460, 460)",eigh,0.0990908145904541 +112,"(460, 1000)",svd,0.15214014053344727 +113,"(460, 1000)",eigh,0.1461317539215088 +114,"(460, 2150)",svd,0.20218324661254883 +115,"(460, 2150)",eigh,1.00691556930542 +116,"(460, 4640)",svd,0.2642397880554199 +117,"(460, 4640)",eigh,9.623390913009644 +118,"(460, 10000)",svd,0.4153778553009033 +119,"(460, 10000)",eigh,93.83080005645752 +120,"(1000, 10)",svd,0.006006479263305664 +121,"(1000, 10)",eigh,0.047041893005371094 +122,"(1000, 20)",svd,0.011009931564331055 +123,"(1000, 20)",eigh,0.07306623458862305 +124,"(1000, 40)",svd,0.10209298133850098 +125,"(1000, 40)",eigh,0.10509538650512695 +126,"(1000, 100)",svd,0.10709714889526367 +127,"(1000, 100)",eigh,0.01801753044128418 +128,"(1000, 210)",svd,0.12311363220214844 +129,"(1000, 210)",eigh,0.10909676551818848 +130,"(1000, 460)",svd,0.18416738510131836 +131,"(1000, 460)",eigh,0.13011908531188965 +132,"(1000, 1000)",svd,0.3693380355834961 +133,"(1000, 1000)",eigh,0.2812533378601074 +134,"(1000, 2150)",svd,0.6415841579437256 +135,"(1000, 2150)",eigh,1.2638959884643555 +136,"(1000, 4640)",svd,0.9638760089874268 +137,"(1000, 4640)",eigh,10.057524681091309 +138,"(1000, 10000)",svd,1.5754334926605225 +139,"(1000, 10000)",eigh,93.95306539535522 +140,"(2150, 10)",svd,0.01000833511352539 +141,"(2150, 10)",eigh,0.03803443908691406 +142,"(2150, 20)",svd,0.026024341583251953 +143,"(2150, 20)",eigh,0.13412165641784668 +144,"(2150, 40)",svd,0.031028032302856445 +145,"(2150, 40)",eigh,0.13512301445007324 +146,"(2150, 100)",svd,0.03203105926513672 +147,"(2150, 100)",eigh,0.1331193447113037 +148,"(2150, 210)",svd,0.1681530475616455 +149,"(2150, 210)",eigh,0.14513158798217773 +150,"(2150, 460)",svd,0.2952694892883301 +151,"(2150, 460)",eigh,0.1731574535369873 +152,"(2150, 1000)",svd,0.6505939960479736 +153,"(2150, 1000)",eigh,0.37934279441833496 +154,"(2150, 2150)",svd,3.037769317626953 +155,"(2150, 2150)",eigh,1.6424922943115234 +156,"(2150, 4640)",svd,4.360964775085449 +157,"(2150, 4640)",eigh,11.373339653015137 +158,"(2150, 10000)",svd,6.593994855880737 +159,"(2150, 10000)",eigh,93.20475888252258 +160,"(4640, 10)",svd,0.020016908645629883 +161,"(4640, 10)",eigh,0.044040679931640625 +162,"(4640, 20)",svd,0.05304908752441406 +163,"(4640, 20)",eigh,0.015014171600341797 +164,"(4640, 40)",svd,0.03903460502624512 +165,"(4640, 40)",eigh,0.2151956558227539 +166,"(4640, 100)",svd,0.24322104454040527 +167,"(4640, 100)",eigh,0.220200777053833 +168,"(4640, 210)",svd,0.27625179290771484 +169,"(4640, 210)",eigh,0.24622368812561035 +170,"(4640, 460)",svd,0.6005492210388184 +171,"(4640, 460)",eigh,0.26824188232421875 +172,"(4640, 1000)",svd,1.9878089427947998 +173,"(4640, 1000)",eigh,0.5104618072509766 +174,"(4640, 2150)",svd,7.3907201290130615 +175,"(4640, 2150)",eigh,2.0078253746032715 +176,"(4640, 4640)",svd,27.215744256973267 +177,"(4640, 4640)",eigh,13.427207231521606 +178,"(4640, 10000)",svd,41.907105684280396 +179,"(4640, 10000)",eigh,98.82284474372864 +180,"(10000, 10)",svd,0.06305861473083496 +181,"(10000, 10)",eigh,0.09708738327026367 +182,"(10000, 20)",svd,0.09108400344848633 +183,"(10000, 20)",eigh,0.02802443504333496 +184,"(10000, 40)",svd,0.07707047462463379 +185,"(10000, 40)",eigh,0.0620572566986084 +186,"(10000, 100)",svd,0.13512325286865234 +187,"(10000, 100)",eigh,0.4584167003631592 +188,"(10000, 210)",svd,0.6856241226196289 +189,"(10000, 210)",eigh,0.4213829040527344 +190,"(10000, 460)",svd,0.8637833595275879 +191,"(10000, 460)",eigh,0.4854426383972168 +192,"(10000, 1000)",svd,5.033576726913452 +193,"(10000, 1000)",eigh,0.8137402534484863 +194,"(10000, 2150)",svd,18.21756386756897 +195,"(10000, 2150)",eigh,2.6454038619995117 +196,"(10000, 4640)",svd,72.08836793899536 +197,"(10000, 4640)",eigh,15.527116298675537 +198,"(10000, 10000)",svd,260.91021251678467 +199,"(10000, 10000)",eigh,116.10355758666992 From af1a48243895fc1b0252f4fb7959de46ec3e45e8 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 26 Feb 2022 20:54:52 -0500 Subject: [PATCH 14/55] Removed old files, improved benchmark file --- bench.py | 39 +++--- benchmark_df.csv | 201 ------------------------------ saved_df | Bin 6382 -> 0 bytes sklearn/decomposition/_fastica.py | 2 +- 4 files changed, 20 insertions(+), 222 deletions(-) delete mode 100644 benchmark_df.csv delete mode 100644 saved_df diff --git a/bench.py b/bench.py index 636429ff72fea..0cab2c1a4dc76 100644 --- a/bench.py +++ b/bench.py @@ -1,10 +1,11 @@ import time -from sklearn.decomposition import FastICA import numpy as np import pandas as pd -import streamlit as st -import altair as alt import argparse +from scipy import linalg + +# import streamlit as st +# import altair as alt parser = argparse.ArgumentParser( description="Determine whether to save/load the dataframe." @@ -33,30 +34,28 @@ def main() -> None: df = on_load(args.load) else: X_shapes = [] - for i in range(10): - X_shapes.extend( - (10 * int(10 ** (i / 3)), 10 * int(10 ** (j / 3))) for j in range(10) - ) - - solvers = ("svd", "eigh") - transformers = { - s: FastICA(n_components=7, random_state=0, svd_solver=s) for s in solvers + for i in range(3): + X_shapes.extend((int(10 ** (2 + i)), int(10 ** (1 + j))) for j in range(3)) + + solvers = { + "svd": linalg.svd, + "eigh": linalg.eigh, } total_reps = len(solvers) * len(X_shapes) count = 0 data = [] for shape in X_shapes: - X = np.random.rand(*shape) + XT = np.random.rand(*shape).T count += 1 start = time.time() - transformers["svd"].fit_transform(X) + solvers["svd"](XT) print(f"Progress: {count}/{total_reps}") svd_time = time.time() - start count += 1 start = time.time() - transformers["eigh"].fit_transform(X) + solvers["eigh"](XT.dot(XT.T)) print(f"Progress: {count}/{total_reps}") eigh_time = time.time() - start @@ -67,23 +66,23 @@ def main() -> None: df.to_csv(args.save, index=False) print(f"Dataframe saved to {args.save}") + """ chart = ( alt.Chart(df, width=300) .mark_bar() - .encode(x="solver", y="time", column="shape") + .encode(x="shape", y=["svd","eigh"], column="shape") .properties(title="time by shape") ).resolve_scale(y="independent") st.altair_chart(chart) - - pt = pd.pivot_table(df, values="time", index=["shape"], columns=["solver"]) - ratio_df = pd.DataFrame(pt["eigh"] / pt["svd"], columns=["eigh/svd time"]) - st.table(ratio_df.sort_values(by="eigh/svd time")) + """ def on_load(pth: str) -> pd.DataFrame: print(f"Dataframe loaded from {args.load}") - return pd.read_csv(pth) + df = pd.read_csv(pth) + df["shape"] = df["shape"].astype("string") + return df if __name__ == "__main__": diff --git a/benchmark_df.csv b/benchmark_df.csv deleted file mode 100644 index 5e4c69e72cb09..0000000000000 --- a/benchmark_df.csv +++ /dev/null @@ -1,201 +0,0 @@ -,shape,solver,time -0,"(10, 10)",svd,0.016014575958251953 -1,"(10, 10)",eigh,0.0070073604583740234 -2,"(10, 20)",svd,0.007004976272583008 -3,"(10, 20)",eigh,0.0020051002502441406 -4,"(10, 40)",svd,0.0070037841796875 -5,"(10, 40)",eigh,0.0020029544830322266 -6,"(10, 100)",svd,0.006005048751831055 -7,"(10, 100)",eigh,0.003001689910888672 -8,"(10, 210)",svd,0.009008407592773438 -9,"(10, 210)",eigh,0.008006811141967773 -10,"(10, 460)",svd,0.0050046443939208984 -11,"(10, 460)",eigh,0.021019697189331055 -12,"(10, 1000)",svd,0.0070056915283203125 -13,"(10, 1000)",eigh,0.1511387825012207 -14,"(10, 2150)",svd,0.05505013465881348 -15,"(10, 2150)",eigh,1.114013671875 -16,"(10, 4640)",svd,0.03903484344482422 -17,"(10, 4640)",eigh,9.652104139328003 -18,"(10, 10000)",svd,0.019017934799194336 -19,"(10, 10000)",eigh,89.20785236358643 -20,"(20, 10)",svd,0.0570528507232666 -21,"(20, 10)",eigh,0.09708738327026367 -22,"(20, 20)",svd,0.059058189392089844 -23,"(20, 20)",eigh,0.057051897048950195 -24,"(20, 40)",svd,0.03963637351989746 -25,"(20, 40)",eigh,0.002001523971557617 -26,"(20, 100)",svd,0.05606436729431152 -27,"(20, 100)",eigh,0.0019884109497070312 -28,"(20, 210)",svd,0.05614113807678223 -29,"(20, 210)",eigh,0.004004240036010742 -30,"(20, 460)",svd,0.061258554458618164 -31,"(20, 460)",eigh,0.018904447555541992 -32,"(20, 1000)",svd,0.018016576766967773 -33,"(20, 1000)",eigh,0.12298727035522461 -34,"(20, 2150)",svd,0.014985322952270508 -35,"(20, 2150)",eigh,0.9478616714477539 -36,"(20, 4640)",svd,0.05555558204650879 -37,"(20, 4640)",eigh,10.271839380264282 -38,"(20, 10000)",svd,0.022020816802978516 -39,"(20, 10000)",eigh,98.05129623413086 -40,"(40, 10)",svd,0.031028032302856445 -41,"(40, 10)",eigh,0.10209417343139648 -42,"(40, 20)",svd,0.01901531219482422 -43,"(40, 20)",eigh,0.06105518341064453 -44,"(40, 40)",svd,0.0620570182800293 -45,"(40, 40)",eigh,0.06505942344665527 -46,"(40, 100)",svd,0.00700688362121582 -47,"(40, 100)",eigh,0.003003358840942383 -48,"(40, 210)",svd,0.06605911254882812 -49,"(40, 210)",eigh,0.005003929138183594 -50,"(40, 460)",svd,0.06605982780456543 -51,"(40, 460)",eigh,0.020018339157104492 -52,"(40, 1000)",svd,0.06406140327453613 -53,"(40, 1000)",eigh,0.13011837005615234 -54,"(40, 2150)",svd,0.018017053604125977 -55,"(40, 2150)",eigh,1.122772216796875 -56,"(40, 4640)",svd,0.04704141616821289 -57,"(40, 4640)",eigh,9.632064819335938 -58,"(40, 10000)",svd,0.07506847381591797 -59,"(40, 10000)",eigh,95.49031686782837 -60,"(100, 10)",svd,0.029026269912719727 -61,"(100, 10)",eigh,0.047041893005371094 -62,"(100, 20)",svd,0.05805349349975586 -63,"(100, 20)",eigh,0.06005430221557617 -64,"(100, 40)",svd,0.016014575958251953 -65,"(100, 40)",eigh,0.0570521354675293 -66,"(100, 100)",svd,0.06505918502807617 -67,"(100, 100)",eigh,0.017016172409057617 -68,"(100, 210)",svd,0.030025959014892578 -69,"(100, 210)",eigh,0.00600743293762207 -70,"(100, 460)",svd,0.06806063652038574 -71,"(100, 460)",eigh,0.021019697189331055 -72,"(100, 1000)",svd,0.07506847381591797 -73,"(100, 1000)",eigh,0.12311196327209473 -74,"(100, 2150)",svd,0.06906342506408691 -75,"(100, 2150)",eigh,1.0169248580932617 -76,"(100, 4640)",svd,0.0770723819732666 -77,"(100, 4640)",eigh,9.727842092514038 -78,"(100, 10000)",svd,0.10009002685546875 -79,"(100, 10000)",eigh,90.43684029579163 -80,"(210, 10)",svd,0.006005525588989258 -81,"(210, 10)",eigh,0.09508752822875977 -82,"(210, 20)",svd,0.07206559181213379 -83,"(210, 20)",eigh,0.06305646896362305 -84,"(210, 40)",svd,0.06505990028381348 -85,"(210, 40)",eigh,0.06806135177612305 -86,"(210, 100)",svd,0.06706047058105469 -87,"(210, 100)",eigh,0.06405878067016602 -88,"(210, 210)",svd,0.07506799697875977 -89,"(210, 210)",eigh,0.06906247138977051 -90,"(210, 460)",svd,0.0790719985961914 -91,"(210, 460)",eigh,0.02302098274230957 -92,"(210, 1000)",svd,0.06306076049804688 -93,"(210, 1000)",eigh,0.14012384414672852 -94,"(210, 2150)",svd,0.1141045093536377 -95,"(210, 2150)",eigh,1.043950080871582 -96,"(210, 4640)",svd,0.12111163139343262 -97,"(210, 4640)",eigh,9.615741729736328 -98,"(210, 10000)",svd,0.16915392875671387 -99,"(210, 10000)",eigh,90.32373642921448 -100,"(460, 10)",svd,0.008007287979125977 -101,"(460, 10)",eigh,0.03803515434265137 -102,"(460, 20)",svd,0.008007526397705078 -103,"(460, 20)",eigh,0.07006335258483887 -104,"(460, 40)",svd,0.054048776626586914 -105,"(460, 40)",eigh,0.08107376098632812 -106,"(460, 100)",svd,0.08207535743713379 -107,"(460, 100)",eigh,0.07806992530822754 -108,"(460, 210)",svd,0.07907223701477051 -109,"(460, 210)",eigh,0.022019386291503906 -110,"(460, 460)",svd,0.12711548805236816 -111,"(460, 460)",eigh,0.0990908145904541 -112,"(460, 1000)",svd,0.15214014053344727 -113,"(460, 1000)",eigh,0.1461317539215088 -114,"(460, 2150)",svd,0.20218324661254883 -115,"(460, 2150)",eigh,1.00691556930542 -116,"(460, 4640)",svd,0.2642397880554199 -117,"(460, 4640)",eigh,9.623390913009644 -118,"(460, 10000)",svd,0.4153778553009033 -119,"(460, 10000)",eigh,93.83080005645752 -120,"(1000, 10)",svd,0.006006479263305664 -121,"(1000, 10)",eigh,0.047041893005371094 -122,"(1000, 20)",svd,0.011009931564331055 -123,"(1000, 20)",eigh,0.07306623458862305 -124,"(1000, 40)",svd,0.10209298133850098 -125,"(1000, 40)",eigh,0.10509538650512695 -126,"(1000, 100)",svd,0.10709714889526367 -127,"(1000, 100)",eigh,0.01801753044128418 -128,"(1000, 210)",svd,0.12311363220214844 -129,"(1000, 210)",eigh,0.10909676551818848 -130,"(1000, 460)",svd,0.18416738510131836 -131,"(1000, 460)",eigh,0.13011908531188965 -132,"(1000, 1000)",svd,0.3693380355834961 -133,"(1000, 1000)",eigh,0.2812533378601074 -134,"(1000, 2150)",svd,0.6415841579437256 -135,"(1000, 2150)",eigh,1.2638959884643555 -136,"(1000, 4640)",svd,0.9638760089874268 -137,"(1000, 4640)",eigh,10.057524681091309 -138,"(1000, 10000)",svd,1.5754334926605225 -139,"(1000, 10000)",eigh,93.95306539535522 -140,"(2150, 10)",svd,0.01000833511352539 -141,"(2150, 10)",eigh,0.03803443908691406 -142,"(2150, 20)",svd,0.026024341583251953 -143,"(2150, 20)",eigh,0.13412165641784668 -144,"(2150, 40)",svd,0.031028032302856445 -145,"(2150, 40)",eigh,0.13512301445007324 -146,"(2150, 100)",svd,0.03203105926513672 -147,"(2150, 100)",eigh,0.1331193447113037 -148,"(2150, 210)",svd,0.1681530475616455 -149,"(2150, 210)",eigh,0.14513158798217773 -150,"(2150, 460)",svd,0.2952694892883301 -151,"(2150, 460)",eigh,0.1731574535369873 -152,"(2150, 1000)",svd,0.6505939960479736 -153,"(2150, 1000)",eigh,0.37934279441833496 -154,"(2150, 2150)",svd,3.037769317626953 -155,"(2150, 2150)",eigh,1.6424922943115234 -156,"(2150, 4640)",svd,4.360964775085449 -157,"(2150, 4640)",eigh,11.373339653015137 -158,"(2150, 10000)",svd,6.593994855880737 -159,"(2150, 10000)",eigh,93.20475888252258 -160,"(4640, 10)",svd,0.020016908645629883 -161,"(4640, 10)",eigh,0.044040679931640625 -162,"(4640, 20)",svd,0.05304908752441406 -163,"(4640, 20)",eigh,0.015014171600341797 -164,"(4640, 40)",svd,0.03903460502624512 -165,"(4640, 40)",eigh,0.2151956558227539 -166,"(4640, 100)",svd,0.24322104454040527 -167,"(4640, 100)",eigh,0.220200777053833 -168,"(4640, 210)",svd,0.27625179290771484 -169,"(4640, 210)",eigh,0.24622368812561035 -170,"(4640, 460)",svd,0.6005492210388184 -171,"(4640, 460)",eigh,0.26824188232421875 -172,"(4640, 1000)",svd,1.9878089427947998 -173,"(4640, 1000)",eigh,0.5104618072509766 -174,"(4640, 2150)",svd,7.3907201290130615 -175,"(4640, 2150)",eigh,2.0078253746032715 -176,"(4640, 4640)",svd,27.215744256973267 -177,"(4640, 4640)",eigh,13.427207231521606 -178,"(4640, 10000)",svd,41.907105684280396 -179,"(4640, 10000)",eigh,98.82284474372864 -180,"(10000, 10)",svd,0.06305861473083496 -181,"(10000, 10)",eigh,0.09708738327026367 -182,"(10000, 20)",svd,0.09108400344848633 -183,"(10000, 20)",eigh,0.02802443504333496 -184,"(10000, 40)",svd,0.07707047462463379 -185,"(10000, 40)",eigh,0.0620572566986084 -186,"(10000, 100)",svd,0.13512325286865234 -187,"(10000, 100)",eigh,0.4584167003631592 -188,"(10000, 210)",svd,0.6856241226196289 -189,"(10000, 210)",eigh,0.4213829040527344 -190,"(10000, 460)",svd,0.8637833595275879 -191,"(10000, 460)",eigh,0.4854426383972168 -192,"(10000, 1000)",svd,5.033576726913452 -193,"(10000, 1000)",eigh,0.8137402534484863 -194,"(10000, 2150)",svd,18.21756386756897 -195,"(10000, 2150)",eigh,2.6454038619995117 -196,"(10000, 4640)",svd,72.08836793899536 -197,"(10000, 4640)",eigh,15.527116298675537 -198,"(10000, 10000)",svd,260.91021251678467 -199,"(10000, 10000)",eigh,116.10355758666992 diff --git a/saved_df b/saved_df deleted file mode 100644 index e2c6196ac3fc3a0595171ed928272b707e3894f1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6382 zcmeHMdvH`&8NYdwcTJ>$#kz1bu>FgK%_D-WGZ_hJ(P&t7 z&Wg!#EhLA*Uz3~357Wia(ulIthd>POH#eCswQA{Z(^2Ljlt$_jSgjJ^o~8rxF*FT~E1-DOvcNia)UiY{zK>t+ZQ1t9QlFy7yVQgZhVe zksVaYjxgR!>rBlL)&qX@*F>Q6!F!U zX#5AuC_Y!`F#0Rz{0pu7F`WPKpDrxMc+_(*g8xix-J^)5?_xeu&Gk2M{N%rBz1V@5 z5MQ~U_@vDgcWo>A^`oTU_{6g~PTEiU#pXVTcz7@A=jMIc)0#s6rDYgz>W;<_5sz#{ z{_#uiQGCO%BY*eT>nOes&eJ$|7RL{r{U*&jPI2jb4%8@@RLc6db{#W%g}RlxHuH&T45P94YTJI|nvF-Ob z|A+7W2CUxy<;!5*Z@og-$Jg$h0-U;b6XQ!=+k6f1xr=8gzR7c>-_WU(;FomX7jq@) zRn2*i9j7?neU0>JyGVL`f6Exo`{P{lzwdho@x=4L0IQ#V`EQ5^w@e^@eyZ3l{He8v zuLF)PX`2nWW62Xs!0M+TSO9i(UBMQxlKURg?TzEKZ~7;wZpODBL*D8mPoo`6(eLco)MpFsbrEBCDi z{P;&V>cC!q^=cc~{`2eU{@2EJePRpM^*HP3>U$2?OD*~l+Lq7$60Ns}_HFDdSa15m zGZat$LzkdmKep>1Xit0#jBHwV_!x4-)V_}c~^ZvyNd--tS@Wp8f;JaqZzpAt3| zUh|C@;Kld!ZgrbR5k7B(4QhQc1s-GhovX~l1Bc)Nxz;aOhSIgC^^dnE5Z9VPo;zy=0 z9P%gX<#=4~6C^c0U#zd!myEJ zj4hlZ20u8u^-Gmr2t;kN`r0^Gd=oHcCA4TZNjwa^}Ue1}O0b41vrQ24{NWwYeRL(iw zF0O2{C&H z>Cy%nQ46R=j0!hBCt?Y>FB3WH8!0iuiZmz0m<3B+A|}4bbF?{k9OG=W4(Mkcn1^(j z34{mt6nip&r33a@Iv@~Yz$8(-6r&T_nPXojvUOle*1={W9WtP(+Lp8qGuoUx`kigo z0sX84&!uc<0w1L60Q_x z2I_1q?#SXfV{4NGupHEr0azRAVQnk|$}tzR)L=ra7PuN93QHR&s<0zuF>F$j%HqKr zr!^RyB3ldE+E~OGZKZ>mv`JF~7B_1lqODC1z;aMe24HQdhqbW?fE%s?;&viS8zy9J zn1t)biJp*HR~hp|?AZ=QynpT%udIIc zK93|`?f=(XyV|HWJKm~S;ajx&8OPhRx<*|Kufl4p^F??AzT_4pb_B(tmlEny&F*D+ zlGI%Xe{Q#KYjC>`{;#eD=Y1tloCnzVl|v@{+MVb85g&>Km0l%L@0SzecX*i(&dN7{ zPyDFvULgv@D6$})-3P^azB%(YQy}g+bekzksIoZEFp3k=@D6cmZWL-E@g)Fa5-0BP zQNWvolZoz?Kt4ZLEO9vkhXQ4rz&cj%UN5G2#ZiYQ&i=(uexpzvAjC}17&H{o$U|lY z=*3qCan>Myq(Q*e#|U4KqRE)SHgRGiYnu4XAU*^X`#QsN7dGe0;c#?k!WT@&#CeLW RDZa4mSHuU0fke_@|6k|j5qtmu diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 2f7103c9ef7fd..fdfdee2830480 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -563,7 +563,7 @@ def g(x, fun_args): # Whitening and preprocessing by PCA if self.svd_solver == "eigh": # Faster when num_samples >> n_features - D, u = linalg.eigh(X.T.dot(X)) + D, u = linalg.eigh(XT.dot(X)) eps = np.finfo(np.double).eps degenerate_idx = D < eps if np.any(degenerate_idx): From 202afa09a8c4a7510e0ff74cedf0e52bec44ea04 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 26 Feb 2022 21:10:42 -0500 Subject: [PATCH 15/55] Added ratio column --- bench.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bench.py b/bench.py index 0cab2c1a4dc76..57ce890712214 100644 --- a/bench.py +++ b/bench.py @@ -59,7 +59,14 @@ def main() -> None: print(f"Progress: {count}/{total_reps}") eigh_time = time.time() - start - data.append({"shape": str(shape), "svd": svd_time, "eigh": eigh_time}) + data.append( + { + "shape": str(shape), + "svd": svd_time, + "eigh": eigh_time, + "eigh/svd": eigh_time / svd_time, + } + ) df = pd.DataFrame(data) if args.save and not args.load: From 2f4bd2e9a573e9e60b914ea471aa97a371b7079e Mon Sep 17 00:00:00 2001 From: Micky774 Date: Thu, 3 Mar 2022 17:58:28 -0500 Subject: [PATCH 16/55] Added matrix reordering and reduced equality strictness (up to parity) --- sklearn/decomposition/_fastica.py | 17 +++++++++++------ sklearn/decomposition/tests/test_fastica.py | 6 ++++++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index fdfdee2830480..e6c2cf97af939 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -13,7 +13,6 @@ import numpy as np from scipy import linalg - from ..base import BaseEstimator, TransformerMixin, _ClassNamePrefixFeaturesOutMixin from ..exceptions import ConvergenceWarning @@ -563,18 +562,24 @@ def g(x, fun_args): # Whitening and preprocessing by PCA if self.svd_solver == "eigh": # Faster when num_samples >> n_features - D, u = linalg.eigh(XT.dot(X)) + d, u = linalg.eigh(XT.dot(X)) + sort_indices = np.argsort(d)[::-1] eps = np.finfo(np.double).eps - degenerate_idx = D < eps + degenerate_idx = d < eps if np.any(degenerate_idx): warnings.warn( "There are some small singular values, using " "svd_solver = 'svd' might lead to more " "accurate results." ) - D[degenerate_idx] = eps # For numerical issues - d = np.sqrt(D) - del D + d[degenerate_idx] = eps # For numerical issues + d = np.sqrt(d, d) + d, u = d[sort_indices], u[sort_indices] + # Resize and reorder to match svd + u = u[:, : min(X.shape)] + L = np.eye(u.shape[0])[::-1] + R = np.eye(u.shape[1])[::-1] + u = L @ u @ R else: u, d = linalg.svd(XT, full_matrices=False, check_finite=False)[:2] diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 34222a220ee05..6b93fd03a5422 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -142,6 +142,12 @@ def g_test(x): with pytest.raises(TypeError): FastICA(fun=range(10), svd_solver=solver).fit(m.T) + + # Check equality up to column parity + for A in (outs["eigh"], outs["svd"]): + for c in range(A.shape[1]): + if A[0, c] < 0: + A[:, c] *= -1 assert_array_almost_equal(outs["eigh"], outs["svd"]) From 035ae9c7cd025750f230780be5b0821e1102679b Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 4 Mar 2022 19:26:16 -0500 Subject: [PATCH 17/55] Simplified reorder/flip in `eigh` solver --- sklearn/decomposition/_fastica.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index e6c2cf97af939..3fb504e583358 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -577,9 +577,7 @@ def g(x, fun_args): d, u = d[sort_indices], u[sort_indices] # Resize and reorder to match svd u = u[:, : min(X.shape)] - L = np.eye(u.shape[0])[::-1] - R = np.eye(u.shape[1])[::-1] - u = L @ u @ R + u = u[::-1, ::-1] else: u, d = linalg.svd(XT, full_matrices=False, check_finite=False)[:2] From 7676facfe1fb99cbc284a5a3b3788b0887f481d8 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 4 Mar 2022 19:30:24 -0500 Subject: [PATCH 18/55] Removed benchmark file (in provided gist links) --- bench.py | 96 -------------------------------------------------------- 1 file changed, 96 deletions(-) delete mode 100644 bench.py diff --git a/bench.py b/bench.py deleted file mode 100644 index 57ce890712214..0000000000000 --- a/bench.py +++ /dev/null @@ -1,96 +0,0 @@ -import time -import numpy as np -import pandas as pd -import argparse -from scipy import linalg - -# import streamlit as st -# import altair as alt - -parser = argparse.ArgumentParser( - description="Determine whether to save/load the dataframe." -) -parser.add_argument( - "--save", - type=str, - default="", - metavar="s", - help="Saves the dataframe to the path if provided", -) -parser.add_argument( - "--load", - type=str, - default="", - metavar="l", - help="Loads the dataframe at the path if provided", -) -args = parser.parse_args() - - -def main() -> None: - - df = None - if args.load: - df = on_load(args.load) - else: - X_shapes = [] - for i in range(3): - X_shapes.extend((int(10 ** (2 + i)), int(10 ** (1 + j))) for j in range(3)) - - solvers = { - "svd": linalg.svd, - "eigh": linalg.eigh, - } - total_reps = len(solvers) * len(X_shapes) - count = 0 - data = [] - for shape in X_shapes: - XT = np.random.rand(*shape).T - - count += 1 - start = time.time() - solvers["svd"](XT) - print(f"Progress: {count}/{total_reps}") - svd_time = time.time() - start - - count += 1 - start = time.time() - solvers["eigh"](XT.dot(XT.T)) - print(f"Progress: {count}/{total_reps}") - eigh_time = time.time() - start - - data.append( - { - "shape": str(shape), - "svd": svd_time, - "eigh": eigh_time, - "eigh/svd": eigh_time / svd_time, - } - ) - df = pd.DataFrame(data) - - if args.save and not args.load: - df.to_csv(args.save, index=False) - print(f"Dataframe saved to {args.save}") - - """ - chart = ( - alt.Chart(df, width=300) - .mark_bar() - .encode(x="shape", y=["svd","eigh"], column="shape") - .properties(title="time by shape") - ).resolve_scale(y="independent") - - st.altair_chart(chart) - """ - - -def on_load(pth: str) -> pd.DataFrame: - print(f"Dataframe loaded from {args.load}") - df = pd.read_csv(pth) - df["shape"] = df["shape"].astype("string") - return df - - -if __name__ == "__main__": - main() From 66dd7070a5d872db24c4f311fadb46a7fdb8258a Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 9 Mar 2022 13:20:02 -0500 Subject: [PATCH 19/55] TST Check solvers --- sklearn/decomposition/_fastica.py | 7 +-- sklearn/decomposition/tests/test_fastica.py | 51 +++++++++++---------- 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index d03c99d4a4405..bb4a255eafbc8 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -574,12 +574,13 @@ def g(x, fun_args): ) d[degenerate_idx] = eps # For numerical issues d = np.sqrt(d, d) - d, u = d[sort_indices], u[sort_indices] - # Resize and reorder to match svd - u = u[::-1, : min(X.shape) : -1] + d, u = d[sort_indices], u[:, sort_indices] else: u, d = linalg.svd(XT, full_matrices=False, check_finite=False)[:2] + signs = np.sign(u[0]) + u *= signs + K = (u / d).T[:n_components] # see (6.33) p.140 del u, d X1 = np.dot(K, XT) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 6b93fd03a5422..bb97d37cc7878 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -118,36 +118,39 @@ def g_test(x): assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=1) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1) - # Test FastICA class + +@pytest.mark.parametrize("add_noise", [True, False]) +@pytest.mark.parametrize("seed", range(1)) +def test_fastica_simple_different_solvers(add_noise, seed): + """Test FastICA is consistent between svd_solvers.""" + rng = np.random.RandomState(seed) + # scipy.stats uses the global RNG: + n_samples = 1000 + # Generate two sources: + s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 + s2 = stats.t.rvs(1, size=n_samples) + s = np.c_[s1, s2].T + center_and_norm(s) + s1, s2 = s + + # Mixing angle + phi = 0.6 + mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) + m = np.dot(mixing, s) + + if add_noise: + m += 0.1 * rng.randn(2, 1000) + + center_and_norm(m) + outs = {} - for solver in ("eigh", "svd"): - _, _, sources_fun = fastica( - m.T, fun=nl, algorithm=algo, random_state=0, svd_solver=solver - ) - ica = FastICA(fun=nl, algorithm=algo, random_state=0, svd_solver=solver) + for solver in ("svd", "eigh"): + ica = FastICA(random_state=0, svd_solver=solver) sources = ica.fit_transform(m.T) outs[solver] = sources assert ica.components_.shape == (2, 2) assert sources.shape == (1000, 2) - assert_array_almost_equal(sources_fun, sources) - assert_array_almost_equal(sources, ica.transform(m.T)) - - assert ica.mixing_.shape == (2, 2) - - for fn in [np.tanh, "exp(-.5(x^2))"]: - ica = FastICA(fun=fn, algorithm=algo, svd_solver=solver) - with pytest.raises(ValueError): - ica.fit(m.T) - - with pytest.raises(TypeError): - FastICA(fun=range(10), svd_solver=solver).fit(m.T) - - # Check equality up to column parity - for A in (outs["eigh"], outs["svd"]): - for c in range(A.shape[1]): - if A[0, c] < 0: - A[:, c] *= -1 assert_array_almost_equal(outs["eigh"], outs["svd"]) From e6e960209f31662563e3a690f3a07179405075b2 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 9 Mar 2022 13:21:09 -0500 Subject: [PATCH 20/55] TST Create new tests --- sklearn/decomposition/tests/test_fastica.py | 80 +++++++++++++-------- 1 file changed, 50 insertions(+), 30 deletions(-) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index bb97d37cc7878..b831d9e976f2c 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -118,40 +118,25 @@ def g_test(x): assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=1) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1) + # Test FastICA class + _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo, random_state=seed) + ica = FastICA(fun=nl, algorithm=algo, random_state=seed) + sources = ica.fit_transform(m.T) + assert ica.components_.shape == (2, 2) + assert sources.shape == (1000, 2) -@pytest.mark.parametrize("add_noise", [True, False]) -@pytest.mark.parametrize("seed", range(1)) -def test_fastica_simple_different_solvers(add_noise, seed): - """Test FastICA is consistent between svd_solvers.""" - rng = np.random.RandomState(seed) - # scipy.stats uses the global RNG: - n_samples = 1000 - # Generate two sources: - s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 - s2 = stats.t.rvs(1, size=n_samples) - s = np.c_[s1, s2].T - center_and_norm(s) - s1, s2 = s + assert_array_almost_equal(sources_fun, sources) + assert_array_almost_equal(sources, ica.transform(m.T)) - # Mixing angle - phi = 0.6 - mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) - m = np.dot(mixing, s) + assert ica.mixing_.shape == (2, 2) - if add_noise: - m += 0.1 * rng.randn(2, 1000) - - center_and_norm(m) - - outs = {} - for solver in ("svd", "eigh"): - ica = FastICA(random_state=0, svd_solver=solver) - sources = ica.fit_transform(m.T) - outs[solver] = sources - assert ica.components_.shape == (2, 2) - assert sources.shape == (1000, 2) + for fn in [np.tanh, "exp(-.5(x^2))"]: + ica = FastICA(fun=fn, algorithm=algo) + with pytest.raises(ValueError): + ica.fit(m.T) - assert_array_almost_equal(outs["eigh"], outs["svd"]) + with pytest.raises(TypeError): + FastICA(fun=range(10)).fit(m.T) def test_fastica_nowhiten(): @@ -402,3 +387,38 @@ def test_fastica_output_shape(whiten, return_X_mean, return_n_iter): assert len(out) == expected_len if not whiten: assert out[0] is None + + +@pytest.mark.parametrize("add_noise", [True, False]) +@pytest.mark.parametrize("seed", range(1)) +def test_fastica_simple_different_solvers(add_noise, seed): + """Test FastICA is consistent between svd_solvers.""" + rng = np.random.RandomState(seed) + # scipy.stats uses the global RNG: + n_samples = 1000 + # Generate two sources: + s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 + s2 = stats.t.rvs(1, size=n_samples) + s = np.c_[s1, s2].T + center_and_norm(s) + s1, s2 = s + + # Mixing angle + phi = 0.6 + mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) + m = np.dot(mixing, s) + + if add_noise: + m += 0.1 * rng.randn(2, 1000) + + center_and_norm(m) + + outs = {} + for solver in ("svd", "eigh"): + ica = FastICA(random_state=0, svd_solver=solver) + sources = ica.fit_transform(m.T) + outs[solver] = sources + assert ica.components_.shape == (2, 2) + assert sources.shape == (1000, 2) + + assert_array_almost_equal(outs["eigh"], outs["svd"]) From 9cdbef9b2919c629218b2f1f272a1cc3f8a4a1ef Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 9 Mar 2022 13:21:39 -0500 Subject: [PATCH 21/55] CLN Slightly better --- sklearn/decomposition/_fastica.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index bb4a255eafbc8..20e4bbc103a35 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -578,8 +578,7 @@ def g(x, fun_args): else: u, d = linalg.svd(XT, full_matrices=False, check_finite=False)[:2] - signs = np.sign(u[0]) - u *= signs + u *= np.sign(u[0]) K = (u / d).T[:n_components] # see (6.33) p.140 del u, d From c5ef5a967abcda93e718b9a6f590ae799319e3b0 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 9 Mar 2022 13:24:24 -0500 Subject: [PATCH 22/55] TST Adjust seed --- sklearn/decomposition/tests/test_fastica.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index b831d9e976f2c..90ed63a187660 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -390,7 +390,7 @@ def test_fastica_output_shape(whiten, return_X_mean, return_n_iter): @pytest.mark.parametrize("add_noise", [True, False]) -@pytest.mark.parametrize("seed", range(1)) +@pytest.mark.parametrize("seed", range(2)) def test_fastica_simple_different_solvers(add_noise, seed): """Test FastICA is consistent between svd_solvers.""" rng = np.random.RandomState(seed) From 77422c879e1dfc5e3ea621c5f8ffa16af5f83d19 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 9 Mar 2022 13:27:29 -0500 Subject: [PATCH 23/55] DOC Adds comment --- sklearn/decomposition/_fastica.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 20e4bbc103a35..4c5a8e265838c 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -578,6 +578,7 @@ def g(x, fun_args): else: u, d = linalg.svd(XT, full_matrices=False, check_finite=False)[:2] + # Give consistent eigenvectors for both svd solvers u *= np.sign(u[0]) K = (u / d).T[:n_components] # see (6.33) p.140 From deaab6ebfb0a5e07c6051e4b705ae10fe750e135 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 9 Mar 2022 13:48:32 -0500 Subject: [PATCH 24/55] FIX Give a random state --- sklearn/decomposition/tests/test_fastica.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 90ed63a187660..a1129b355be18 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -398,7 +398,7 @@ def test_fastica_simple_different_solvers(add_noise, seed): n_samples = 1000 # Generate two sources: s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 - s2 = stats.t.rvs(1, size=n_samples) + s2 = stats.t.rvs(1, size=n_samples, random_state=rng) s = np.c_[s1, s2].T center_and_norm(s) s1, s2 = s From c077bfe4098603bc77174308bfb443fdbece8baf Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 9 Mar 2022 13:48:44 -0500 Subject: [PATCH 25/55] FIX Give a random state --- sklearn/decomposition/tests/test_fastica.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index a1129b355be18..39e9d849ac970 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -394,7 +394,6 @@ def test_fastica_output_shape(whiten, return_X_mean, return_n_iter): def test_fastica_simple_different_solvers(add_noise, seed): """Test FastICA is consistent between svd_solvers.""" rng = np.random.RandomState(seed) - # scipy.stats uses the global RNG: n_samples = 1000 # Generate two sources: s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 From c085f5eb8c33210cb314303d31b5f58405043470 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 13 Mar 2022 17:00:26 -0400 Subject: [PATCH 26/55] Updated changelog --- doc/whats_new/v1.1.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 8f2c981cc3a8d..fb9786799f2df 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -296,8 +296,9 @@ Changelog how whitening is performed through the new `svd_solver` parameter, which supports `svd` and `eigh`. `svd_solver` defaults to `svd` although `eigh` may be faster in cases where `num_features > num_samples` :pr:`11860` by - :user:`Pierre Ablin ` and :pr:`22527` by - :user:`Meekail Zain `. + :user:`Pierre Ablin `, :pr:`22527` by + :user:`Meekail Zain ` and :user:`Thomas Fan`_. + - |Fix| Greatly reduced peak memory usage in :class:`decomposition.PCA` when calling `fit` or `fit_transform`. :pr:`22553` by :user:`Meekail Zain `. From 7a0a130f5092db1510fd3a85021a1244e5865b16 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 13 Mar 2022 17:52:40 -0400 Subject: [PATCH 27/55] Corrected `svd_solver`->`whiten_solver` --- sklearn/decomposition/_fastica.py | 32 ++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 4c5a8e265838c..9b960abe3cae3 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -157,7 +157,7 @@ def fastica( max_iter=200, tol=1e-04, w_init=None, - svd_solver="svd", + whiten_solver="svd", random_state=None, return_X_mean=False, compute_sources=True, @@ -223,10 +223,15 @@ def my_g(x): Initial un-mixing array of dimension (n.comp,n.comp). If None (default) then an array of normal r.v.'s is used. - svd_solver : str, default='svd' + whiten_solver : str, default='svd' The solver to use for whitening. Can either be 'svd' or 'eigh'. - 'svd' is more stable numerically if the problem is degenerate. - 'eigh' is generally faster. + + * 'svd' is more stable numerically if the problem is degenerate, and + often faster when `num_samples<=num_features`. + + * 'eigh' is generally more memory efficient when + `num_samples>=num_features`, and can be faster when + `num_samples>= 50*num_features`. random_state : int, RandomState instance or None, default=None Used to initialize ``w_init`` when not specified, with a @@ -300,7 +305,7 @@ def my_g(x): max_iter=max_iter, tol=tol, w_init=w_init, - svd_solver=svd_solver, + whiten_solver=whiten_solver, random_state=random_state, ) S = est._fit(X, compute_sources=compute_sources) @@ -376,8 +381,13 @@ def my_g(x): whiten_solver : str, default='svd' The solver to use for whitening. Can either be 'svd' or 'eigh'. - 'svd' is more stable numerically if the problem is degenerate. - 'eigh' is generally faster. + + * 'svd' is more stable numerically if the problem is degenerate, and + often faster when `num_samples<=num_features`. + + * 'eigh' is generally more memory efficient when + `num_samples>=num_features`, and can be faster when + `num_samples>= 50*num_features`. random_state : int, RandomState instance or None, default=None Used to initialize ``w_init`` when not specified, with a @@ -458,7 +468,7 @@ def __init__( max_iter=200, tol=1e-4, w_init=None, - svd_solver="svd", + whiten_solver="svd", random_state=None, ): super().__init__() @@ -470,7 +480,7 @@ def __init__( self.max_iter = max_iter self.tol = tol self.w_init = w_init - self.svd_solver = svd_solver + self.whiten_solver = whiten_solver self.random_state = random_state def _fit(self, X, compute_sources=False): @@ -560,7 +570,7 @@ def g(x, fun_args): XT -= X_mean[:, np.newaxis] # Whitening and preprocessing by PCA - if self.svd_solver == "eigh": + if self.whiten_solver == "eigh": # Faster when num_samples >> n_features d, u = linalg.eigh(XT.dot(X)) sort_indices = np.argsort(d)[::-1] @@ -569,7 +579,7 @@ def g(x, fun_args): if np.any(degenerate_idx): warnings.warn( "There are some small singular values, using " - "svd_solver = 'svd' might lead to more " + "whiten_solver = 'svd' might lead to more " "accurate results." ) d[degenerate_idx] = eps # For numerical issues From 38fcc3a59ce77f3d7954142866f7829acf63a992 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Sun, 13 Mar 2022 17:55:48 -0400 Subject: [PATCH 28/55] Apply suggestions from code review Co-authored-by: Thomas J. Fan --- doc/whats_new/v1.1.rst | 2 +- sklearn/decomposition/_fastica.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index f4002f443e202..f700fc4c8817c 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -308,7 +308,7 @@ Changelog supports `svd` and `eigh`. `svd_solver` defaults to `svd` although `eigh` may be faster in cases where `num_features > num_samples` :pr:`11860` by :user:`Pierre Ablin `, :pr:`22527` by - :user:`Meekail Zain ` and :user:`Thomas Fan`_. + :user:`Meekail Zain ` and `Thomas Fan`_. - |Fix| Greatly reduced peak memory usage in :class:`decomposition.PCA` when calling `fit` or `fit_transform`. diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 9b960abe3cae3..71d05054ae606 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -583,7 +583,7 @@ def g(x, fun_args): "accurate results." ) d[degenerate_idx] = eps # For numerical issues - d = np.sqrt(d, d) + np.sqrt(d, out=d) d, u = d[sort_indices], u[:, sort_indices] else: u, d = linalg.svd(XT, full_matrices=False, check_finite=False)[:2] From 5916ab82a3bef0d211d3c595a3eeb3e040cb4a9f Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 13 Mar 2022 18:01:26 -0400 Subject: [PATCH 29/55] Update changelog --- doc/whats_new/v1.1.rst | 12 +++++++++--- sklearn/decomposition/_fastica.py | 5 ++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index f700fc4c8817c..935c18dc1c025 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -34,6 +34,11 @@ random sampling procedures. same solution, up to numerical rounding errors, but in general Lloyd's algorithm uses much less memory, and it is often faster. +- |Enhancement| :class:`decomposition.FastICA` now allows the user to select + how whitening is performed through the new `whiten_solver` parameter, which + supports `svd` and `eigh`. Minor parity changes in the internal workings of + `decomposition.FastICA` may result in different outputs. + - |Fix| The eigenvectors initialization for :class:`cluster.SpectralClustering` and :class:`manifold.SpectralEmbedding` now samples from a Gaussian when using the `'amg'` or `'lobpcg'` solver. This change improves numerical @@ -304,9 +309,10 @@ Changelog :user:`sysuresh`. - |Enhancement| :class:`decomposition.FastICA` now allows the user to select - how whitening is performed through the new `svd_solver` parameter, which - supports `svd` and `eigh`. `svd_solver` defaults to `svd` although `eigh` may - be faster in cases where `num_features > num_samples` :pr:`11860` by + how whitening is performed through the new `whiten_solver` parameter, which + supports `svd` and `eigh`. `whiten_solver` defaults to `svd` although `eigh` + may be faster and more memory efficient in cases where + `num_features > num_samples` :pr:`11860` by :user:`Pierre Ablin `, :pr:`22527` by :user:`Meekail Zain ` and `Thomas Fan`_. diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 71d05054ae606..d0344b0839d29 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -589,7 +589,10 @@ def g(x, fun_args): u, d = linalg.svd(XT, full_matrices=False, check_finite=False)[:2] # Give consistent eigenvectors for both svd solvers - u *= np.sign(u[0]) + # Uses same semantics as svd_flip + max_abs_cols = np.argmax(np.abs(u), axis=0) + signs = np.sign(u[max_abs_cols, range(u.shape[1])]) + u *= signs K = (u / d).T[:n_components] # see (6.33) p.140 del u, d From 1f120103db17434f4ec364476562cd85f4698766 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 13 Mar 2022 20:54:27 -0400 Subject: [PATCH 30/55] Updated `svd_solver`->`whiten_solver` in tests --- sklearn/decomposition/tests/test_fastica.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 39e9d849ac970..0a5669e9925ff 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -414,7 +414,7 @@ def test_fastica_simple_different_solvers(add_noise, seed): outs = {} for solver in ("svd", "eigh"): - ica = FastICA(random_state=0, svd_solver=solver) + ica = FastICA(random_state=0, whiten_solver=solver) sources = ica.fit_transform(m.T) outs[solver] = sources assert ica.components_.shape == (2, 2) From 6299607e09a82fd16eee093d6e28751f8c0c3203 Mon Sep 17 00:00:00 2001 From: Meekail Zain Date: Wed, 16 Mar 2022 11:40:30 -0400 Subject: [PATCH 31/55] Changed sign flip convention --- sklearn/decomposition/_fastica.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index d0344b0839d29..b3ad79eab88c5 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -590,9 +590,7 @@ def g(x, fun_args): # Give consistent eigenvectors for both svd solvers # Uses same semantics as svd_flip - max_abs_cols = np.argmax(np.abs(u), axis=0) - signs = np.sign(u[max_abs_cols, range(u.shape[1])]) - u *= signs + u *= np.sign(u[0]) K = (u / d).T[:n_components] # see (6.33) p.140 del u, d From a365dfa5385ead2967f01b80b50b43bf31848e36 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 19 Mar 2022 16:05:04 -0400 Subject: [PATCH 32/55] Specify whiten to avoid future warning --- sklearn/decomposition/tests/test_fastica.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 3c17c85001156..862e90812a4b0 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -415,7 +415,7 @@ def test_fastica_simple_different_solvers(add_noise, seed): outs = {} for solver in ("svd", "eigh"): - ica = FastICA(random_state=0, whiten_solver=solver) + ica = FastICA(random_state=0, whiten="unit-variance", whiten_solver=solver) sources = ica.fit_transform(m.T) outs[solver] = sources assert ica.components_.shape == (2, 2) From 785131a9b7104f952f5a7420270da5c85ed1ccd2 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 19 Mar 2022 16:10:05 -0400 Subject: [PATCH 33/55] Fixed changelog class reference --- doc/whats_new/v1.1.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 3b87b30482da4..ac0745d601e23 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -37,7 +37,7 @@ random sampling procedures. - |Enhancement| :class:`decomposition.FastICA` now allows the user to select how whitening is performed through the new `whiten_solver` parameter, which supports `svd` and `eigh`. Minor parity changes in the internal workings of - `decomposition.FastICA` may result in different outputs. + :class:`decomposition.FastICA` may result in different outputs. - |Fix| The eigenvectors initialization for :class:`cluster.SpectralClustering` and :class:`manifold.SpectralEmbedding` now samples from a Gaussian when @@ -609,7 +609,7 @@ Changelog - |Fix| The `intercept_` attribute of :class:`LinearRegression` is now correctly computed in the presence of sample weights when the input is sparse. - :pr:`22891` by :user:`Jérémie du Boisberranger `. + :pr:`22891` by :user:`Jérémie du Boisberranger `. :mod:`sklearn.manifold` ....................... From 1a59a39cd693c0c8933d6142f33be5c7ce2e1f4a Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Fri, 25 Mar 2022 17:03:24 -0400 Subject: [PATCH 34/55] Apply suggestions from code review Co-authored-by: Thomas J. Fan --- sklearn/decomposition/_fastica.py | 5 ++--- sklearn/decomposition/tests/test_fastica.py | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index b3ad79eab88c5..ea76214fab7ff 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -226,8 +226,8 @@ def my_g(x): whiten_solver : str, default='svd' The solver to use for whitening. Can either be 'svd' or 'eigh'. - * 'svd' is more stable numerically if the problem is degenerate, and - often faster when `num_samples<=num_features`. + - 'svd' is more stable numerically if the problem is degenerate, and + often faster when `num_samples<=num_features`. * 'eigh' is generally more memory efficient when `num_samples>=num_features`, and can be faster when @@ -589,7 +589,6 @@ def g(x, fun_args): u, d = linalg.svd(XT, full_matrices=False, check_finite=False)[:2] # Give consistent eigenvectors for both svd solvers - # Uses same semantics as svd_flip u *= np.sign(u[0]) K = (u / d).T[:n_components] # see (6.33) p.140 diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 862e90812a4b0..b62d164634a1c 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -391,8 +391,7 @@ def test_fastica_output_shape(whiten, return_X_mean, return_n_iter): @pytest.mark.parametrize("add_noise", [True, False]) -@pytest.mark.parametrize("seed", range(2)) -def test_fastica_simple_different_solvers(add_noise, seed): +def test_fastica_simple_different_solvers(add_noise, global_random_seed): """Test FastICA is consistent between svd_solvers.""" rng = np.random.RandomState(seed) n_samples = 1000 From 4284fded8013db3317cac21315a655c9eeee387f Mon Sep 17 00:00:00 2001 From: Micky774 Date: Fri, 25 Mar 2022 17:32:03 -0400 Subject: [PATCH 35/55] Add test for catching low-rank warning in `eigh` solver --- sklearn/decomposition/tests/test_fastica.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index b62d164634a1c..94d759a24a022 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -7,6 +7,7 @@ import numpy as np from scipy import stats +from scipy import linalg from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal @@ -393,7 +394,7 @@ def test_fastica_output_shape(whiten, return_X_mean, return_n_iter): @pytest.mark.parametrize("add_noise", [True, False]) def test_fastica_simple_different_solvers(add_noise, global_random_seed): """Test FastICA is consistent between svd_solvers.""" - rng = np.random.RandomState(seed) + rng = np.random.RandomState(global_random_seed) n_samples = 1000 # Generate two sources: s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 @@ -421,3 +422,18 @@ def test_fastica_simple_different_solvers(add_noise, global_random_seed): assert sources.shape == (1000, 2) assert_array_almost_equal(outs["eigh"], outs["svd"]) + + +def test_fastica_eigh_low_rank_warning(global_random_seed): + """Test FastICA eigh solver raises warning for low-rank data.""" + rng = np.random.RandomState(global_random_seed) + X = rng.rand(100, 100) + U, S, V = linalg.svd(X) + s = np.zeros_like(S) + rank = 10 + s[:rank] = S[:rank] + T = U @ linalg.diagsvd(s, *X.shape) @ V + + msg = "There are some small singular values" + with pytest.warns(UserWarning, match=msg): + FastICA(random_state=0, whiten="unit-variance", whiten_solver="eigh").fit(T) From 8b77ef242c1766487203ba6f0ea2e5c97b728f1e Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 26 Mar 2022 19:47:23 -0400 Subject: [PATCH 36/55] Fixed sphinx lists --- sklearn/decomposition/_fastica.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index ea76214fab7ff..d28242f67da8c 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -382,12 +382,12 @@ def my_g(x): whiten_solver : str, default='svd' The solver to use for whitening. Can either be 'svd' or 'eigh'. - * 'svd' is more stable numerically if the problem is degenerate, and - often faster when `num_samples<=num_features`. + - 'svd' is more stable numerically if the problem is degenerate, and + often faster when `num_samples<=num_features`. - * 'eigh' is generally more memory efficient when - `num_samples>=num_features`, and can be faster when - `num_samples>= 50*num_features`. + - 'eigh' is generally more memory efficient when + `num_samples>=num_features`, and can be faster when + `num_samples>= 50*num_features`. random_state : int, RandomState instance or None, default=None Used to initialize ``w_init`` when not specified, with a From 8e6273e2e8cabe8991d5c0c5d148ec618b27e710 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sat, 26 Mar 2022 20:20:45 -0400 Subject: [PATCH 37/55] Reformatted sphinx lists --- sklearn/decomposition/_fastica.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index d28242f67da8c..195c610996bc1 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -343,11 +343,13 @@ class FastICA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator) whiten : str or bool, default="warn" Specify the whitening strategy to use. - If 'arbitrary-variance' (default), a whitening with variance arbitrary is used. - If 'unit-variance', the whitening matrix is rescaled to ensure that each - recovered source has unit variance. - If False, the data is already considered to be whitened, and no - whitening is performed. + + - If 'arbitrary-variance' (default), a whitening with variance + arbitrary is used. + - If 'unit-variance', the whitening matrix is rescaled to ensure that + each recovered source has unit variance. + - If False, the data is already considered to be whitened, and no + whitening is performed. .. deprecated:: 1.1 From version 1.3 whiten='unit-variance' will be used by default. @@ -384,7 +386,6 @@ def my_g(x): - 'svd' is more stable numerically if the problem is degenerate, and often faster when `num_samples<=num_features`. - - 'eigh' is generally more memory efficient when `num_samples>=num_features`, and can be faster when `num_samples>= 50*num_features`. From 5e065424459c9d3cbaebce85ed57e579160b3c76 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 3 Apr 2022 11:05:03 -0400 Subject: [PATCH 38/55] Actually fix sphinx error...hopefully --- sklearn/decomposition/_fastica.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 195c610996bc1..4c23298a884cd 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -229,9 +229,9 @@ def my_g(x): - 'svd' is more stable numerically if the problem is degenerate, and often faster when `num_samples<=num_features`. - * 'eigh' is generally more memory efficient when - `num_samples>=num_features`, and can be faster when - `num_samples>= 50*num_features`. + - 'eigh' is generally more memory efficient when + `num_samples>=num_features`, and can be faster when + `num_samples>= 50*num_features`. random_state : int, RandomState instance or None, default=None Used to initialize ``w_init`` when not specified, with a From 24a4fc88027c6479286e316f80a5098058ba9587 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 3 Apr 2022 13:06:43 -0400 Subject: [PATCH 39/55] Fixed git sync issue --- sklearn/decomposition/tests/test_fastica.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index ad1bfa85f61fa..da0b3d4a9689a 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -10,6 +10,7 @@ from scipy import linalg from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.decomposition import FastICA, fastica, PCA From 3c8a4460dfcdbc119e93cbff08ef1291be81e789 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Sun, 8 May 2022 18:02:38 -0400 Subject: [PATCH 40/55] Update sklearn/decomposition/_fastica.py Co-authored-by: Thomas J. Fan --- sklearn/decomposition/_fastica.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 95120cf1ae3c8..04ebcac1f1b70 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -579,7 +579,7 @@ def g(x, fun_args): # Faster when num_samples >> n_features d, u = linalg.eigh(XT.dot(X)) sort_indices = np.argsort(d)[::-1] - eps = np.finfo(np.double).eps + eps = np.finfo(d.dtype).eps degenerate_idx = d < eps if np.any(degenerate_idx): warnings.warn( From 066e83a054d891b7d09d67d351741cb5bbfc51e7 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Sun, 8 May 2022 18:09:52 -0400 Subject: [PATCH 41/55] Undo format change (form a separate PR) --- sklearn/decomposition/_fastica.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 95120cf1ae3c8..7a85ead1dae26 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -347,13 +347,11 @@ class FastICA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator) whiten : str or bool, default="warn" Specify the whitening strategy to use. - - - If 'arbitrary-variance' (default), a whitening with variance - arbitrary is used. - - If 'unit-variance', the whitening matrix is rescaled to ensure that - each recovered source has unit variance. - - If False, the data is already considered to be whitened, and no - whitening is performed. + If 'arbitrary-variance' (default), a whitening with variance arbitrary is used. + If 'unit-variance', the whitening matrix is rescaled to ensure that each + recovered source has unit variance. + If False, the data is already considered to be whitened, and no + whitening is performed. .. deprecated:: 1.1 From version 1.3 whiten='unit-variance' will be used by default. From 76ed7b637d6ff114172fa24cf13ac743ce7c5cb9 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Thu, 19 May 2022 19:23:03 -0400 Subject: [PATCH 42/55] Added "auto" option as new option, and added tests --- sklearn/decomposition/_fastica.py | 49 +++++++++++++-------- sklearn/decomposition/tests/test_fastica.py | 30 ++++++++++++- 2 files changed, 59 insertions(+), 20 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 2a97dcd362703..2ebbd2f29098f 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -161,7 +161,7 @@ def fastica( max_iter=200, tol=1e-04, w_init=None, - whiten_solver="svd", + whiten_solver="auto", random_state=None, return_X_mean=False, compute_sources=True, @@ -227,15 +227,16 @@ def my_g(x): Initial un-mixing array of dimension (n.comp,n.comp). If None (default) then an array of normal r.v.'s is used. - whiten_solver : str, default='svd' - The solver to use for whitening. Can either be 'svd' or 'eigh'. + whiten_solver : {"auto", "eigh", "svd"}, default="auto" + The solver to use for whitening. When set to "auto", the "eigh" solver + will be used if `n_samples >= 50 * n_features` otherwise "svd" is used. - - 'svd' is more stable numerically if the problem is degenerate, and - often faster when `num_samples<=num_features`. + - "svd" is more stable numerically if the problem is degenerate, and + often faster when `n_samples <= n_features`. - - 'eigh' is generally more memory efficient when - `num_samples>=num_features`, and can be faster when - `num_samples>= 50*num_features`. + - "eigh" is generally more memory efficient when + `n_samples >= n_features`, and can be faster when + `n_samples >= 50 * n_features`. random_state : int, RandomState instance or None, default=None Used to initialize ``w_init`` when not specified, with a @@ -383,14 +384,16 @@ def my_g(x): w_init : ndarray of shape (n_components, n_components), default=None The mixing matrix to be used to initialize the algorithm. - whiten_solver : str, default='svd' - The solver to use for whitening. Can either be 'svd' or 'eigh'. + whiten_solver : {"auto", "eigh", "svd"}, default="auto" + The solver to use for whitening. When set to "auto", the "eigh" solver + will be used if `n_samples >= 50 * n_features` otherwise "svd" is used. - - 'svd' is more stable numerically if the problem is degenerate, and - often faster when `num_samples<=num_features`. - - 'eigh' is generally more memory efficient when - `num_samples>=num_features`, and can be faster when - `num_samples>= 50*num_features`. + - "svd" is more stable numerically if the problem is degenerate, and + often faster when `n_samples <= n_features`. + + - "eigh" is generally more memory efficient when + `n_samples >= n_features`, and can be faster when + `n_samples >= 50 * n_features`. random_state : int, RandomState instance or None, default=None Used to initialize ``w_init`` when not specified, with a @@ -471,7 +474,7 @@ def __init__( max_iter=200, tol=1e-4, w_init=None, - whiten_solver="svd", + whiten_solver="auto", random_state=None, ): super().__init__() @@ -572,8 +575,13 @@ def g(x, fun_args): X_mean = XT.mean(axis=-1) XT -= X_mean[:, np.newaxis] + # Benchmark validated heuristic + self._whiten_solver = self.whiten_solver + if self._whiten_solver == "auto": + self._whiten_solver = "eigh" if X.shape[0] > 50 * X.shape[1] else "svd" + # Whitening and preprocessing by PCA - if self.whiten_solver == "eigh": + if self._whiten_solver == "eigh": # Faster when num_samples >> n_features d, u = linalg.eigh(XT.dot(X)) sort_indices = np.argsort(d)[::-1] @@ -588,8 +596,13 @@ def g(x, fun_args): d[degenerate_idx] = eps # For numerical issues np.sqrt(d, out=d) d, u = d[sort_indices], u[:, sort_indices] - else: + elif self._whiten_solver == "svd": u, d = linalg.svd(XT, full_matrices=False, check_finite=False)[:2] + else: + raise ValueError( + "`whiten_solver` must be 'auto', 'eigh' or 'svd' but got" + f" {self.whiten_solver} instead" + ) # Give consistent eigenvectors for both svd solvers u *= np.sign(u[0]) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index da0b3d4a9689a..51803fc864d85 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -10,7 +10,6 @@ from scipy import linalg from sklearn.utils._testing import assert_array_equal -from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_allclose from sklearn.decomposition import FastICA, fastica, PCA @@ -491,7 +490,7 @@ def test_fastica_simple_different_solvers(add_noise, global_random_seed): assert ica.components_.shape == (2, 2) assert sources.shape == (1000, 2) - assert_array_almost_equal(outs["eigh"], outs["svd"]) + assert_allclose(outs["eigh"], outs["svd"]) def test_fastica_eigh_low_rank_warning(global_random_seed): @@ -507,3 +506,30 @@ def test_fastica_eigh_low_rank_warning(global_random_seed): msg = "There are some small singular values" with pytest.warns(UserWarning, match=msg): FastICA(random_state=0, whiten="unit-variance", whiten_solver="eigh").fit(T) + + +@pytest.mark.parametrize("whiten_solver", ["svd", "eigh", "auto"]) +def test_fastica_whiten_solver(global_random_seed, whiten_solver): + rng = np.random.RandomState(global_random_seed) + X = rng.random_sample((100, 10)) + ica = FastICA(random_state=rng, whiten_solver=whiten_solver, whiten="unit-variance") + ica.fit_transform(X) + correct_solver = whiten_solver + if correct_solver == "auto": + correct_solver = "eigh" if X.shape[0] > 50 * X.shape[1] else "svd" + + assert ica.whiten_solver == whiten_solver + assert ica._whiten_solver == correct_solver + + +@pytest.mark.parametrize("whiten_solver", ["this_should_fail", "test", 1]) +def test_fastica_whiten_solver_validation(whiten_solver): + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)) + ica = FastICA(random_state=rng, whiten_solver=whiten_solver, whiten="unit-variance") + msg = ( + "`whiten_solver` must be 'auto', 'eigh' or 'svd' but got" + f" {whiten_solver} instead" + ) + with pytest.raises(ValueError, match=msg): + ica.fit_transform(X) From 3a271b15843fc2f0380cb0ea9a4520264fdb0d02 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Thu, 19 May 2022 19:35:03 -0400 Subject: [PATCH 43/55] Began deprecation of new `whiten_solver` param in favor of auto --- sklearn/decomposition/_fastica.py | 27 ++++++++++++++++++--- sklearn/decomposition/tests/test_fastica.py | 18 ++++++++++++-- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 2ebbd2f29098f..557e38713a1bd 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -161,7 +161,7 @@ def fastica( max_iter=200, tol=1e-04, w_init=None, - whiten_solver="auto", + whiten_solver="warn", random_state=None, return_X_mean=False, compute_sources=True, @@ -227,7 +227,7 @@ def my_g(x): Initial un-mixing array of dimension (n.comp,n.comp). If None (default) then an array of normal r.v.'s is used. - whiten_solver : {"auto", "eigh", "svd"}, default="auto" + whiten_solver : {"auto", "eigh", "svd"}, default="svd" The solver to use for whitening. When set to "auto", the "eigh" solver will be used if `n_samples >= 50 * n_features` otherwise "svd" is used. @@ -238,6 +238,12 @@ def my_g(x): `n_samples >= n_features`, and can be faster when `n_samples >= 50 * n_features`. + .. versionadded:: 1.2 + + .. versionchanged:: 1.4 + Default value for `whiten_solver` will change from "svd" to "auto" + in version 1.4. + random_state : int, RandomState instance or None, default=None Used to initialize ``w_init`` when not specified, with a normal distribution. Pass an int, for reproducible results @@ -384,7 +390,7 @@ def my_g(x): w_init : ndarray of shape (n_components, n_components), default=None The mixing matrix to be used to initialize the algorithm. - whiten_solver : {"auto", "eigh", "svd"}, default="auto" + whiten_solver : {"auto", "eigh", "svd"}, default="svd" The solver to use for whitening. When set to "auto", the "eigh" solver will be used if `n_samples >= 50 * n_features` otherwise "svd" is used. @@ -395,6 +401,12 @@ def my_g(x): `n_samples >= n_features`, and can be faster when `n_samples >= 50 * n_features`. + .. versionadded:: 1.2 + + .. versionchanged:: 1.4 + Default value for `whiten_solver` will change from "svd" to "auto" + in version 1.4. + random_state : int, RandomState instance or None, default=None Used to initialize ``w_init`` when not specified, with a normal distribution. Pass an int, for reproducible results @@ -474,7 +486,7 @@ def __init__( max_iter=200, tol=1e-4, w_init=None, - whiten_solver="auto", + whiten_solver="warn", random_state=None, ): super().__init__() @@ -577,6 +589,13 @@ def g(x, fun_args): # Benchmark validated heuristic self._whiten_solver = self.whiten_solver + if self._whiten_solver == "warn": + warnings.warn( + "From version 1.4 whiten_solver='auto' will be used by default.", + FutureWarning, + ) + self._whiten_solver = "svd" + if self._whiten_solver == "auto": self._whiten_solver = "eigh" if X.shape[0] > 50 * X.shape[1] else "svd" diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 51803fc864d85..a386c745b6af8 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -423,7 +423,10 @@ def test_fastica_whiten_backwards_compatibility(): # No warning must be raised in this case. av_ica = FastICA( - n_components=n_components, whiten="arbitrary-variance", random_state=0 + n_components=n_components, + whiten="arbitrary-variance", + random_state=0, + whiten_solver="svd", ) with warnings.catch_warnings(): warnings.simplefilter("error", FutureWarning) @@ -522,7 +525,7 @@ def test_fastica_whiten_solver(global_random_seed, whiten_solver): assert ica._whiten_solver == correct_solver -@pytest.mark.parametrize("whiten_solver", ["this_should_fail", "test", 1]) +@pytest.mark.parametrize("whiten_solver", ["this_should_fail", "test", 1, None]) def test_fastica_whiten_solver_validation(whiten_solver): rng = np.random.RandomState(0) X = rng.random_sample((100, 10)) @@ -533,3 +536,14 @@ def test_fastica_whiten_solver_validation(whiten_solver): ) with pytest.raises(ValueError, match=msg): ica.fit_transform(X) + + +def test_fastica_whiten_solver_future_warning(): + rng = np.random.RandomState(0) + X = rng.random_sample((100, 10)) + ica = FastICA(random_state=rng, whiten="unit-variance") + msg = "From version 1.4 whiten_solver='auto' will be used by default." + with pytest.warns(FutureWarning, match=msg): + ica.fit_transform(X) + assert ica.whiten_solver == "warn" + assert ica._whiten_solver == "svd" From 2591dea9f2d155467c84a1c5528f95d4d86a78f3 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Wed, 25 May 2022 08:30:52 -0400 Subject: [PATCH 44/55] Removed auto option, will reintroduce in future PR --- sklearn/decomposition/_fastica.py | 36 ++++----------------- sklearn/decomposition/tests/test_fastica.py | 23 ++----------- 2 files changed, 9 insertions(+), 50 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 305391b08da7b..9ed7a12f94a67 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -228,9 +228,8 @@ def my_g(x): Initial un-mixing array. If `w_init=None`, then an array of values drawn from a normal distribution is used. - whiten_solver : {"auto", "eigh", "svd"}, default="svd" - The solver to use for whitening. When set to "auto", the "eigh" solver - will be used if `n_samples >= 50 * n_features` otherwise "svd" is used. + whiten_solver : {"eigh", "svd"}, default="svd" + The solver to use for whitening. - "svd" is more stable numerically if the problem is degenerate, and often faster when `n_samples <= n_features`. @@ -241,10 +240,6 @@ def my_g(x): .. versionadded:: 1.2 - .. versionchanged:: 1.4 - Default value for `whiten_solver` will change from "svd" to "auto" - in version 1.4. - random_state : int, RandomState instance or None, default=None Used to initialize ``w_init`` when not specified, with a normal distribution. Pass an int, for reproducible results @@ -396,9 +391,8 @@ def my_g(x): Initial un-mixing array. If `w_init=None`, then an array of values drawn from a normal distribution is used. - whiten_solver : {"auto", "eigh", "svd"}, default="svd" - The solver to use for whitening. When set to "auto", the "eigh" solver - will be used if `n_samples >= 50 * n_features` otherwise "svd" is used. + whiten_solver : {"eigh", "svd"}, default="svd" + The solver to use for whitening. - "svd" is more stable numerically if the problem is degenerate, and often faster when `n_samples <= n_features`. @@ -409,10 +403,6 @@ def my_g(x): .. versionadded:: 1.2 - .. versionchanged:: 1.4 - Default value for `whiten_solver` will change from "svd" to "auto" - in version 1.4. - random_state : int, RandomState instance or None, default=None Used to initialize ``w_init`` when not specified, with a normal distribution. Pass an int, for reproducible results @@ -593,20 +583,8 @@ def g(x, fun_args): X_mean = XT.mean(axis=-1) XT -= X_mean[:, np.newaxis] - # Benchmark validated heuristic - self._whiten_solver = self.whiten_solver - if self._whiten_solver == "warn": - warnings.warn( - "From version 1.4 whiten_solver='auto' will be used by default.", - FutureWarning, - ) - self._whiten_solver = "svd" - - if self._whiten_solver == "auto": - self._whiten_solver = "eigh" if X.shape[0] > 50 * X.shape[1] else "svd" - # Whitening and preprocessing by PCA - if self._whiten_solver == "eigh": + if self.whiten_solver == "eigh": # Faster when num_samples >> n_features d, u = linalg.eigh(XT.dot(X)) sort_indices = np.argsort(d)[::-1] @@ -621,11 +599,11 @@ def g(x, fun_args): d[degenerate_idx] = eps # For numerical issues np.sqrt(d, out=d) d, u = d[sort_indices], u[:, sort_indices] - elif self._whiten_solver == "svd": + elif self.whiten_solver == "svd": u, d = linalg.svd(XT, full_matrices=False, check_finite=False)[:2] else: raise ValueError( - "`whiten_solver` must be 'auto', 'eigh' or 'svd' but got" + "`whiten_solver` must be 'eigh' or 'svd' but got" f" {self.whiten_solver} instead" ) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index f5737c15bb988..5bf78d5715c1c 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -511,18 +511,13 @@ def test_fastica_eigh_low_rank_warning(global_random_seed): FastICA(random_state=0, whiten="unit-variance", whiten_solver="eigh").fit(T) -@pytest.mark.parametrize("whiten_solver", ["svd", "eigh", "auto"]) +@pytest.mark.parametrize("whiten_solver", ["svd", "eigh"]) def test_fastica_whiten_solver(global_random_seed, whiten_solver): rng = np.random.RandomState(global_random_seed) X = rng.random_sample((100, 10)) ica = FastICA(random_state=rng, whiten_solver=whiten_solver, whiten="unit-variance") ica.fit_transform(X) - correct_solver = whiten_solver - if correct_solver == "auto": - correct_solver = "eigh" if X.shape[0] > 50 * X.shape[1] else "svd" - assert ica.whiten_solver == whiten_solver - assert ica._whiten_solver == correct_solver @pytest.mark.parametrize("whiten_solver", ["this_should_fail", "test", 1, None]) @@ -530,20 +525,6 @@ def test_fastica_whiten_solver_validation(whiten_solver): rng = np.random.RandomState(0) X = rng.random_sample((100, 10)) ica = FastICA(random_state=rng, whiten_solver=whiten_solver, whiten="unit-variance") - msg = ( - "`whiten_solver` must be 'auto', 'eigh' or 'svd' but got" - f" {whiten_solver} instead" - ) + msg = f"`whiten_solver` must be 'eigh' or 'svd' but got {whiten_solver} instead" with pytest.raises(ValueError, match=msg): ica.fit_transform(X) - - -def test_fastica_whiten_solver_future_warning(): - rng = np.random.RandomState(0) - X = rng.random_sample((100, 10)) - ica = FastICA(random_state=rng, whiten="unit-variance") - msg = "From version 1.4 whiten_solver='auto' will be used by default." - with pytest.warns(FutureWarning, match=msg): - ica.fit_transform(X) - assert ica.whiten_solver == "warn" - assert ica._whiten_solver == "svd" From 547a22062b32ce7fdedad318a85b6296db60cbb8 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Wed, 25 May 2022 08:34:39 -0400 Subject: [PATCH 45/55] Added changed models entry for sign-flipping --- doc/whats_new/v1.2.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 3e0f4e1165f53..b8ebf2833b31e 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -19,6 +19,13 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. +:mod:`sklearn.decomposition` +............................ + +- |Fix| :class:`decomposition.FastICA` now employs sign-flipping to produce + consistent results between varying solvers. This may change the output of + the model up to sign-flipping, but does not affect the overall correctness. + Changelog --------- From 07ca1993c7bee2c597021790395cf0e4fb377f19 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Thu, 26 May 2022 16:34:56 -0400 Subject: [PATCH 46/55] Reverted default value for whiten solver, pending follow-up PR --- sklearn/decomposition/_fastica.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 9ed7a12f94a67..793ebd97ebf7e 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -161,7 +161,7 @@ def fastica( max_iter=200, tol=1e-04, w_init=None, - whiten_solver="warn", + whiten_solver="svd", random_state=None, return_X_mean=False, compute_sources=True, From e2ecad00819d2e6aad1b91745a24c1dff20ae27c Mon Sep 17 00:00:00 2001 From: Micky774 Date: Mon, 30 May 2022 14:39:37 -0400 Subject: [PATCH 47/55] Changed erroneous default value --- sklearn/decomposition/_fastica.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 793ebd97ebf7e..e99e149c5e640 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -482,7 +482,7 @@ def __init__( max_iter=200, tol=1e-4, w_init=None, - whiten_solver="warn", + whiten_solver="svd", random_state=None, ): super().__init__() From 1e0cc7ab5f7ff8d75760109afd59572b476dd5fe Mon Sep 17 00:00:00 2001 From: Micky774 Date: Wed, 1 Jun 2022 10:30:55 -0400 Subject: [PATCH 48/55] Fixed bad changelog and corrected test description --- doc/whats_new/v1.2.rst | 4 +--- sklearn/decomposition/tests/test_fastica.py | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 18d4f12a6829f..a4de43a9d5cdb 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -19,19 +19,17 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. -<<<<<<< HEAD :mod:`sklearn.decomposition` ............................ - |Fix| :class:`decomposition.FastICA` now employs sign-flipping to produce consistent results between varying solvers. This may change the output of the model up to sign-flipping, but does not affect the overall correctness. -======= + - |Fix| :class:`manifold.TSNE` now throws a `ValueError` when fit with `perplexity>=n_samples` to ensure mathematical correctness of the algorithm. :pr:`10805` by :user:`Mathias Andersen ` and :pr:`23471` by :user:`Meekail Zain ` ->>>>>>> main Changes impacting all modules ----------------------------- diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 5bf78d5715c1c..18911180fc23a 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -465,7 +465,7 @@ def test_fastica_output_shape(whiten, return_X_mean, return_n_iter): @pytest.mark.parametrize("add_noise", [True, False]) def test_fastica_simple_different_solvers(add_noise, global_random_seed): - """Test FastICA is consistent between svd_solvers.""" + """Test FastICA is consistent between whiten_solvers.""" rng = np.random.RandomState(global_random_seed) n_samples = 1000 # Generate two sources: From e15b6d5c43e8a252b4cf8355cbed88c952762453 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Wed, 1 Jun 2022 11:50:12 -0400 Subject: [PATCH 49/55] Added sign-flip parameter --- doc/whats_new/v1.2.rst | 16 +++++++++------ sklearn/decomposition/_fastica.py | 33 ++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index a4de43a9d5cdb..24fdb4050ca39 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -22,9 +22,10 @@ random sampling procedures. :mod:`sklearn.decomposition` ............................ -- |Fix| :class:`decomposition.FastICA` now employs sign-flipping to produce - consistent results between varying solvers. This may change the output of - the model up to sign-flipping, but does not affect the overall correctness. +- |Fix| :class:`decomposition.FastICA` now allows the user to enable sign-flipping + to produce consistent results between varying whitening solvers. This may change + the output of the model up to sign-flipping, but does not affect the overall + correctness. - |Fix| :class:`manifold.TSNE` now throws a `ValueError` when fit with `perplexity>=n_samples` to ensure mathematical correctness of the algorithm. @@ -106,9 +107,12 @@ Changelog how whitening is performed through the new `whiten_solver` parameter, which supports `svd` and `eigh`. `whiten_solver` defaults to `svd` although `eigh` may be faster and more memory efficient in cases where - `num_features > num_samples` :pr:`11860` by - :user:`Pierre Ablin `, :pr:`22527` by - :user:`Meekail Zain ` and `Thomas Fan`_. + `num_features > num_samples`. An additional `sign_flip` parameter is added. + When `sign_flip=True`, then the output of both solvers will be reconciled + during fit so that their outputs match. This may change the output of the + default solver, and hence may not be backwards compatible. + :pr:`11860` by :user:`Pierre Ablin `, + :pr:`22527` by :user:`Meekail Zain ` and `Thomas Fan`_. :mod:`sklearn.impute` ..................... diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index e99e149c5e640..1a0372de6f641 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -166,6 +166,7 @@ def fastica( return_X_mean=False, compute_sources=True, return_n_iter=False, + sign_flip=False, ): """Perform Fast Independent Component Analysis. @@ -256,6 +257,19 @@ def my_g(x): return_n_iter : bool, default=False Whether or not to return the number of iterations. + sign_flip : bool, default=False + Used to determine whether to enable sign flipping during whitening for + consistency in output between solvers. + + - If `sign_flip=False` then the output of different choices for + `whiten_solver` may not be equal. Both outputs will still be correct, + but may differ numerically. + + - If `sign_flip=True` then the output of both solvers will be + reconciled during fit so that their outputs match. This may produce + a different output for each solver when compared to + `sign_flip=False`. + Returns ------- K : ndarray of shape (n_components, n_features) or None @@ -314,6 +328,7 @@ def my_g(x): w_init=w_init, whiten_solver=whiten_solver, random_state=random_state, + sign_flip=sign_flip, ) S = est._fit(X, compute_sources=compute_sources) @@ -409,6 +424,19 @@ def my_g(x): across multiple function calls. See :term:`Glossary `. + sign_flip : bool, default=False + Used to determine whether to enable sign flipping during whitening for + consistency in output between solvers. + + - If `sign_flip=False` then the output of different choices for + `whiten_solver` may not be equal. Both outputs will still be correct, + but may differ numerically. + + - If `sign_flip=True` then the output of both solvers will be + reconciled during fit so that their outputs match. This may produce + a different output for each solver when compared to + `sign_flip=False`. + Attributes ---------- components_ : ndarray of shape (n_components, n_features) @@ -484,6 +512,7 @@ def __init__( w_init=None, whiten_solver="svd", random_state=None, + sign_flip=False, ): super().__init__() self.n_components = n_components @@ -496,6 +525,7 @@ def __init__( self.w_init = w_init self.whiten_solver = whiten_solver self.random_state = random_state + self.sign_flip = sign_flip def _fit(self, X, compute_sources=False): """Fit the model. @@ -608,7 +638,8 @@ def g(x, fun_args): ) # Give consistent eigenvectors for both svd solvers - u *= np.sign(u[0]) + if self.sign_flip: + u *= np.sign(u[0]) K = (u / d).T[:n_components] # see (6.33) p.140 del u, d From fac13d4b270015839689af5e5933401f48e31caa Mon Sep 17 00:00:00 2001 From: Micky774 Date: Thu, 2 Jun 2022 08:43:41 -0400 Subject: [PATCH 50/55] Fixed test --- sklearn/decomposition/tests/test_fastica.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 18911180fc23a..98ff781cf81c6 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -487,7 +487,9 @@ def test_fastica_simple_different_solvers(add_noise, global_random_seed): outs = {} for solver in ("svd", "eigh"): - ica = FastICA(random_state=0, whiten="unit-variance", whiten_solver=solver) + ica = FastICA( + random_state=0, whiten="unit-variance", whiten_solver=solver, sign_flip=True + ) sources = ica.fit_transform(m.T) outs[solver] = sources assert ica.components_.shape == (2, 2) From 4c13aac8be22117b4917599bb6eeddf67c5e8ed9 Mon Sep 17 00:00:00 2001 From: Meekail Zain <34613774+Micky774@users.noreply.github.com> Date: Mon, 6 Jun 2022 13:03:40 -0400 Subject: [PATCH 51/55] Apply suggestions from code review Co-authored-by: Thomas J. Fan --- sklearn/decomposition/_fastica.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 1a0372de6f641..cf8908308e25b 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -269,6 +269,8 @@ def my_g(x): reconciled during fit so that their outputs match. This may produce a different output for each solver when compared to `sign_flip=False`. + + .. versionadded:: 1.2 Returns ------- @@ -436,6 +438,8 @@ def my_g(x): reconciled during fit so that their outputs match. This may produce a different output for each solver when compared to `sign_flip=False`. + + .. versionadded:: 1.2 Attributes ---------- From 66c7080307927c4e775bb6e4005adf1488c31f11 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Mon, 6 Jun 2022 13:04:47 -0400 Subject: [PATCH 52/55] Removed extra test --- sklearn/decomposition/tests/test_fastica.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 98ff781cf81c6..9a51e8cdc55d4 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -513,15 +513,6 @@ def test_fastica_eigh_low_rank_warning(global_random_seed): FastICA(random_state=0, whiten="unit-variance", whiten_solver="eigh").fit(T) -@pytest.mark.parametrize("whiten_solver", ["svd", "eigh"]) -def test_fastica_whiten_solver(global_random_seed, whiten_solver): - rng = np.random.RandomState(global_random_seed) - X = rng.random_sample((100, 10)) - ica = FastICA(random_state=rng, whiten_solver=whiten_solver, whiten="unit-variance") - ica.fit_transform(X) - assert ica.whiten_solver == whiten_solver - - @pytest.mark.parametrize("whiten_solver", ["this_should_fail", "test", 1, None]) def test_fastica_whiten_solver_validation(whiten_solver): rng = np.random.RandomState(0) From ae9ac995f228d504c546ce140c28e4050d4aadc1 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Thu, 9 Jun 2022 11:14:55 -0400 Subject: [PATCH 53/55] Incorporated review feedback --- sklearn/decomposition/tests/test_fastica.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/sklearn/decomposition/tests/test_fastica.py b/sklearn/decomposition/tests/test_fastica.py index 9a51e8cdc55d4..3f6195a0dac82 100644 --- a/sklearn/decomposition/tests/test_fastica.py +++ b/sklearn/decomposition/tests/test_fastica.py @@ -7,7 +7,7 @@ import numpy as np from scipy import stats -from scipy import linalg +from sklearn.datasets import make_low_rank_matrix from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_allclose @@ -465,7 +465,7 @@ def test_fastica_output_shape(whiten, return_X_mean, return_n_iter): @pytest.mark.parametrize("add_noise", [True, False]) def test_fastica_simple_different_solvers(add_noise, global_random_seed): - """Test FastICA is consistent between whiten_solvers.""" + """Test FastICA is consistent between whiten_solvers when `sign_flip=True`.""" rng = np.random.RandomState(global_random_seed) n_samples = 1000 # Generate two sources: @@ -476,7 +476,7 @@ def test_fastica_simple_different_solvers(add_noise, global_random_seed): s1, s2 = s # Mixing angle - phi = 0.6 + phi = rng.rand() * 2 * np.pi mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) m = np.dot(mixing, s) @@ -501,22 +501,19 @@ def test_fastica_simple_different_solvers(add_noise, global_random_seed): def test_fastica_eigh_low_rank_warning(global_random_seed): """Test FastICA eigh solver raises warning for low-rank data.""" rng = np.random.RandomState(global_random_seed) - X = rng.rand(100, 100) - U, S, V = linalg.svd(X) - s = np.zeros_like(S) - rank = 10 - s[:rank] = S[:rank] - T = U @ linalg.diagsvd(s, *X.shape) @ V - + X = make_low_rank_matrix( + n_samples=10, n_features=10, random_state=rng, effective_rank=2 + ) + ica = FastICA(random_state=0, whiten="unit-variance", whiten_solver="eigh") msg = "There are some small singular values" with pytest.warns(UserWarning, match=msg): - FastICA(random_state=0, whiten="unit-variance", whiten_solver="eigh").fit(T) + ica.fit(X) @pytest.mark.parametrize("whiten_solver", ["this_should_fail", "test", 1, None]) def test_fastica_whiten_solver_validation(whiten_solver): rng = np.random.RandomState(0) - X = rng.random_sample((100, 10)) + X = rng.random_sample((10, 2)) ica = FastICA(random_state=rng, whiten_solver=whiten_solver, whiten="unit-variance") msg = f"`whiten_solver` must be 'eigh' or 'svd' but got {whiten_solver} instead" with pytest.raises(ValueError, match=msg): From a5156175c843e0eafeae8955d5f8185ca598d3ae Mon Sep 17 00:00:00 2001 From: Micky774 Date: Mon, 13 Jun 2022 10:57:57 -0400 Subject: [PATCH 54/55] Updated changelog entry --- doc/whats_new/v1.2.rst | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index eaaee2ba6fadf..8fcc13082727c 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -19,13 +19,6 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. -- |Fix| :class:`decomposition.FastICA` now allows the user to enable sign-flipping - to produce consistent results between varying whitening solvers. This may change - the output of the model up to sign-flipping, but does not affect the overall - correctness. - :pr:`11860` by :user:`Pierre Ablin `, - :pr:`22527` by :user:`Meekail Zain ` and `Thomas Fan`_. - - |Enhancement| The default `eigen_tol` for :class:`cluster.SpectralClustering`, :class:`manifold.SpectralEmbedding`, :func:`cluster.spectral_clustering`, and :func:`manifold.spectral_embedding` is now `None` when using the `'amg'` @@ -121,7 +114,7 @@ Changelog may be faster and more memory efficient in cases where `num_features > num_samples`. An additional `sign_flip` parameter is added. When `sign_flip=True`, then the output of both solvers will be reconciled - during fit so that their outputs match. This may change the output of the + during `fit` so that their outputs match. This may change the output of the default solver, and hence may not be backwards compatible. :pr:`11860` by :user:`Pierre Ablin `, :pr:`22527` by :user:`Meekail Zain ` and `Thomas Fan`_. From ba52b00b89a1fb285960306f00a8a8f0b0fce5c1 Mon Sep 17 00:00:00 2001 From: Micky774 Date: Mon, 13 Jun 2022 10:58:50 -0400 Subject: [PATCH 55/55] Linting --- sklearn/decomposition/_fastica.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index cf8908308e25b..617823542a4ad 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -269,7 +269,7 @@ def my_g(x): reconciled during fit so that their outputs match. This may produce a different output for each solver when compared to `sign_flip=False`. - + .. versionadded:: 1.2 Returns @@ -438,7 +438,7 @@ def my_g(x): reconciled during fit so that their outputs match. This may produce a different output for each solver when compared to `sign_flip=False`. - + .. versionadded:: 1.2 Attributes