From 44cf30a49d3e2fcc5ac1b27a8cd7dac28ae86210 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 21 Apr 2021 09:12:34 +0200 Subject: [PATCH 01/18] DOC improve example --- examples/decomposition/plot_kernel_pca.py | 291 +++++++++++++++++----- sklearn/decomposition/_kernel_pca.py | 14 +- 2 files changed, 238 insertions(+), 67 deletions(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index cfec4f4ec8b1d..03a167b27c304 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -3,78 +3,241 @@ Kernel PCA ========== -This example shows that Kernel PCA is able to find a projection of the data -that makes data linearly separable. +This example shows the difference between Principal Components Analysis +(:class:`~sklearn.decomposition.PCA`) and +:class:`~sklearn.decomposition.KernelPCA`. + +On the one hand, we show that :class:`~sklearn.decomposition.KernelPCA` is able +to find a projection of the data that makes them linearly separable while it is +not the case with +:class:`~sklearn.decomposition.PCA`. + +On the other hand, we show the limitation of inverting this this projection +that is only an approximation with :class:`~sklearn.decomposition.KernelPCA` +while being exact with :class:`~sklearn.decomposition.PCA`. + +Finally, we show that this limitation can be useful in some applications such +as image denoising. """ print(__doc__) # Authors: Mathieu Blondel # Andreas Mueller +# Guillaume Lemaitre # License: BSD 3 clause -import numpy as np +# %% +# Projecting data: `PCA` vs. `KernelPCA` +# -------------------------------------- +# +# In this section, we will show the advantages of using a kernel when +# projecting data using a Principal Component Analysis (PCA). We create a +# dataset made of two nested circles. +from sklearn.datasets import make_circles +from sklearn.model_selection import train_test_split + +X, y = make_circles(n_samples=1_000, factor=.3, noise=.05, random_state=0) +X_train, X_test, y_train, y_test = train_test_split( + X, y, stratify=y, random_state=0) + +# %% +# Let's have a quick first look to the dataset generated. import matplotlib.pyplot as plt +_, axs = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(8, 4)) + +axs[0].scatter(X_train[:, 0], X_train[:, 1], c=y_train) +axs[0].set_ylabel("Feature #1") +axs[0].set_xlabel("Feature #0") +axs[0].set_title("Training data") + +axs[1].scatter(X_test[:, 0], X_test[:, 1], c=y_test) +axs[1].set_xlabel("Feature #0") +_ = axs[1].set_title("Testing data") + +# %% +# The samples from each class cannot be linearly separated: we come with a +# straightline that would split the samples from the inner circle to outer +# circle. Perfectly, a potential decision function would be a circle separating +# both circles. +# +# Now, we will use PCA with and without a kernel to see what is the effect of +# using such a kernel. The kernel used here is a radial basis function (RBF) +# kernel. 
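# %%
# A hedged aside (an editorial sketch, not a line of this patch): the RBF kernel
# referred to above scores the similarity of two samples as exp(-gamma * ||x - y||^2).
# The minimal check below uses only public scikit-learn helpers; `X_demo` and
# `K_demo` are throwaway names local to this sketch, and `gamma=10` simply mirrors
# the value used later in the example.
from sklearn.datasets import make_circles
from sklearn.metrics.pairwise import rbf_kernel

X_demo, _ = make_circles(n_samples=10, factor=.3, noise=.05, random_state=0)
K_demo = rbf_kernel(X_demo, gamma=10)
# Similarities decay quickly with squared distance: only nearby samples keep
# non-negligible entries, which underpins the "unfolding" described later on.
print(K_demo.round(2))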
from sklearn.decomposition import PCA, KernelPCA -from sklearn.datasets import make_circles -np.random.seed(0) - -X, y = make_circles(n_samples=400, factor=.3, noise=.05) - -kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10) -X_kpca = kpca.fit_transform(X) -X_back = kpca.inverse_transform(X_kpca) -pca = PCA() -X_pca = pca.fit_transform(X) - -# Plot results - -plt.figure() -plt.subplot(2, 2, 1, aspect='equal') -plt.title("Original space") -reds = y == 0 -blues = y == 1 - -plt.scatter(X[reds, 0], X[reds, 1], c="red", - s=20, edgecolor='k') -plt.scatter(X[blues, 0], X[blues, 1], c="blue", - s=20, edgecolor='k') -plt.xlabel("$x_1$") -plt.ylabel("$x_2$") - -X1, X2 = np.meshgrid(np.linspace(-1.5, 1.5, 50), np.linspace(-1.5, 1.5, 50)) -X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T -# projection on the first principal component (in the phi space) -Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape) -plt.contour(X1, X2, Z_grid, colors='grey', linewidths=1, origin='lower') - -plt.subplot(2, 2, 2, aspect='equal') -plt.scatter(X_pca[reds, 0], X_pca[reds, 1], c="red", - s=20, edgecolor='k') -plt.scatter(X_pca[blues, 0], X_pca[blues, 1], c="blue", - s=20, edgecolor='k') -plt.title("Projection by PCA") -plt.xlabel("1st principal component") -plt.ylabel("2nd component") - -plt.subplot(2, 2, 3, aspect='equal') -plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", - s=20, edgecolor='k') -plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", - s=20, edgecolor='k') -plt.title("Projection by KPCA") -plt.xlabel(r"1st principal component in space induced by $\phi$") -plt.ylabel("2nd component") - -plt.subplot(2, 2, 4, aspect='equal') -plt.scatter(X_back[reds, 0], X_back[reds, 1], c="red", - s=20, edgecolor='k') -plt.scatter(X_back[blues, 0], X_back[blues, 1], c="blue", - s=20, edgecolor='k') -plt.title("Original space after inverse transform") -plt.xlabel("$x_1$") -plt.ylabel("$x_2$") - -plt.tight_layout() -plt.show() +pca = PCA(n_components=None) +kernel_pca = KernelPCA( + n_components=None, kernel="rbf", gamma=10, fit_inverse_transform=True, + alpha=0.1) + +X_test_pca = pca.fit(X_train).transform(X_test) +X_test_kernel_pca = kernel_pca.fit(X_train).transform(X_test) + +# %% +fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(8, 8)) + +axs[0, 0].scatter(X_train[:, 0], X_train[:, 1], c=y_train) +axs[0, 0].set_ylabel("Feature #1") +axs[0, 0].set_xlabel("Feature #0") +axs[0, 0].set_title("Training data") + +axs[0, 1].scatter(X_test[:, 0], X_test[:, 1], c=y_test) +axs[0, 1].set_xlabel("Feature #0") +axs[0, 1].set_title("Testing data") + +axs[1, 0].scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test) +axs[1, 0].set_ylabel("Principal component #1") +axs[1, 0].set_xlabel("Principal component #0") +axs[1, 0].set_title("Projection using PCA") + +axs[1, 1].scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) +axs[1, 1].set_xlabel("Principal component #0") +axs[1, 1].set_title("Projection using KernelPCA") + +fig.subplots_adjust(hspace=0.4) + +# %% +# We recall that PCA will project the data using a linear projection. +# Intuitively, it means that the coordinate system will be rotated with an some +# rescaling of the axis. This rescaling will depend on the variance of the +# data. +# +# Thus, looking at the projection made using PCA (i.e. figure on the +# bottom-left), we see that there is no change regarding the scaling; indeed +# the data being two concentric circles centered in zero, the variance of the +# original data was already maximized. 
However, we can see that the data have +# been rotated. As a conclusion, we see that such a projection would not help +# if define a linear classifier to distinguish samples from both classes. +# +# Using a kernel allows to make a non-linear projection. Here, by using an RBF +# kernel, we expect that the projection to unfold the dataset but keeping that +# point close in the original space should still be close in the new space. +# +# We observe such behaviour in the bottom-right figure: the samples of a given +# class are closer to each other than the samples from the opposite class. The +# "radial" effect make that we unrolled the circle. Now, we can use a linear +# classifier to separate the samples from the two classes. +# +# Projecting into the original feature space +# ------------------------------------------ +# +# One particularity to have in mind when using +# :class:`~sklearn.decomposition.KernelPCA` is related to the reconstruction +# (i.e. the back projection in the original feature space). With +# :class:`~sklearn.decomposition.PCA`, the reconstruction will be exact if +# `n_components` is the same than the number of original features as in this +# example. Thus, projecting the data on the PCA basis and projecting back will +# give the same dataset. +# +# We can investigate if we get a similar outcome with +# :class:`~sklearn.decomposition.KernelPCA`. +X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test)) +X_reconstructed_kernel_pca = kernel_pca.inverse_transform( + kernel_pca.transform(X_test)) + +# %% +fig, axs = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(13, 4)) + +axs[0].scatter(X_test[:, 0], X_test[:, 1], c=y_test) +axs[0].set_ylabel("Feature #1") +axs[0].set_xlabel("Feature #0") +axs[0].set_title("Original test data") + +axs[1].scatter(X_reconstructed_pca[:, 0], X_reconstructed_pca[:, 1], c=y_test) +axs[1].set_xlabel("Feature #0") +axs[1].set_title("Reconstruction via PCA") + +axs[2].scatter(X_reconstructed_kernel_pca[:, 0], + X_reconstructed_kernel_pca[:, 1], c=y_test) +axs[2].set_xlabel("Feature #0") +_ = axs[2].set_title("Reconstruction via KernelPCA") + +# %% +# While we see a perfect reconstruction, we observe a different results for +# :class:`~sklearn.decomposition.KernelPCA`. Indeed, +# :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot rely on an +# analytical back-projection and thus an extact reconstruction. Instead, a +# :class:`~sklearn.linear_model.KernelRidge` was trained to learn a projection +# function to map a sample from the PCA basis into the original feature space. +# This method is therefore an approximation leading to small difference. The +# parameter `alpha` in the :class:`~sklearn.decomposition.KernelPCA` is used +# to penalized the mapping function to fit more or less the training data. +# +# Application to image denoising +# ------------------------------ +# +# In this section, we will show how one can use the approximation function +# learned to denoise image. 
+ +# %% +import numpy as np +from sklearn.datasets import fetch_openml +from sklearn.preprocessing import MinMaxScaler +from sklearn.model_selection import train_test_split + +X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True) +X = MinMaxScaler().fit_transform(X) +X_train, X_test, y_train, y_test = train_test_split( + X, y, stratify=y, random_state=0, train_size=1_000, test_size=100 +) + +rng = np.random.RandomState(0) +noise = rng.normal(scale=0.25, size=X_test.shape) +X_test_noisy = X_test + noise + + +# %% +def plot_digits(X, title): + """Small helper function to plot 100 digits.""" + fig, axs = plt.subplots(nrows=10, ncols=10, figsize=(8, 8)) + for img, ax in zip(X, axs.ravel()): + ax.imshow(img.reshape((16, 16)), cmap="Greys") + ax.axis("off") + fig.suptitle(title, fontsize=30) + + +# %% +plot_digits(X_train, "Uncorrupted train images") +plot_digits(X_test, "Uncorrupted test images") +plot_digits(X_test_noisy, + f"Noisy test images - " + f"MSE: {np.mean((X_test - X_test_noisy) ** 2):.2f}") + +# %% +# We created a training and testing sets with 100 digits in each set. Also, +# we created a corrupted testing set that correspond to the original test set +# with additional Gaussian noise. +# +# The idea of this section, is to show that we can denoise the corrupted test +# set by a learning a PCA basis on the uncorrupted train set. We will use +# both a PCA and a kernel-based PCA. +pca = PCA(n_components=32) +kernel_pca = KernelPCA(n_components=200, kernel="rbf", gamma=1e-3, + fit_inverse_transform=True, alpha=10) + +pca.fit(X_train) +_ = kernel_pca.fit(X_train) + +# %% +# Now, can transform and reconstruct the noisy test set. Since we used less +# components than the number of original features, we will get an approximation +# of the original set. Indeed, by dropping the components explaining less +# variance in PCA, we hope to remove noise. Similar thinking happen in kernel +# PCA; however, we expect a better reconstruction because we use a non-linear +# kernel to learn the PCA basis and a kernel ridge to learn the mapping +# function. 
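# %%
# A hedged aside (an editorial sketch, not a line of this patch): a quick way to
# sanity-check the `n_components=32` choice above is to look at how much of the
# training-set variance those components retain; `pca` is assumed to be the
# estimator fitted just above, and the exact cutoff remains a judgment call
# rather than a rule.
print(f"Variance retained by the 32 PCA components: "
      f"{pca.explained_variance_ratio_.sum():.1%}")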
+X_reconstructed_kernel_pca = kernel_pca.inverse_transform( + kernel_pca.transform(X_test_noisy)) +X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test_noisy)) + +# %% +plot_digits(X_test, "Uncorrupted test images") +plot_digits(X_reconstructed_pca, + f"PCA reconstruction - " + f"MSE: {np.mean((X_test - X_reconstructed_pca) ** 2):.2f}") +plot_digits(X_reconstructed_kernel_pca, + f"Kernel PCA reconstruction - " + f"MSE: {np.mean((X_test - X_reconstructed_kernel_pca) ** 2):.2f}") + +# %% diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 5655eddb0bf31..8bfcea68bb811 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -260,6 +260,14 @@ def _fit_inverse_transform(self, X_transformed, X): raise NotImplementedError("Inverse transform not implemented for " "sparse matrices!") + # from sklearn.kernel_ridge import KernelRidge + # self._inverse_transformer = KernelRidge( + # alpha=self.alpha, kernel=self.kernel, gamma=self.gamma, + # degree=self.degree, coef0=self.coef0, + # kernel_params=self.kernel_params, + # ) + # self._inverse_transformer.fit(X_transformed, X) + n_samples = X_transformed.shape[0] K = self._get_kernel(X_transformed) K.flat[::n_samples + 1] += self.alpha @@ -362,10 +370,10 @@ def inverse_transform(self, X): raise NotFittedError("The fit_inverse_transform parameter was not" " set to True when instantiating and hence " "the inverse transform is not available.") - + # return self._inverse_transformer.predict(X) K = self._get_kernel(X, self.X_transformed_fit_) - n_samples = self.X_transformed_fit_.shape[0] - K.flat[::n_samples + 1] += self.alpha + # n_samples = self.X_transformed_fit_.shape[0] + # K.flat[::n_samples + 1] += self.alpha return np.dot(K, self.dual_coef_) def _more_tags(self): From 736ea5143ac23bc88632ad4fa32564dfdf265f54 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 21 Apr 2021 10:34:04 +0200 Subject: [PATCH 02/18] iter --- examples/decomposition/plot_kernel_pca.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index 03a167b27c304..84db78e9cdbba 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -89,11 +89,11 @@ axs[1, 0].scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test) axs[1, 0].set_ylabel("Principal component #1") axs[1, 0].set_xlabel("Principal component #0") -axs[1, 0].set_title("Projection using PCA") +axs[1, 0].set_title("Projection of testing data\n using PCA") axs[1, 1].scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) axs[1, 1].set_xlabel("Principal component #0") -axs[1, 1].set_title("Projection using KernelPCA") +axs[1, 1].set_title("Projection of testing data\n using KernelPCA") fig.subplots_adjust(hspace=0.4) @@ -205,16 +205,16 @@ def plot_digits(X, title): f"MSE: {np.mean((X_test - X_test_noisy) ** 2):.2f}") # %% -# We created a training and testing sets with 100 digits in each set. Also, -# we created a corrupted testing set that correspond to the original test set -# with additional Gaussian noise. +# We created a training and testing of 1,000 samples and a test set of 100 +# samples. Also, we created a corrupted testing set that correspond to the +# original test set with additional Gaussian noise. # # The idea of this section, is to show that we can denoise the corrupted test # set by a learning a PCA basis on the uncorrupted train set. 
We will use # both a PCA and a kernel-based PCA. pca = PCA(n_components=32) kernel_pca = KernelPCA(n_components=200, kernel="rbf", gamma=1e-3, - fit_inverse_transform=True, alpha=10) + fit_inverse_transform=True, alpha=5e-3) pca.fit(X_train) _ = kernel_pca.fit(X_train) @@ -241,3 +241,7 @@ def plot_digits(X, title): f"MSE: {np.mean((X_test - X_reconstructed_kernel_pca) ** 2):.2f}") # %% +# Even if both PCA and kernel PCA have the same MSE, a qualitative analysis +# will favor the output of the kernel PCA. However, it should be noted that +# the results of the denoising with kernel PCA will depend of the parameters +# `n_components`, `gamma`, and `alpha`. From e61baf6284f0f0e49f9602b28ceddaca6ab35726 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 21 Apr 2021 10:41:25 +0200 Subject: [PATCH 03/18] iter --- doc/modules/decomposition.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index e971d784c63d6..0e05eba9cdfd4 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -183,16 +183,22 @@ has many applications including denoising, compression and structured prediction (kernel dependency estimation). :class:`KernelPCA` supports both ``transform`` and ``inverse_transform``. -.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_001.png +.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_002.png :target: ../auto_examples/decomposition/plot_kernel_pca.html :align: center :scale: 75% +.. note:: + :meth:`KernelPCA.inverse_transform` relies on a kernel ridge to learn the + function to map samples from the PCA basis into the original feature + spaces. Thus, the reconstruction obtained with + :meth:`KernelPCA.inverse_transform` is an approximation. See the example + linked below to go more into details. + .. topic:: Examples: * :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py` - .. _SparsePCA: Sparse principal components analysis (SparsePCA and MiniBatchSparsePCA) From a4a9ab28da98e3d24114b453642e77583adcc773 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 21 Apr 2021 10:52:31 +0200 Subject: [PATCH 04/18] add info references --- doc/modules/decomposition.rst | 24 +++++++++++++++++++----- sklearn/decomposition/_kernel_pca.py | 25 +++++++++++++++++-------- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index 0e05eba9cdfd4..4bd9fd54c3631 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -178,10 +178,10 @@ Kernel PCA ---------- :class:`KernelPCA` is an extension of PCA which achieves non-linear -dimensionality reduction through the use of kernels (see :ref:`metrics`). It -has many applications including denoising, compression and structured -prediction (kernel dependency estimation). :class:`KernelPCA` supports both -``transform`` and ``inverse_transform``. +dimensionality reduction through the use of kernels (see :ref:`metrics`) +[Scholkopf1997]_. It has many applications including denoising, compression and +structured prediction (kernel dependency estimation). :class:`KernelPCA` +supports both ``transform`` and ``inverse_transform``. .. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_002.png :target: ../auto_examples/decomposition/plot_kernel_pca.html @@ -191,7 +191,7 @@ prediction (kernel dependency estimation). :class:`KernelPCA` supports both .. 
note:: :meth:`KernelPCA.inverse_transform` relies on a kernel ridge to learn the function to map samples from the PCA basis into the original feature - spaces. Thus, the reconstruction obtained with + spaces [Bakir2004]_. Thus, the reconstruction obtained with :meth:`KernelPCA.inverse_transform` is an approximation. See the example linked below to go more into details. @@ -199,6 +199,20 @@ prediction (kernel dependency estimation). :class:`KernelPCA` supports both * :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py` +.. topic:: References + + .. [Scholkopf1997] Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller. + `"Kernel principal component analysis." + `_ + International conference on artificial neural networks. + Springer, Berlin, Heidelberg, 1997. + + .. [Bakir2004] Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + `"Learning to find pre-images." + `_ + Advances in neural information processing systems 16 (2004): 449-456. + + .. _SparsePCA: Sparse principal components analysis (SparsePCA and MiniBatchSparsePCA) diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index af9f525d31aac..d2ad6b6d176d8 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -56,8 +56,8 @@ class KernelPCA(TransformerMixin, BaseEstimator): inverse transform (when fit_inverse_transform=True). fit_inverse_transform : bool, default=False - Learn the inverse transform for non-precomputed kernels. - (i.e. learn to find the pre-image of a point) + Learn the inverse transform for non-precomputed kernels (i.e. learn to + find the pre-image of a point). This method is based on [2]_. eigen_solver : {'auto', 'dense', 'arpack'}, default='auto' Select eigensolver to use. If n_components is much less than @@ -136,11 +136,16 @@ class KernelPCA(TransformerMixin, BaseEstimator): References ---------- - Kernel PCA was introduced in: - Bernhard Schoelkopf, Alexander J. Smola, - and Klaus-Robert Mueller. 1999. Kernel principal - component analysis. In Advances in kernel methods, - MIT Press, Cambridge, MA, USA 327-352. + .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller. + "Kernel principal component analysis." + International conference on artificial neural networks. + Springer, Berlin, Heidelberg, 1997. + `_ + + .. [2] `Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + "Learning to find pre-images." + Advances in neural information processing systems 16 (2004): 449-456. + `_ """ @_deprecate_positional_args def __init__(self, n_components=None, *, kernel="linear", @@ -376,12 +381,16 @@ def inverse_transform(self, X): References ---------- - "Learning to Find Pre-Images", G BakIr et al, 2004. + Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + "Learning to find pre-images." + Advances in neural information processing systems 16 (2004): 449-456. + `_ """ if not self.fit_inverse_transform: raise NotFittedError("The fit_inverse_transform parameter was not" " set to True when instantiating and hence " "the inverse transform is not available.") + K = self._get_kernel(X, self.X_transformed_fit_) return np.dot(K, self.dual_coef_) From e1f090088b6d7dc76619e2170a29173bca1ac0fb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 31 May 2021 14:37:28 +0200 Subject: [PATCH 05/18] Apply suggestions from code review Co-authored-by: Thomas J. 
Fan --- examples/decomposition/plot_kernel_pca.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index 84db78e9cdbba..ef102f4ec8c5a 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -12,8 +12,8 @@ not the case with :class:`~sklearn.decomposition.PCA`. -On the other hand, we show the limitation of inverting this this projection -that is only an approximation with :class:`~sklearn.decomposition.KernelPCA` +On the other hand, we show that inverting this projection is an +approximation with :class:`~sklearn.decomposition.KernelPCA`, while being exact with :class:`~sklearn.decomposition.PCA`. Finally, we show that this limitation can be useful in some applications such @@ -66,7 +66,7 @@ # kernel. from sklearn.decomposition import PCA, KernelPCA -pca = PCA(n_components=None) +pca = PCA(n_components=2) kernel_pca = KernelPCA( n_components=None, kernel="rbf", gamma=10, fit_inverse_transform=True, alpha=0.1) @@ -108,7 +108,7 @@ # the data being two concentric circles centered in zero, the variance of the # original data was already maximized. However, we can see that the data have # been rotated. As a conclusion, we see that such a projection would not help -# if define a linear classifier to distinguish samples from both classes. +# define a linear classifier to distinguish samples from both classes. # # Using a kernel allows to make a non-linear projection. Here, by using an RBF # kernel, we expect that the projection to unfold the dataset but keeping that @@ -177,10 +177,12 @@ from sklearn.model_selection import train_test_split X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True) -X = MinMaxScaler().fit_transform(X) X_train, X_test, y_train, y_test = train_test_split( X, y, stratify=y, random_state=0, train_size=1_000, test_size=100 ) +min_max_scaler = MinMaxScaler() +X_train = min_max_scaler.fit_transform(X_train) +X_test = min_max_scaler.transform(X_test) rng = np.random.RandomState(0) noise = rng.normal(scale=0.25, size=X_test.shape) From b9d6696b2e3c5fd3c063d42f801a7d8a1689b6fa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 10 Jun 2021 17:59:23 +0200 Subject: [PATCH 06/18] DOC simplify example --- examples/decomposition/plot_kernel_pca.py | 141 ++++------------------ 1 file changed, 25 insertions(+), 116 deletions(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index ef102f4ec8c5a..a6e62014bbe7e 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -3,21 +3,17 @@ Kernel PCA ========== -This example shows the difference between Principal Components Analysis -(:class:`~sklearn.decomposition.PCA`) and -:class:`~sklearn.decomposition.KernelPCA`. +This example shows the difference between the Principal Components Analysis +(:class:`~sklearn.decomposition.PCA`) and its kernalize version +(:class:`~sklearn.decomposition.KernelPCA`). On the one hand, we show that :class:`~sklearn.decomposition.KernelPCA` is able to find a projection of the data that makes them linearly separable while it is -not the case with -:class:`~sklearn.decomposition.PCA`. +not the case with :class:`~sklearn.decomposition.PCA`. On the other hand, we show that inverting this projection is an approximation with :class:`~sklearn.decomposition.KernelPCA`, while being exact with :class:`~sklearn.decomposition.PCA`. 
- -Finally, we show that this limitation can be useful in some applications such -as image denoising. """ print(__doc__) @@ -75,27 +71,24 @@ X_test_kernel_pca = kernel_pca.fit(X_train).transform(X_test) # %% -fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(8, 8)) - -axs[0, 0].scatter(X_train[:, 0], X_train[:, 1], c=y_train) -axs[0, 0].set_ylabel("Feature #1") -axs[0, 0].set_xlabel("Feature #0") -axs[0, 0].set_title("Training data") +fig, axs = plt.subplots(ncols=3, figsize=(14, 4)) -axs[0, 1].scatter(X_test[:, 0], X_test[:, 1], c=y_test) -axs[0, 1].set_xlabel("Feature #0") -axs[0, 1].set_title("Testing data") +axs[0].scatter(X_test[:, 0], X_test[:, 1], c=y_test) +axs[0].set_ylabel("Feature #1") +axs[0].set_xlabel("Feature #0") +axs[0].set_title("Testing data") -axs[1, 0].scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test) -axs[1, 0].set_ylabel("Principal component #1") -axs[1, 0].set_xlabel("Principal component #0") -axs[1, 0].set_title("Projection of testing data\n using PCA") +axs[1].scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test) +axs[1].set_ylabel("Principal component #1") +axs[1].set_xlabel("Principal component #0") +axs[1].set_title("Projection of testing data\n using PCA") -axs[1, 1].scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) -axs[1, 1].set_xlabel("Principal component #0") -axs[1, 1].set_title("Projection of testing data\n using KernelPCA") +axs[2].scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) +axs[2].set_ylabel("Principal component #1") +axs[2].set_xlabel("Principal component #0") +axs[2].set_title("Projection of testing data\n using KernelPCA") -fig.subplots_adjust(hspace=0.4) +fig.subplots_adjust(wspace=0.3) # %% # We recall that PCA will project the data using a linear projection. @@ -103,18 +96,18 @@ # rescaling of the axis. This rescaling will depend on the variance of the # data. # -# Thus, looking at the projection made using PCA (i.e. figure on the -# bottom-left), we see that there is no change regarding the scaling; indeed -# the data being two concentric circles centered in zero, the variance of the -# original data was already maximized. However, we can see that the data have -# been rotated. As a conclusion, we see that such a projection would not help -# define a linear classifier to distinguish samples from both classes. +# Thus, looking at the projection made using PCA (i.e. the middle figure), we +# see that there is no change regarding the scaling; indeed the data being two +# concentric circles centered in zero, the variance of the original data was +# already maximized. However, we can see that the data have been rotated. As a +# conclusion, we see that such a projection would not help if define a linear +# classifier to distinguish samples from both classes. # # Using a kernel allows to make a non-linear projection. Here, by using an RBF # kernel, we expect that the projection to unfold the dataset but keeping that # point close in the original space should still be close in the new space. # -# We observe such behaviour in the bottom-right figure: the samples of a given +# We observe such behaviour in the figure on the right: the samples of a given # class are closer to each other than the samples from the opposite class. The # "radial" effect make that we unrolled the circle. Now, we can use a linear # classifier to separate the samples from the two classes. @@ -163,87 +156,3 @@ # This method is therefore an approximation leading to small difference. 
The # parameter `alpha` in the :class:`~sklearn.decomposition.KernelPCA` is used # to penalized the mapping function to fit more or less the training data. -# -# Application to image denoising -# ------------------------------ -# -# In this section, we will show how one can use the approximation function -# learned to denoise image. - -# %% -import numpy as np -from sklearn.datasets import fetch_openml -from sklearn.preprocessing import MinMaxScaler -from sklearn.model_selection import train_test_split - -X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True) -X_train, X_test, y_train, y_test = train_test_split( - X, y, stratify=y, random_state=0, train_size=1_000, test_size=100 -) -min_max_scaler = MinMaxScaler() -X_train = min_max_scaler.fit_transform(X_train) -X_test = min_max_scaler.transform(X_test) - -rng = np.random.RandomState(0) -noise = rng.normal(scale=0.25, size=X_test.shape) -X_test_noisy = X_test + noise - - -# %% -def plot_digits(X, title): - """Small helper function to plot 100 digits.""" - fig, axs = plt.subplots(nrows=10, ncols=10, figsize=(8, 8)) - for img, ax in zip(X, axs.ravel()): - ax.imshow(img.reshape((16, 16)), cmap="Greys") - ax.axis("off") - fig.suptitle(title, fontsize=30) - - -# %% -plot_digits(X_train, "Uncorrupted train images") -plot_digits(X_test, "Uncorrupted test images") -plot_digits(X_test_noisy, - f"Noisy test images - " - f"MSE: {np.mean((X_test - X_test_noisy) ** 2):.2f}") - -# %% -# We created a training and testing of 1,000 samples and a test set of 100 -# samples. Also, we created a corrupted testing set that correspond to the -# original test set with additional Gaussian noise. -# -# The idea of this section, is to show that we can denoise the corrupted test -# set by a learning a PCA basis on the uncorrupted train set. We will use -# both a PCA and a kernel-based PCA. -pca = PCA(n_components=32) -kernel_pca = KernelPCA(n_components=200, kernel="rbf", gamma=1e-3, - fit_inverse_transform=True, alpha=5e-3) - -pca.fit(X_train) -_ = kernel_pca.fit(X_train) - -# %% -# Now, can transform and reconstruct the noisy test set. Since we used less -# components than the number of original features, we will get an approximation -# of the original set. Indeed, by dropping the components explaining less -# variance in PCA, we hope to remove noise. Similar thinking happen in kernel -# PCA; however, we expect a better reconstruction because we use a non-linear -# kernel to learn the PCA basis and a kernel ridge to learn the mapping -# function. -X_reconstructed_kernel_pca = kernel_pca.inverse_transform( - kernel_pca.transform(X_test_noisy)) -X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test_noisy)) - -# %% -plot_digits(X_test, "Uncorrupted test images") -plot_digits(X_reconstructed_pca, - f"PCA reconstruction - " - f"MSE: {np.mean((X_test - X_reconstructed_pca) ** 2):.2f}") -plot_digits(X_reconstructed_kernel_pca, - f"Kernel PCA reconstruction - " - f"MSE: {np.mean((X_test - X_reconstructed_kernel_pca) ** 2):.2f}") - -# %% -# Even if both PCA and kernel PCA have the same MSE, a qualitative analysis -# will favor the output of the kernel PCA. However, it should be noted that -# the results of the denoising with kernel PCA will depend of the parameters -# `n_components`, `gamma`, and `alpha`. 
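As a hedged illustration (an editorial sketch, not part of any commit above), the
approximate `inverse_transform` discussed in this series can be emulated with a plain
kernel ridge regression from the kernel-PCA coordinates back to the input space, which
mirrors the commented-out `KernelRidge` snippet left in `_kernel_pca.py` by the first
patch; the names `X`, `kpca` and `preimage` below are local to the sketch, and the code
is an analogue of that idea, not the library's exact code path.

import numpy as np
from sklearn.datasets import make_circles
from sklearn.decomposition import KernelPCA
from sklearn.kernel_ridge import KernelRidge

X, _ = make_circles(n_samples=400, factor=.3, noise=.05, random_state=0)
kpca = KernelPCA(n_components=2, kernel="rbf", gamma=10)
X_kpca = kpca.fit_transform(X)

# Pre-image map: an RBF kernel ridge from the transformed coordinates back to the
# two original features, with `alpha` acting as the ridge regularizer.
preimage = KernelRidge(kernel="rbf", gamma=10, alpha=0.1).fit(X_kpca, X)
X_back = preimage.predict(X_kpca)
print(f"KernelRidge analogue, reconstruction MSE: {np.mean((X - X_back) ** 2):.4f}")

# The built-in approximation exposes the same trade-off through `alpha`:
for alpha in (1e-3, 1e-1, 1e1):
    kpca_inv = KernelPCA(n_components=2, kernel="rbf", gamma=10,
                         fit_inverse_transform=True, alpha=alpha)
    X_back = kpca_inv.inverse_transform(kpca_inv.fit_transform(X))
    print(f"alpha={alpha:g}: reconstruction MSE = {np.mean((X - X_back) ** 2):.4f}")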
From a28ae8f254817343338b8323d4cd8a1c3ce7719f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 23 Jun 2021 10:53:03 +0200 Subject: [PATCH 07/18] apply suggestion of julien --- examples/decomposition/plot_kernel_pca.py | 42 +++++++++++++---------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index a6e62014bbe7e..5709264091e08 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -53,9 +53,9 @@ # %% # The samples from each class cannot be linearly separated: we come with a -# straightline that would split the samples from the inner circle to outer -# circle. Perfectly, a potential decision function would be a circle separating -# both circles. +# straight line that would split the samples from the inner set to outer +# one. Perfectly, a potential decision boundary would be a circle separating +# both sample sets. # # Now, we will use PCA with and without a kernel to see what is the effect of # using such a kernel. The kernel used here is a radial basis function (RBF) @@ -91,10 +91,9 @@ fig.subplots_adjust(wspace=0.3) # %% -# We recall that PCA will project the data using a linear projection. -# Intuitively, it means that the coordinate system will be rotated with an some -# rescaling of the axis. This rescaling will depend on the variance of the -# data. +# We recall that PCA will project the data linearly. Intuitively, it means that +# the coordinate system will be rotated after centering and rescaling on each +# axis. This rescaling will depend on the variance of the data. # # Thus, looking at the projection made using PCA (i.e. the middle figure), we # see that there is no change regarding the scaling; indeed the data being two @@ -108,9 +107,9 @@ # point close in the original space should still be close in the new space. # # We observe such behaviour in the figure on the right: the samples of a given -# class are closer to each other than the samples from the opposite class. The -# "radial" effect make that we unrolled the circle. Now, we can use a linear -# classifier to separate the samples from the two classes. +# class are closer to each other than the samples from the opposite class, +# untangling both sample sets. Now, we can use a linear classifier to separate +# the samples from the two classes. # # Projecting into the original feature space # ------------------------------------------ @@ -147,12 +146,17 @@ _ = axs[2].set_title("Reconstruction via KernelPCA") # %% -# While we see a perfect reconstruction, we observe a different results for -# :class:`~sklearn.decomposition.KernelPCA`. Indeed, -# :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot rely on an -# analytical back-projection and thus an extact reconstruction. Instead, a -# :class:`~sklearn.linear_model.KernelRidge` was trained to learn a projection -# function to map a sample from the PCA basis into the original feature space. -# This method is therefore an approximation leading to small difference. The -# parameter `alpha` in the :class:`~sklearn.decomposition.KernelPCA` is used -# to penalized the mapping function to fit more or less the training data. +# While we see a perfect reconstruction with +# :class:`~sklearn.decomposition.PCA` we observe a different results for +# :class:`~sklearn.decomposition.KernelPCA`. 
+# +# Indeed, :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot +# rely on an analytical back-projection and thus an extact reconstruction. +# Instead, a :class:`~sklearn.kernel_ridge.KernelRidge` was internally trained +# to learn a mapping from the PCA basis to the original feature space. This +# method is therefore an approximation leading to small difference. +# +# To improve the reconstruction using +# :meth:`~sklearn.decomposition.KernelPCA.inverse_transform`, one can tune +# `alpha` in :class:`~sklearn.decomposition.KernelPCA`, the regularization term +# which controls the reliance on the training data during the mapping training. From acfa8118ac30e2a772ab14d5667929d4e42ff169 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 23 Jun 2021 10:53:42 +0200 Subject: [PATCH 08/18] empty commit From b9f7387c8c9a70bcb989767e2b7ab299fd7568ae Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 24 Jun 2021 12:03:18 +0200 Subject: [PATCH 09/18] Apply suggestions from code review Co-authored-by: Olivier Grisel --- examples/decomposition/plot_kernel_pca.py | 32 ++++++++++++----------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index 5709264091e08..e4e729a819447 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -11,9 +11,9 @@ to find a projection of the data that makes them linearly separable while it is not the case with :class:`~sklearn.decomposition.PCA`. -On the other hand, we show that inverting this projection is an -approximation with :class:`~sklearn.decomposition.KernelPCA`, -while being exact with :class:`~sklearn.decomposition.PCA`. +Finally, we show that inverting this projection is an approximation with +:class:`~sklearn.decomposition.KernelPCA`, while inverting is exact with +:class:`~sklearn.decomposition.PCA`. """ print(__doc__) @@ -26,7 +26,7 @@ # Projecting data: `PCA` vs. `KernelPCA` # -------------------------------------- # -# In this section, we will show the advantages of using a kernel when +# In this section, we show the advantages of using a kernel when # projecting data using a Principal Component Analysis (PCA). We create a # dataset made of two nested circles. from sklearn.datasets import make_circles @@ -37,7 +37,7 @@ X, y, stratify=y, random_state=0) # %% -# Let's have a quick first look to the dataset generated. +# Let's have a quick first look at the generated dataset. import matplotlib.pyplot as plt _, axs = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(8, 4)) @@ -52,10 +52,9 @@ _ = axs[1].set_title("Testing data") # %% -# The samples from each class cannot be linearly separated: we come with a -# straight line that would split the samples from the inner set to outer -# one. Perfectly, a potential decision boundary would be a circle separating -# both sample sets. +# The samples from each class cannot be linearly separated: there is no +# straight line that can split the samples from the inner set from the outer +# set. # # Now, we will use PCA with and without a kernel to see what is the effect of # using such a kernel. The kernel used here is a radial basis function (RBF) @@ -91,7 +90,7 @@ fig.subplots_adjust(wspace=0.3) # %% -# We recall that PCA will project the data linearly. Intuitively, it means that +# We recall that PCA projects the data linearly. 
Intuitively, it means that # the coordinate system will be rotated after centering and rescaling on each # axis. This rescaling will depend on the variance of the data. # @@ -103,8 +102,9 @@ # classifier to distinguish samples from both classes. # # Using a kernel allows to make a non-linear projection. Here, by using an RBF -# kernel, we expect that the projection to unfold the dataset but keeping that -# point close in the original space should still be close in the new space. +# kernel, we expect that the projection to unfold the dataset while keeping +# approximately preserving the relative distances of pairs of data points that +# are close to one another in the original space. # # We observe such behaviour in the figure on the right: the samples of a given # class are closer to each other than the samples from the opposite class, @@ -152,11 +152,13 @@ # # Indeed, :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot # rely on an analytical back-projection and thus an extact reconstruction. -# Instead, a :class:`~sklearn.kernel_ridge.KernelRidge` was internally trained +# Instead, a :class:`~sklearn.kernel_ridge.KernelRidge` is internally trained # to learn a mapping from the PCA basis to the original feature space. This -# method is therefore an approximation leading to small difference. +# method is therefore an approximation introducing small differences when +# attempting to reconstruct the original input. # # To improve the reconstruction using # :meth:`~sklearn.decomposition.KernelPCA.inverse_transform`, one can tune # `alpha` in :class:`~sklearn.decomposition.KernelPCA`, the regularization term -# which controls the reliance on the training data during the mapping training. +# which controls the reliance on the training data during the training of +# the mapping. From 93a965aafd577ec35584ef88fb49e27b95dd9870 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 25 Jun 2021 14:56:01 +0200 Subject: [PATCH 10/18] FIX hyperlink in inverse_transform --- sklearn/decomposition/_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 837b73389d223..8b0c224ef5166 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -468,7 +468,7 @@ def inverse_transform(self, X): References ---------- - Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + `Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. "Learning to find pre-images." Advances in neural information processing systems 16 (2004): 449-456. `_ From 0540fe5da5ec46c8c7f84e952c16972d3766c797 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 25 Jun 2021 16:07:12 +0200 Subject: [PATCH 11/18] Update examples/decomposition/plot_kernel_pca.py Co-authored-by: Christos Aridas --- examples/decomposition/plot_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index e4e729a819447..2f6d06213234f 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -4,7 +4,7 @@ ========== This example shows the difference between the Principal Components Analysis -(:class:`~sklearn.decomposition.PCA`) and its kernalize version +(:class:`~sklearn.decomposition.PCA`) and its kernalized version (:class:`~sklearn.decomposition.KernelPCA`). 
On the one hand, we show that :class:`~sklearn.decomposition.KernelPCA` is able From 42feb3f27ceeb0b8ffc8c14b823e569368766282 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 25 Jun 2021 16:07:18 +0200 Subject: [PATCH 12/18] Update doc/modules/decomposition.rst Co-authored-by: Christos Aridas --- doc/modules/decomposition.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index fe05804a4158a..3f601b6be6f2c 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -286,7 +286,7 @@ prediction (kernel dependency estimation). :class:`KernelPCA` supports both .. note:: :meth:`KernelPCA.inverse_transform` relies on a kernel ridge to learn the function to map samples from the PCA basis into the original feature - spaces [Bakir2004]_. Thus, the reconstruction obtained with + space [Bakir2004]_. Thus, the reconstruction obtained with :meth:`KernelPCA.inverse_transform` is an approximation. See the example linked below to go more into details. From 03c9887b8f076b585551c067fd4c5dbc645ab384 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 25 Jun 2021 17:16:09 +0200 Subject: [PATCH 13/18] Update examples/decomposition/plot_kernel_pca.py Co-authored-by: Christos Aridas --- examples/decomposition/plot_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index 2f6d06213234f..1e9fc2eb36887 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -53,7 +53,7 @@ # %% # The samples from each class cannot be linearly separated: there is no -# straight line that can split the samples from the inner set from the outer +# straight line that can split the samples of the inner set from the outer # set. # # Now, we will use PCA with and without a kernel to see what is the effect of From f470dd9b82ea017ede3605aca09eb6bf0e0368cd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 29 Jun 2021 15:33:35 +0200 Subject: [PATCH 14/18] Update examples/decomposition/plot_kernel_pca.py Co-authored-by: Christos Aridas --- examples/decomposition/plot_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index 1e9fc2eb36887..88d745de303f3 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -102,7 +102,7 @@ # classifier to distinguish samples from both classes. # # Using a kernel allows to make a non-linear projection. Here, by using an RBF -# kernel, we expect that the projection to unfold the dataset while keeping +# kernel, we expect that the projection will unfold the dataset while keeping # approximately preserving the relative distances of pairs of data points that # are close to one another in the original space. 
# From 8237ea1376699b2924e12b1a3699da3d1a34f9e7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Nov 2021 18:09:33 +0100 Subject: [PATCH 15/18] Apply suggestion Julien Co-authored-by: Julien Jerphanion --- doc/modules/decomposition.rst | 4 +- examples/decomposition/plot_kernel_pca.py | 95 ++++++++++++----------- sklearn/decomposition/_kernel_pca.py | 5 +- 3 files changed, 52 insertions(+), 52 deletions(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index 371cfdf21fc07..eac8f063be258 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -285,10 +285,10 @@ prediction (kernel dependency estimation). :class:`KernelPCA` supports both .. note:: :meth:`KernelPCA.inverse_transform` relies on a kernel ridge to learn the - function to map samples from the PCA basis into the original feature + function mapping samples from the PCA basis into the original feature space [Bakir2004]_. Thus, the reconstruction obtained with :meth:`KernelPCA.inverse_transform` is an approximation. See the example - linked below to go more into details. + linked below for more details. .. topic:: Examples: diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index fee3152a9687c..faeec45ed6689 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -8,11 +8,11 @@ (:class:`~sklearn.decomposition.KernelPCA`). On the one hand, we show that :class:`~sklearn.decomposition.KernelPCA` is able -to find a projection of the data that makes them linearly separable while it is -not the case with :class:`~sklearn.decomposition.PCA`. +to find a projection of the data which linearly separates them while it is not the case +with :class:`~sklearn.decomposition.PCA`. Finally, we show that inverting this projection is an approximation with -:class:`~sklearn.decomposition.KernelPCA`, while inverting is exact with +:class:`~sklearn.decomposition.KernelPCA`, while it is exact with :class:`~sklearn.decomposition.PCA`. """ @@ -38,16 +38,16 @@ # Let's have a quick first look at the generated dataset. 
import matplotlib.pyplot as plt -_, axs = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(8, 4)) +_, (train_ax, test_ax) = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(8, 4)) -axs[0].scatter(X_train[:, 0], X_train[:, 1], c=y_train) -axs[0].set_ylabel("Feature #1") -axs[0].set_xlabel("Feature #0") -axs[0].set_title("Training data") +train_ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train) +train_ax.set_ylabel("Feature #1") +train_ax.set_xlabel("Feature #0") +train_ax.set_title("Training data") -axs[1].scatter(X_test[:, 0], X_test[:, 1], c=y_test) -axs[1].set_xlabel("Feature #0") -_ = axs[1].set_title("Testing data") +test_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test) +test_ax.set_xlabel("Feature #0") +_ = test_ax.set_title("Testing data") # %% # The samples from each class cannot be linearly separated: there is no @@ -68,32 +68,34 @@ X_test_kernel_pca = kernel_pca.fit(X_train).transform(X_test) # %% -fig, axs = plt.subplots(ncols=3, figsize=(14, 4)) +fig, (orig_data_ax, pca_proj_ax, kernel_pca_proj_ax) = plt.subplots(ncols=3, figsize=(14, 4)) -axs[0].scatter(X_test[:, 0], X_test[:, 1], c=y_test) -axs[0].set_ylabel("Feature #1") -axs[0].set_xlabel("Feature #0") -axs[0].set_title("Testing data") +orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test) +orig_data_ax.set_ylabel("Feature #1") +orig_data_ax.set_xlabel("Feature #0") +orig_data_ax.set_title("Testing data") -axs[1].scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test) -axs[1].set_ylabel("Principal component #1") -axs[1].set_xlabel("Principal component #0") -axs[1].set_title("Projection of testing data\n using PCA") +pca_proj_ax.scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test) +pca_proj_ax.set_ylabel("Principal component #1") +pca_proj_ax.set_xlabel("Principal component #0") +pca_proj_ax.set_title("Projection of testing data\n using PCA") -axs[2].scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) -axs[2].set_ylabel("Principal component #1") -axs[2].set_xlabel("Principal component #0") -axs[2].set_title("Projection of testing data\n using KernelPCA") +kernel_pca_proj_ax.scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) +kernel_pca_proj_ax.set_ylabel("Principal component #1") +kernel_pca_proj_ax.set_xlabel("Principal component #0") +kernel_pca_proj_ax.set_title("Projection of testing data\n using KernelPCA") # %% -# We recall that PCA projects the data linearly. Intuitively, it means that -# the coordinate system will be rotated after centering and rescaling on each -# axis. This rescaling will depend on the variance of the data. +# We recall that PCA transforms the data linearly. Intuitively, it means that +# the coordinate system will be centered, rescaled on each component +# with respected to its variance and finally be rotated. +# The obtained data from this transformation is isotropic and can now be +# projected on its _principal components_. # # Thus, looking at the projection made using PCA (i.e. the middle figure), we # see that there is no change regarding the scaling; indeed the data being two -# concentric circles centered in zero, the variance of the original data was -# already maximized. However, we can see that the data have been rotated. As a +# concentric circles centered in zero, the original data is already isotropic. +# However, we can see that the data have been rotated. As a # conclusion, we see that such a projection would not help if define a linear # classifier to distinguish samples from both classes. 
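# %%
# A hedged aside (an editorial sketch, not a line of this patch): the claim made
# here and in the next paragraph can be checked by fitting the same linear
# classifier on both projections. The snippet regenerates its own data so that it
# runs on its own; `X_c`, `y_c` and the split names are local to this sketch and
# the printed accuracies are indicative only.
from sklearn.datasets import make_circles
from sklearn.decomposition import PCA, KernelPCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_c, y_c = make_circles(n_samples=1_000, factor=.3, noise=.05, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_c, y_c, stratify=y_c, random_state=0)

for name, proj in [("PCA", PCA(n_components=2)),
                   ("KernelPCA", KernelPCA(n_components=2, kernel="rbf", gamma=10))]:
    clf = LogisticRegression().fit(proj.fit_transform(X_tr), y_tr)
    print(f"{name:>9} + linear classifier, test accuracy: "
          f"{clf.score(proj.transform(X_te), y_te):.2f}")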
# @@ -114,9 +116,8 @@ # :class:`~sklearn.decomposition.KernelPCA` is related to the reconstruction # (i.e. the back projection in the original feature space). With # :class:`~sklearn.decomposition.PCA`, the reconstruction will be exact if -# `n_components` is the same than the number of original features as in this -# example. Thus, projecting the data on the PCA basis and projecting back will -# give the same dataset. +# `n_components` is the same than the number of original features. +# This is the case in this example. # # We can investigate if we get a similar outcome with # :class:`~sklearn.decomposition.KernelPCA`. @@ -124,34 +125,34 @@ X_reconstructed_kernel_pca = kernel_pca.inverse_transform(kernel_pca.transform(X_test)) # %% -fig, axs = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(13, 4)) +fig, (orig_data_ax, pca_back_proj_ax, kernel_pca_back_proj_ax) = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(13, 4)) -axs[0].scatter(X_test[:, 0], X_test[:, 1], c=y_test) -axs[0].set_ylabel("Feature #1") -axs[0].set_xlabel("Feature #0") -axs[0].set_title("Original test data") +orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test) +orig_data_ax.set_ylabel("Feature #1") +orig_data_ax.set_xlabel("Feature #0") +orig_data_ax.set_title("Original test data") -axs[1].scatter(X_reconstructed_pca[:, 0], X_reconstructed_pca[:, 1], c=y_test) -axs[1].set_xlabel("Feature #0") -axs[1].set_title("Reconstruction via PCA") +pca_back_proj_ax.scatter(X_reconstructed_pca[:, 0], X_reconstructed_pca[:, 1], c=y_test) +pca_back_proj_ax.set_xlabel("Feature #0") +pca_back_proj_ax.set_title("Reconstruction via PCA") -axs[2].scatter( +kernel_pca_back_proj_ax.scatter( X_reconstructed_kernel_pca[:, 0], X_reconstructed_kernel_pca[:, 1], c=y_test ) -axs[2].set_xlabel("Feature #0") -_ = axs[2].set_title("Reconstruction via KernelPCA") +kernel_pca_back_proj_ax.set_xlabel("Feature #0") +_ = kernel_pca_back_proj_ax.set_title("Reconstruction via KernelPCA") # %% # While we see a perfect reconstruction with -# :class:`~sklearn.decomposition.PCA` we observe a different results for +# :class:`~sklearn.decomposition.PCA` we observe a different result for # :class:`~sklearn.decomposition.KernelPCA`. # # Indeed, :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot # rely on an analytical back-projection and thus an extact reconstruction. # Instead, a :class:`~sklearn.kernel_ridge.KernelRidge` is internally trained -# to learn a mapping from the PCA basis to the original feature space. This -# method is therefore an approximation introducing small differences when -# attempting to reconstruct the original input. +# to learn a mapping from the kernalized PCA basis to the original feature +# space. This method therefore comes with an approximation introducing small +# differences when back projecting in the original feature space. # # To improve the reconstruction using # :meth:`~sklearn.decomposition.KernelPCA.inverse_transform`, one can tune diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index f8a789daf4add..0efcf2d4fd341 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -216,11 +216,10 @@ class KernelPCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimato Advances in neural information processing systems 16 (2004): 449-456. `_ - .. [3] `Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp. + .. [3] :arxiv:`Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp. 
"Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions." - SIAM review 53.2 (2011): 217-288. - `_ + SIAM review 53.2 (2011): 217-288. <0909.4061>` .. [4] `Martinsson, Per-Gunnar, Vladimir Rokhlin, and Mark Tygert. "A randomized algorithm for the decomposition of matrices." From 005e76219c053de2c95685f30161f2f5e6253cae Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Nov 2021 18:19:01 +0100 Subject: [PATCH 16/18] Update examples/decomposition/plot_kernel_pca.py Co-authored-by: Julien Jerphanion --- examples/decomposition/plot_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index faeec45ed6689..cf166bbcf71e3 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -119,7 +119,7 @@ # `n_components` is the same than the number of original features. # This is the case in this example. # -# We can investigate if we get a similar outcome with +# We can investigate if we get the original dataset when back projecting with # :class:`~sklearn.decomposition.KernelPCA`. X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test)) X_reconstructed_kernel_pca = kernel_pca.inverse_transform(kernel_pca.transform(X_test)) From 4f301db2c7cfcd71a316f716d327eb08c2b8cdf4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Nov 2021 18:21:10 +0100 Subject: [PATCH 17/18] black --- examples/decomposition/plot_kernel_pca.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index cf166bbcf71e3..60e40065b2843 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -68,7 +68,9 @@ X_test_kernel_pca = kernel_pca.fit(X_train).transform(X_test) # %% -fig, (orig_data_ax, pca_proj_ax, kernel_pca_proj_ax) = plt.subplots(ncols=3, figsize=(14, 4)) +fig, (orig_data_ax, pca_proj_ax, kernel_pca_proj_ax) = plt.subplots( + ncols=3, figsize=(14, 4) +) orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test) orig_data_ax.set_ylabel("Feature #1") @@ -125,7 +127,9 @@ X_reconstructed_kernel_pca = kernel_pca.inverse_transform(kernel_pca.transform(X_test)) # %% -fig, (orig_data_ax, pca_back_proj_ax, kernel_pca_back_proj_ax) = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(13, 4)) +fig, (orig_data_ax, pca_back_proj_ax, kernel_pca_back_proj_ax) = plt.subplots( + ncols=3, sharex=True, sharey=True, figsize=(13, 4) +) orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test) orig_data_ax.set_ylabel("Feature #1") From 1a52093d37992582eb9b1a3fa4a628ad93579936 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Nov 2021 21:49:06 +0100 Subject: [PATCH 18/18] Update plot_kernel_pca.py --- examples/decomposition/plot_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index 60e40065b2843..fe6d63240523e 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -85,7 +85,7 @@ kernel_pca_proj_ax.scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) kernel_pca_proj_ax.set_ylabel("Principal component #1") kernel_pca_proj_ax.set_xlabel("Principal component #0") -kernel_pca_proj_ax.set_title("Projection of testing data\n using KernelPCA") +_ = 
kernel_pca_proj_ax.set_title("Projection of testing data\n using KernelPCA") # %% # We recall that PCA transforms the data linearly. Intuitively, it means that