From 44cf30a49d3e2fcc5ac1b27a8cd7dac28ae86210 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 21 Apr 2021 09:12:34 +0200 Subject: [PATCH 01/18] DOC improve example --- examples/decomposition/plot_kernel_pca.py | 291 +++++++++++++++++----- sklearn/decomposition/_kernel_pca.py | 14 +- 2 files changed, 238 insertions(+), 67 deletions(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index cfec4f4ec8b1d..03a167b27c304 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -3,78 +3,241 @@ Kernel PCA ========== -This example shows that Kernel PCA is able to find a projection of the data -that makes data linearly separable. +This example shows the difference between Principal Components Analysis +(:class:`~sklearn.decomposition.PCA`) and +:class:`~sklearn.decomposition.KernelPCA`. + +On the one hand, we show that :class:`~sklearn.decomposition.KernelPCA` is able +to find a projection of the data that makes them linearly separable while it is +not the case with +:class:`~sklearn.decomposition.PCA`. + +On the other hand, we show the limitation of inverting this this projection +that is only an approximation with :class:`~sklearn.decomposition.KernelPCA` +while being exact with :class:`~sklearn.decomposition.PCA`. + +Finally, we show that this limitation can be useful in some applications such +as image denoising. """ print(__doc__) # Authors: Mathieu Blondel # Andreas Mueller +# Guillaume Lemaitre # License: BSD 3 clause -import numpy as np +# %% +# Projecting data: `PCA` vs. `KernelPCA` +# -------------------------------------- +# +# In this section, we will show the advantages of using a kernel when +# projecting data using a Principal Component Analysis (PCA). We create a +# dataset made of two nested circles. +from sklearn.datasets import make_circles +from sklearn.model_selection import train_test_split + +X, y = make_circles(n_samples=1_000, factor=.3, noise=.05, random_state=0) +X_train, X_test, y_train, y_test = train_test_split( + X, y, stratify=y, random_state=0) + +# %% +# Let's have a quick first look to the dataset generated. import matplotlib.pyplot as plt +_, axs = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(8, 4)) + +axs[0].scatter(X_train[:, 0], X_train[:, 1], c=y_train) +axs[0].set_ylabel("Feature #1") +axs[0].set_xlabel("Feature #0") +axs[0].set_title("Training data") + +axs[1].scatter(X_test[:, 0], X_test[:, 1], c=y_test) +axs[1].set_xlabel("Feature #0") +_ = axs[1].set_title("Testing data") + +# %% +# The samples from each class cannot be linearly separated: we come with a +# straightline that would split the samples from the inner circle to outer +# circle. Perfectly, a potential decision function would be a circle separating +# both circles. +# +# Now, we will use PCA with and without a kernel to see what is the effect of +# using such a kernel. The kernel used here is a radial basis function (RBF) +# kernel. 
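# %%
# A hedged aside (an editorial sketch, not a line of this patch): the RBF kernel
# referred to above scores the similarity of two samples as exp(-gamma * ||x - y||^2).
# The minimal check below uses only public scikit-learn helpers; `X_demo` and
# `K_demo` are throwaway names local to this sketch, and `gamma=10` simply mirrors
# the value used later in the example.
from sklearn.datasets import make_circles
from sklearn.metrics.pairwise import rbf_kernel

X_demo, _ = make_circles(n_samples=10, factor=.3, noise=.05, random_state=0)
K_demo = rbf_kernel(X_demo, gamma=10)
# Similarities decay quickly with squared distance: only nearby samples keep
# non-negligible entries, which underpins the "unfolding" described later on.
print(K_demo.round(2))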
from sklearn.decomposition import PCA, KernelPCA -from sklearn.datasets import make_circles -np.random.seed(0) - -X, y = make_circles(n_samples=400, factor=.3, noise=.05) - -kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10) -X_kpca = kpca.fit_transform(X) -X_back = kpca.inverse_transform(X_kpca) -pca = PCA() -X_pca = pca.fit_transform(X) - -# Plot results - -plt.figure() -plt.subplot(2, 2, 1, aspect='equal') -plt.title("Original space") -reds = y == 0 -blues = y == 1 - -plt.scatter(X[reds, 0], X[reds, 1], c="red", - s=20, edgecolor='k') -plt.scatter(X[blues, 0], X[blues, 1], c="blue", - s=20, edgecolor='k') -plt.xlabel("$x_1$") -plt.ylabel("$x_2$") - -X1, X2 = np.meshgrid(np.linspace(-1.5, 1.5, 50), np.linspace(-1.5, 1.5, 50)) -X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T -# projection on the first principal component (in the phi space) -Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape) -plt.contour(X1, X2, Z_grid, colors='grey', linewidths=1, origin='lower') - -plt.subplot(2, 2, 2, aspect='equal') -plt.scatter(X_pca[reds, 0], X_pca[reds, 1], c="red", - s=20, edgecolor='k') -plt.scatter(X_pca[blues, 0], X_pca[blues, 1], c="blue", - s=20, edgecolor='k') -plt.title("Projection by PCA") -plt.xlabel("1st principal component") -plt.ylabel("2nd component") - -plt.subplot(2, 2, 3, aspect='equal') -plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", - s=20, edgecolor='k') -plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", - s=20, edgecolor='k') -plt.title("Projection by KPCA") -plt.xlabel(r"1st principal component in space induced by $\phi$") -plt.ylabel("2nd component") - -plt.subplot(2, 2, 4, aspect='equal') -plt.scatter(X_back[reds, 0], X_back[reds, 1], c="red", - s=20, edgecolor='k') -plt.scatter(X_back[blues, 0], X_back[blues, 1], c="blue", - s=20, edgecolor='k') -plt.title("Original space after inverse transform") -plt.xlabel("$x_1$") -plt.ylabel("$x_2$") - -plt.tight_layout() -plt.show() +pca = PCA(n_components=None) +kernel_pca = KernelPCA( + n_components=None, kernel="rbf", gamma=10, fit_inverse_transform=True, + alpha=0.1) + +X_test_pca = pca.fit(X_train).transform(X_test) +X_test_kernel_pca = kernel_pca.fit(X_train).transform(X_test) + +# %% +fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(8, 8)) + +axs[0, 0].scatter(X_train[:, 0], X_train[:, 1], c=y_train) +axs[0, 0].set_ylabel("Feature #1") +axs[0, 0].set_xlabel("Feature #0") +axs[0, 0].set_title("Training data") + +axs[0, 1].scatter(X_test[:, 0], X_test[:, 1], c=y_test) +axs[0, 1].set_xlabel("Feature #0") +axs[0, 1].set_title("Testing data") + +axs[1, 0].scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test) +axs[1, 0].set_ylabel("Principal component #1") +axs[1, 0].set_xlabel("Principal component #0") +axs[1, 0].set_title("Projection using PCA") + +axs[1, 1].scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) +axs[1, 1].set_xlabel("Principal component #0") +axs[1, 1].set_title("Projection using KernelPCA") + +fig.subplots_adjust(hspace=0.4) + +# %% +# We recall that PCA will project the data using a linear projection. +# Intuitively, it means that the coordinate system will be rotated with an some +# rescaling of the axis. This rescaling will depend on the variance of the +# data. +# +# Thus, looking at the projection made using PCA (i.e. figure on the +# bottom-left), we see that there is no change regarding the scaling; indeed +# the data being two concentric circles centered in zero, the variance of the +# original data was already maximized. 
However, we can see that the data have +# been rotated. As a conclusion, we see that such a projection would not help +# if define a linear classifier to distinguish samples from both classes. +# +# Using a kernel allows to make a non-linear projection. Here, by using an RBF +# kernel, we expect that the projection to unfold the dataset but keeping that +# point close in the original space should still be close in the new space. +# +# We observe such behaviour in the bottom-right figure: the samples of a given +# class are closer to each other than the samples from the opposite class. The +# "radial" effect make that we unrolled the circle. Now, we can use a linear +# classifier to separate the samples from the two classes. +# +# Projecting into the original feature space +# ------------------------------------------ +# +# One particularity to have in mind when using +# :class:`~sklearn.decomposition.KernelPCA` is related to the reconstruction +# (i.e. the back projection in the original feature space). With +# :class:`~sklearn.decomposition.PCA`, the reconstruction will be exact if +# `n_components` is the same than the number of original features as in this +# example. Thus, projecting the data on the PCA basis and projecting back will +# give the same dataset. +# +# We can investigate if we get a similar outcome with +# :class:`~sklearn.decomposition.KernelPCA`. +X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test)) +X_reconstructed_kernel_pca = kernel_pca.inverse_transform( + kernel_pca.transform(X_test)) + +# %% +fig, axs = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(13, 4)) + +axs[0].scatter(X_test[:, 0], X_test[:, 1], c=y_test) +axs[0].set_ylabel("Feature #1") +axs[0].set_xlabel("Feature #0") +axs[0].set_title("Original test data") + +axs[1].scatter(X_reconstructed_pca[:, 0], X_reconstructed_pca[:, 1], c=y_test) +axs[1].set_xlabel("Feature #0") +axs[1].set_title("Reconstruction via PCA") + +axs[2].scatter(X_reconstructed_kernel_pca[:, 0], + X_reconstructed_kernel_pca[:, 1], c=y_test) +axs[2].set_xlabel("Feature #0") +_ = axs[2].set_title("Reconstruction via KernelPCA") + +# %% +# While we see a perfect reconstruction, we observe a different results for +# :class:`~sklearn.decomposition.KernelPCA`. Indeed, +# :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot rely on an +# analytical back-projection and thus an extact reconstruction. Instead, a +# :class:`~sklearn.linear_model.KernelRidge` was trained to learn a projection +# function to map a sample from the PCA basis into the original feature space. +# This method is therefore an approximation leading to small difference. The +# parameter `alpha` in the :class:`~sklearn.decomposition.KernelPCA` is used +# to penalized the mapping function to fit more or less the training data. +# +# Application to image denoising +# ------------------------------ +# +# In this section, we will show how one can use the approximation function +# learned to denoise image. 
+ +# %% +import numpy as np +from sklearn.datasets import fetch_openml +from sklearn.preprocessing import MinMaxScaler +from sklearn.model_selection import train_test_split + +X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True) +X = MinMaxScaler().fit_transform(X) +X_train, X_test, y_train, y_test = train_test_split( + X, y, stratify=y, random_state=0, train_size=1_000, test_size=100 +) + +rng = np.random.RandomState(0) +noise = rng.normal(scale=0.25, size=X_test.shape) +X_test_noisy = X_test + noise + + +# %% +def plot_digits(X, title): + """Small helper function to plot 100 digits.""" + fig, axs = plt.subplots(nrows=10, ncols=10, figsize=(8, 8)) + for img, ax in zip(X, axs.ravel()): + ax.imshow(img.reshape((16, 16)), cmap="Greys") + ax.axis("off") + fig.suptitle(title, fontsize=30) + + +# %% +plot_digits(X_train, "Uncorrupted train images") +plot_digits(X_test, "Uncorrupted test images") +plot_digits(X_test_noisy, + f"Noisy test images - " + f"MSE: {np.mean((X_test - X_test_noisy) ** 2):.2f}") + +# %% +# We created a training and testing sets with 100 digits in each set. Also, +# we created a corrupted testing set that correspond to the original test set +# with additional Gaussian noise. +# +# The idea of this section, is to show that we can denoise the corrupted test +# set by a learning a PCA basis on the uncorrupted train set. We will use +# both a PCA and a kernel-based PCA. +pca = PCA(n_components=32) +kernel_pca = KernelPCA(n_components=200, kernel="rbf", gamma=1e-3, + fit_inverse_transform=True, alpha=10) + +pca.fit(X_train) +_ = kernel_pca.fit(X_train) + +# %% +# Now, can transform and reconstruct the noisy test set. Since we used less +# components than the number of original features, we will get an approximation +# of the original set. Indeed, by dropping the components explaining less +# variance in PCA, we hope to remove noise. Similar thinking happen in kernel +# PCA; however, we expect a better reconstruction because we use a non-linear +# kernel to learn the PCA basis and a kernel ridge to learn the mapping +# function. 
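# %%
# A hedged aside (an editorial sketch, not a line of this patch): a quick way to
# sanity-check the `n_components=32` choice above is to look at how much of the
# training-set variance those components retain; `pca` is assumed to be the
# estimator fitted just above, and the exact cutoff remains a judgment call
# rather than a rule.
print(f"Variance retained by the 32 PCA components: "
      f"{pca.explained_variance_ratio_.sum():.1%}")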
+X_reconstructed_kernel_pca = kernel_pca.inverse_transform( + kernel_pca.transform(X_test_noisy)) +X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test_noisy)) + +# %% +plot_digits(X_test, "Uncorrupted test images") +plot_digits(X_reconstructed_pca, + f"PCA reconstruction - " + f"MSE: {np.mean((X_test - X_reconstructed_pca) ** 2):.2f}") +plot_digits(X_reconstructed_kernel_pca, + f"Kernel PCA reconstruction - " + f"MSE: {np.mean((X_test - X_reconstructed_kernel_pca) ** 2):.2f}") + +# %% diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 5655eddb0bf31..8bfcea68bb811 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -260,6 +260,14 @@ def _fit_inverse_transform(self, X_transformed, X): raise NotImplementedError("Inverse transform not implemented for " "sparse matrices!") + # from sklearn.kernel_ridge import KernelRidge + # self._inverse_transformer = KernelRidge( + # alpha=self.alpha, kernel=self.kernel, gamma=self.gamma, + # degree=self.degree, coef0=self.coef0, + # kernel_params=self.kernel_params, + # ) + # self._inverse_transformer.fit(X_transformed, X) + n_samples = X_transformed.shape[0] K = self._get_kernel(X_transformed) K.flat[::n_samples + 1] += self.alpha @@ -362,10 +370,10 @@ def inverse_transform(self, X): raise NotFittedError("The fit_inverse_transform parameter was not" " set to True when instantiating and hence " "the inverse transform is not available.") - + # return self._inverse_transformer.predict(X) K = self._get_kernel(X, self.X_transformed_fit_) - n_samples = self.X_transformed_fit_.shape[0] - K.flat[::n_samples + 1] += self.alpha + # n_samples = self.X_transformed_fit_.shape[0] + # K.flat[::n_samples + 1] += self.alpha return np.dot(K, self.dual_coef_) def _more_tags(self): From 736ea5143ac23bc88632ad4fa32564dfdf265f54 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 21 Apr 2021 10:34:04 +0200 Subject: [PATCH 02/18] iter --- examples/decomposition/plot_kernel_pca.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index 03a167b27c304..84db78e9cdbba 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -89,11 +89,11 @@ axs[1, 0].scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test) axs[1, 0].set_ylabel("Principal component #1") axs[1, 0].set_xlabel("Principal component #0") -axs[1, 0].set_title("Projection using PCA") +axs[1, 0].set_title("Projection of testing data\n using PCA") axs[1, 1].scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) axs[1, 1].set_xlabel("Principal component #0") -axs[1, 1].set_title("Projection using KernelPCA") +axs[1, 1].set_title("Projection of testing data\n using KernelPCA") fig.subplots_adjust(hspace=0.4) @@ -205,16 +205,16 @@ def plot_digits(X, title): f"MSE: {np.mean((X_test - X_test_noisy) ** 2):.2f}") # %% -# We created a training and testing sets with 100 digits in each set. Also, -# we created a corrupted testing set that correspond to the original test set -# with additional Gaussian noise. +# We created a training and testing of 1,000 samples and a test set of 100 +# samples. Also, we created a corrupted testing set that correspond to the +# original test set with additional Gaussian noise. # # The idea of this section, is to show that we can denoise the corrupted test # set by a learning a PCA basis on the uncorrupted train set. 
We will use # both a PCA and a kernel-based PCA. pca = PCA(n_components=32) kernel_pca = KernelPCA(n_components=200, kernel="rbf", gamma=1e-3, - fit_inverse_transform=True, alpha=10) + fit_inverse_transform=True, alpha=5e-3) pca.fit(X_train) _ = kernel_pca.fit(X_train) @@ -241,3 +241,7 @@ def plot_digits(X, title): f"MSE: {np.mean((X_test - X_reconstructed_kernel_pca) ** 2):.2f}") # %% +# Even if both PCA and kernel PCA have the same MSE, a qualitative analysis +# will favor the output of the kernel PCA. However, it should be noted that +# the results of the denoising with kernel PCA will depend of the parameters +# `n_components`, `gamma`, and `alpha`. From e61baf6284f0f0e49f9602b28ceddaca6ab35726 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 21 Apr 2021 10:41:25 +0200 Subject: [PATCH 03/18] iter --- doc/modules/decomposition.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index e971d784c63d6..0e05eba9cdfd4 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -183,16 +183,22 @@ has many applications including denoising, compression and structured prediction (kernel dependency estimation). :class:`KernelPCA` supports both ``transform`` and ``inverse_transform``. -.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_001.png +.. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_002.png :target: ../auto_examples/decomposition/plot_kernel_pca.html :align: center :scale: 75% +.. note:: + :meth:`KernelPCA.inverse_transform` relies on a kernel ridge to learn the + function to map samples from the PCA basis into the original feature + spaces. Thus, the reconstruction obtained with + :meth:`KernelPCA.inverse_transform` is an approximation. See the example + linked below to go more into details. + .. topic:: Examples: * :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py` - .. _SparsePCA: Sparse principal components analysis (SparsePCA and MiniBatchSparsePCA) From a4a9ab28da98e3d24114b453642e77583adcc773 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 21 Apr 2021 10:52:31 +0200 Subject: [PATCH 04/18] add info references --- doc/modules/decomposition.rst | 24 +++++++++++++++++++----- sklearn/decomposition/_kernel_pca.py | 25 +++++++++++++++++-------- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index 0e05eba9cdfd4..4bd9fd54c3631 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -178,10 +178,10 @@ Kernel PCA ---------- :class:`KernelPCA` is an extension of PCA which achieves non-linear -dimensionality reduction through the use of kernels (see :ref:`metrics`). It -has many applications including denoising, compression and structured -prediction (kernel dependency estimation). :class:`KernelPCA` supports both -``transform`` and ``inverse_transform``. +dimensionality reduction through the use of kernels (see :ref:`metrics`) +[Scholkopf1997]_. It has many applications including denoising, compression and +structured prediction (kernel dependency estimation). :class:`KernelPCA` +supports both ``transform`` and ``inverse_transform``. .. figure:: ../auto_examples/decomposition/images/sphx_glr_plot_kernel_pca_002.png :target: ../auto_examples/decomposition/plot_kernel_pca.html @@ -191,7 +191,7 @@ prediction (kernel dependency estimation). :class:`KernelPCA` supports both .. 
note:: :meth:`KernelPCA.inverse_transform` relies on a kernel ridge to learn the function to map samples from the PCA basis into the original feature - spaces. Thus, the reconstruction obtained with + spaces [Bakir2004]_. Thus, the reconstruction obtained with :meth:`KernelPCA.inverse_transform` is an approximation. See the example linked below to go more into details. @@ -199,6 +199,20 @@ prediction (kernel dependency estimation). :class:`KernelPCA` supports both * :ref:`sphx_glr_auto_examples_decomposition_plot_kernel_pca.py` +.. topic:: References + + .. [Scholkopf1997] Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller. + `"Kernel principal component analysis." + `_ + International conference on artificial neural networks. + Springer, Berlin, Heidelberg, 1997. + + .. [Bakir2004] Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + `"Learning to find pre-images." + `_ + Advances in neural information processing systems 16 (2004): 449-456. + + .. _SparsePCA: Sparse principal components analysis (SparsePCA and MiniBatchSparsePCA) diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index af9f525d31aac..d2ad6b6d176d8 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -56,8 +56,8 @@ class KernelPCA(TransformerMixin, BaseEstimator): inverse transform (when fit_inverse_transform=True). fit_inverse_transform : bool, default=False - Learn the inverse transform for non-precomputed kernels. - (i.e. learn to find the pre-image of a point) + Learn the inverse transform for non-precomputed kernels (i.e. learn to + find the pre-image of a point). This method is based on [2]_. eigen_solver : {'auto', 'dense', 'arpack'}, default='auto' Select eigensolver to use. If n_components is much less than @@ -136,11 +136,16 @@ class KernelPCA(TransformerMixin, BaseEstimator): References ---------- - Kernel PCA was introduced in: - Bernhard Schoelkopf, Alexander J. Smola, - and Klaus-Robert Mueller. 1999. Kernel principal - component analysis. In Advances in kernel methods, - MIT Press, Cambridge, MA, USA 327-352. + .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller. + "Kernel principal component analysis." + International conference on artificial neural networks. + Springer, Berlin, Heidelberg, 1997. + `_ + + .. [2] `Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + "Learning to find pre-images." + Advances in neural information processing systems 16 (2004): 449-456. + `_ """ @_deprecate_positional_args def __init__(self, n_components=None, *, kernel="linear", @@ -376,12 +381,16 @@ def inverse_transform(self, X): References ---------- - "Learning to Find Pre-Images", G BakIr et al, 2004. + Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + "Learning to find pre-images." + Advances in neural information processing systems 16 (2004): 449-456. + `_ """ if not self.fit_inverse_transform: raise NotFittedError("The fit_inverse_transform parameter was not" " set to True when instantiating and hence " "the inverse transform is not available.") + K = self._get_kernel(X, self.X_transformed_fit_) return np.dot(K, self.dual_coef_) From e1f090088b6d7dc76619e2170a29173bca1ac0fb Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 31 May 2021 14:37:28 +0200 Subject: [PATCH 05/18] Apply suggestions from code review Co-authored-by: Thomas J. 
Fan --- examples/decomposition/plot_kernel_pca.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index 84db78e9cdbba..ef102f4ec8c5a 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -12,8 +12,8 @@ not the case with :class:`~sklearn.decomposition.PCA`. -On the other hand, we show the limitation of inverting this this projection -that is only an approximation with :class:`~sklearn.decomposition.KernelPCA` +On the other hand, we show that inverting this projection is an +approximation with :class:`~sklearn.decomposition.KernelPCA`, while being exact with :class:`~sklearn.decomposition.PCA`. Finally, we show that this limitation can be useful in some applications such @@ -66,7 +66,7 @@ # kernel. from sklearn.decomposition import PCA, KernelPCA -pca = PCA(n_components=None) +pca = PCA(n_components=2) kernel_pca = KernelPCA( n_components=None, kernel="rbf", gamma=10, fit_inverse_transform=True, alpha=0.1) @@ -108,7 +108,7 @@ # the data being two concentric circles centered in zero, the variance of the # original data was already maximized. However, we can see that the data have # been rotated. As a conclusion, we see that such a projection would not help -# if define a linear classifier to distinguish samples from both classes. +# define a linear classifier to distinguish samples from both classes. # # Using a kernel allows to make a non-linear projection. Here, by using an RBF # kernel, we expect that the projection to unfold the dataset but keeping that @@ -177,10 +177,12 @@ from sklearn.model_selection import train_test_split X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True) -X = MinMaxScaler().fit_transform(X) X_train, X_test, y_train, y_test = train_test_split( X, y, stratify=y, random_state=0, train_size=1_000, test_size=100 ) +min_max_scaler = MinMaxScaler() +X_train = min_max_scaler.fit_transform(X_train) +X_test = min_max_scaler.transform(X_test) rng = np.random.RandomState(0) noise = rng.normal(scale=0.25, size=X_test.shape) From b9d6696b2e3c5fd3c063d42f801a7d8a1689b6fa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 10 Jun 2021 17:59:23 +0200 Subject: [PATCH 06/18] DOC simplify example --- examples/decomposition/plot_kernel_pca.py | 141 ++++------------------ 1 file changed, 25 insertions(+), 116 deletions(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index ef102f4ec8c5a..a6e62014bbe7e 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -3,21 +3,17 @@ Kernel PCA ========== -This example shows the difference between Principal Components Analysis -(:class:`~sklearn.decomposition.PCA`) and -:class:`~sklearn.decomposition.KernelPCA`. +This example shows the difference between the Principal Components Analysis +(:class:`~sklearn.decomposition.PCA`) and its kernalize version +(:class:`~sklearn.decomposition.KernelPCA`). On the one hand, we show that :class:`~sklearn.decomposition.KernelPCA` is able to find a projection of the data that makes them linearly separable while it is -not the case with -:class:`~sklearn.decomposition.PCA`. +not the case with :class:`~sklearn.decomposition.PCA`. On the other hand, we show that inverting this projection is an approximation with :class:`~sklearn.decomposition.KernelPCA`, while being exact with :class:`~sklearn.decomposition.PCA`. 
- -Finally, we show that this limitation can be useful in some applications such -as image denoising. """ print(__doc__) @@ -75,27 +71,24 @@ X_test_kernel_pca = kernel_pca.fit(X_train).transform(X_test) # %% -fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(8, 8)) - -axs[0, 0].scatter(X_train[:, 0], X_train[:, 1], c=y_train) -axs[0, 0].set_ylabel("Feature #1") -axs[0, 0].set_xlabel("Feature #0") -axs[0, 0].set_title("Training data") +fig, axs = plt.subplots(ncols=3, figsize=(14, 4)) -axs[0, 1].scatter(X_test[:, 0], X_test[:, 1], c=y_test) -axs[0, 1].set_xlabel("Feature #0") -axs[0, 1].set_title("Testing data") +axs[0].scatter(X_test[:, 0], X_test[:, 1], c=y_test) +axs[0].set_ylabel("Feature #1") +axs[0].set_xlabel("Feature #0") +axs[0].set_title("Testing data") -axs[1, 0].scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test) -axs[1, 0].set_ylabel("Principal component #1") -axs[1, 0].set_xlabel("Principal component #0") -axs[1, 0].set_title("Projection of testing data\n using PCA") +axs[1].scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test) +axs[1].set_ylabel("Principal component #1") +axs[1].set_xlabel("Principal component #0") +axs[1].set_title("Projection of testing data\n using PCA") -axs[1, 1].scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) -axs[1, 1].set_xlabel("Principal component #0") -axs[1, 1].set_title("Projection of testing data\n using KernelPCA") +axs[2].scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) +axs[2].set_ylabel("Principal component #1") +axs[2].set_xlabel("Principal component #0") +axs[2].set_title("Projection of testing data\n using KernelPCA") -fig.subplots_adjust(hspace=0.4) +fig.subplots_adjust(wspace=0.3) # %% # We recall that PCA will project the data using a linear projection. @@ -103,18 +96,18 @@ # rescaling of the axis. This rescaling will depend on the variance of the # data. # -# Thus, looking at the projection made using PCA (i.e. figure on the -# bottom-left), we see that there is no change regarding the scaling; indeed -# the data being two concentric circles centered in zero, the variance of the -# original data was already maximized. However, we can see that the data have -# been rotated. As a conclusion, we see that such a projection would not help -# define a linear classifier to distinguish samples from both classes. +# Thus, looking at the projection made using PCA (i.e. the middle figure), we +# see that there is no change regarding the scaling; indeed the data being two +# concentric circles centered in zero, the variance of the original data was +# already maximized. However, we can see that the data have been rotated. As a +# conclusion, we see that such a projection would not help if define a linear +# classifier to distinguish samples from both classes. # # Using a kernel allows to make a non-linear projection. Here, by using an RBF # kernel, we expect that the projection to unfold the dataset but keeping that # point close in the original space should still be close in the new space. # -# We observe such behaviour in the bottom-right figure: the samples of a given +# We observe such behaviour in the figure on the right: the samples of a given # class are closer to each other than the samples from the opposite class. The # "radial" effect make that we unrolled the circle. Now, we can use a linear # classifier to separate the samples from the two classes. @@ -163,87 +156,3 @@ # This method is therefore an approximation leading to small difference. 
The # parameter `alpha` in the :class:`~sklearn.decomposition.KernelPCA` is used # to penalized the mapping function to fit more or less the training data. -# -# Application to image denoising -# ------------------------------ -# -# In this section, we will show how one can use the approximation function -# learned to denoise image. - -# %% -import numpy as np -from sklearn.datasets import fetch_openml -from sklearn.preprocessing import MinMaxScaler -from sklearn.model_selection import train_test_split - -X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True) -X_train, X_test, y_train, y_test = train_test_split( - X, y, stratify=y, random_state=0, train_size=1_000, test_size=100 -) -min_max_scaler = MinMaxScaler() -X_train = min_max_scaler.fit_transform(X_train) -X_test = min_max_scaler.transform(X_test) - -rng = np.random.RandomState(0) -noise = rng.normal(scale=0.25, size=X_test.shape) -X_test_noisy = X_test + noise - - -# %% -def plot_digits(X, title): - """Small helper function to plot 100 digits.""" - fig, axs = plt.subplots(nrows=10, ncols=10, figsize=(8, 8)) - for img, ax in zip(X, axs.ravel()): - ax.imshow(img.reshape((16, 16)), cmap="Greys") - ax.axis("off") - fig.suptitle(title, fontsize=30) - - -# %% -plot_digits(X_train, "Uncorrupted train images") -plot_digits(X_test, "Uncorrupted test images") -plot_digits(X_test_noisy, - f"Noisy test images - " - f"MSE: {np.mean((X_test - X_test_noisy) ** 2):.2f}") - -# %% -# We created a training and testing of 1,000 samples and a test set of 100 -# samples. Also, we created a corrupted testing set that correspond to the -# original test set with additional Gaussian noise. -# -# The idea of this section, is to show that we can denoise the corrupted test -# set by a learning a PCA basis on the uncorrupted train set. We will use -# both a PCA and a kernel-based PCA. -pca = PCA(n_components=32) -kernel_pca = KernelPCA(n_components=200, kernel="rbf", gamma=1e-3, - fit_inverse_transform=True, alpha=5e-3) - -pca.fit(X_train) -_ = kernel_pca.fit(X_train) - -# %% -# Now, can transform and reconstruct the noisy test set. Since we used less -# components than the number of original features, we will get an approximation -# of the original set. Indeed, by dropping the components explaining less -# variance in PCA, we hope to remove noise. Similar thinking happen in kernel -# PCA; however, we expect a better reconstruction because we use a non-linear -# kernel to learn the PCA basis and a kernel ridge to learn the mapping -# function. -X_reconstructed_kernel_pca = kernel_pca.inverse_transform( - kernel_pca.transform(X_test_noisy)) -X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test_noisy)) - -# %% -plot_digits(X_test, "Uncorrupted test images") -plot_digits(X_reconstructed_pca, - f"PCA reconstruction - " - f"MSE: {np.mean((X_test - X_reconstructed_pca) ** 2):.2f}") -plot_digits(X_reconstructed_kernel_pca, - f"Kernel PCA reconstruction - " - f"MSE: {np.mean((X_test - X_reconstructed_kernel_pca) ** 2):.2f}") - -# %% -# Even if both PCA and kernel PCA have the same MSE, a qualitative analysis -# will favor the output of the kernel PCA. However, it should be noted that -# the results of the denoising with kernel PCA will depend of the parameters -# `n_components`, `gamma`, and `alpha`. 
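As a hedged illustration (an editorial sketch, not part of any commit above), the
approximate `inverse_transform` discussed in this series can be emulated with a plain
kernel ridge regression from the kernel-PCA coordinates back to the input space, which
mirrors the commented-out `KernelRidge` snippet left in `_kernel_pca.py` by the first
patch; the names `X`, `kpca` and `preimage` below are local to the sketch, and the code
is an analogue of that idea, not the library's exact code path.

import numpy as np
from sklearn.datasets import make_circles
from sklearn.decomposition import KernelPCA
from sklearn.kernel_ridge import KernelRidge

X, _ = make_circles(n_samples=400, factor=.3, noise=.05, random_state=0)
kpca = KernelPCA(n_components=2, kernel="rbf", gamma=10)
X_kpca = kpca.fit_transform(X)

# Pre-image map: an RBF kernel ridge from the transformed coordinates back to the
# two original features, with `alpha` acting as the ridge regularizer.
preimage = KernelRidge(kernel="rbf", gamma=10, alpha=0.1).fit(X_kpca, X)
X_back = preimage.predict(X_kpca)
print(f"KernelRidge analogue, reconstruction MSE: {np.mean((X - X_back) ** 2):.4f}")

# The built-in approximation exposes the same trade-off through `alpha`:
for alpha in (1e-3, 1e-1, 1e1):
    kpca_inv = KernelPCA(n_components=2, kernel="rbf", gamma=10,
                         fit_inverse_transform=True, alpha=alpha)
    X_back = kpca_inv.inverse_transform(kpca_inv.fit_transform(X))
    print(f"alpha={alpha:g}: reconstruction MSE = {np.mean((X - X_back) ** 2):.4f}")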
From a28ae8f254817343338b8323d4cd8a1c3ce7719f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 23 Jun 2021 10:53:03 +0200 Subject: [PATCH 07/18] apply suggestion of julien --- examples/decomposition/plot_kernel_pca.py | 42 +++++++++++++---------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index a6e62014bbe7e..5709264091e08 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -53,9 +53,9 @@ # %% # The samples from each class cannot be linearly separated: we come with a -# straightline that would split the samples from the inner circle to outer -# circle. Perfectly, a potential decision function would be a circle separating -# both circles. +# straight line that would split the samples from the inner set to outer +# one. Perfectly, a potential decision boundary would be a circle separating +# both sample sets. # # Now, we will use PCA with and without a kernel to see what is the effect of # using such a kernel. The kernel used here is a radial basis function (RBF) @@ -91,10 +91,9 @@ fig.subplots_adjust(wspace=0.3) # %% -# We recall that PCA will project the data using a linear projection. -# Intuitively, it means that the coordinate system will be rotated with an some -# rescaling of the axis. This rescaling will depend on the variance of the -# data. +# We recall that PCA will project the data linearly. Intuitively, it means that +# the coordinate system will be rotated after centering and rescaling on each +# axis. This rescaling will depend on the variance of the data. # # Thus, looking at the projection made using PCA (i.e. the middle figure), we # see that there is no change regarding the scaling; indeed the data being two @@ -108,9 +107,9 @@ # point close in the original space should still be close in the new space. # # We observe such behaviour in the figure on the right: the samples of a given -# class are closer to each other than the samples from the opposite class. The -# "radial" effect make that we unrolled the circle. Now, we can use a linear -# classifier to separate the samples from the two classes. +# class are closer to each other than the samples from the opposite class, +# untangling both sample sets. Now, we can use a linear classifier to separate +# the samples from the two classes. # # Projecting into the original feature space # ------------------------------------------ @@ -147,12 +146,17 @@ _ = axs[2].set_title("Reconstruction via KernelPCA") # %% -# While we see a perfect reconstruction, we observe a different results for -# :class:`~sklearn.decomposition.KernelPCA`. Indeed, -# :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot rely on an -# analytical back-projection and thus an extact reconstruction. Instead, a -# :class:`~sklearn.linear_model.KernelRidge` was trained to learn a projection -# function to map a sample from the PCA basis into the original feature space. -# This method is therefore an approximation leading to small difference. The -# parameter `alpha` in the :class:`~sklearn.decomposition.KernelPCA` is used -# to penalized the mapping function to fit more or less the training data. +# While we see a perfect reconstruction with +# :class:`~sklearn.decomposition.PCA` we observe a different results for +# :class:`~sklearn.decomposition.KernelPCA`. 
+# +# Indeed, :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot +# rely on an analytical back-projection and thus an extact reconstruction. +# Instead, a :class:`~sklearn.kernel_ridge.KernelRidge` was internally trained +# to learn a mapping from the PCA basis to the original feature space. This +# method is therefore an approximation leading to small difference. +# +# To improve the reconstruction using +# :meth:`~sklearn.decomposition.KernelPCA.inverse_transform`, one can tune +# `alpha` in :class:`~sklearn.decomposition.KernelPCA`, the regularization term +# which controls the reliance on the training data during the mapping training. From acfa8118ac30e2a772ab14d5667929d4e42ff169 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 23 Jun 2021 10:53:42 +0200 Subject: [PATCH 08/18] empty commit From b9f7387c8c9a70bcb989767e2b7ab299fd7568ae Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 24 Jun 2021 12:03:18 +0200 Subject: [PATCH 09/18] Apply suggestions from code review Co-authored-by: Olivier Grisel --- examples/decomposition/plot_kernel_pca.py | 32 ++++++++++++----------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index 5709264091e08..e4e729a819447 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -11,9 +11,9 @@ to find a projection of the data that makes them linearly separable while it is not the case with :class:`~sklearn.decomposition.PCA`. -On the other hand, we show that inverting this projection is an -approximation with :class:`~sklearn.decomposition.KernelPCA`, -while being exact with :class:`~sklearn.decomposition.PCA`. +Finally, we show that inverting this projection is an approximation with +:class:`~sklearn.decomposition.KernelPCA`, while inverting is exact with +:class:`~sklearn.decomposition.PCA`. """ print(__doc__) @@ -26,7 +26,7 @@ # Projecting data: `PCA` vs. `KernelPCA` # -------------------------------------- # -# In this section, we will show the advantages of using a kernel when +# In this section, we show the advantages of using a kernel when # projecting data using a Principal Component Analysis (PCA). We create a # dataset made of two nested circles. from sklearn.datasets import make_circles @@ -37,7 +37,7 @@ X, y, stratify=y, random_state=0) # %% -# Let's have a quick first look to the dataset generated. +# Let's have a quick first look at the generated dataset. import matplotlib.pyplot as plt _, axs = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(8, 4)) @@ -52,10 +52,9 @@ _ = axs[1].set_title("Testing data") # %% -# The samples from each class cannot be linearly separated: we come with a -# straight line that would split the samples from the inner set to outer -# one. Perfectly, a potential decision boundary would be a circle separating -# both sample sets. +# The samples from each class cannot be linearly separated: there is no +# straight line that can split the samples from the inner set from the outer +# set. # # Now, we will use PCA with and without a kernel to see what is the effect of # using such a kernel. The kernel used here is a radial basis function (RBF) @@ -91,7 +90,7 @@ fig.subplots_adjust(wspace=0.3) # %% -# We recall that PCA will project the data linearly. Intuitively, it means that +# We recall that PCA projects the data linearly. 
Intuitively, it means that # the coordinate system will be rotated after centering and rescaling on each # axis. This rescaling will depend on the variance of the data. # @@ -103,8 +102,9 @@ # classifier to distinguish samples from both classes. # # Using a kernel allows to make a non-linear projection. Here, by using an RBF -# kernel, we expect that the projection to unfold the dataset but keeping that -# point close in the original space should still be close in the new space. +# kernel, we expect that the projection to unfold the dataset while keeping +# approximately preserving the relative distances of pairs of data points that +# are close to one another in the original space. # # We observe such behaviour in the figure on the right: the samples of a given # class are closer to each other than the samples from the opposite class, @@ -152,11 +152,13 @@ # # Indeed, :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot # rely on an analytical back-projection and thus an extact reconstruction. -# Instead, a :class:`~sklearn.kernel_ridge.KernelRidge` was internally trained +# Instead, a :class:`~sklearn.kernel_ridge.KernelRidge` is internally trained # to learn a mapping from the PCA basis to the original feature space. This -# method is therefore an approximation leading to small difference. +# method is therefore an approximation introducing small differences when +# attempting to reconstruct the original input. # # To improve the reconstruction using # :meth:`~sklearn.decomposition.KernelPCA.inverse_transform`, one can tune # `alpha` in :class:`~sklearn.decomposition.KernelPCA`, the regularization term -# which controls the reliance on the training data during the mapping training. +# which controls the reliance on the training data during the training of +# the mapping. From 93a965aafd577ec35584ef88fb49e27b95dd9870 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 25 Jun 2021 14:56:01 +0200 Subject: [PATCH 10/18] FIX hyperlink in inverse_transform --- sklearn/decomposition/_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index 837b73389d223..8b0c224ef5166 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -468,7 +468,7 @@ def inverse_transform(self, X): References ---------- - Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. + `Bakır, Gökhan H., Jason Weston, and Bernhard Schölkopf. "Learning to find pre-images." Advances in neural information processing systems 16 (2004): 449-456. `_ From 0540fe5da5ec46c8c7f84e952c16972d3766c797 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 25 Jun 2021 16:07:12 +0200 Subject: [PATCH 11/18] Update examples/decomposition/plot_kernel_pca.py Co-authored-by: Christos Aridas --- examples/decomposition/plot_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index e4e729a819447..2f6d06213234f 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -4,7 +4,7 @@ ========== This example shows the difference between the Principal Components Analysis -(:class:`~sklearn.decomposition.PCA`) and its kernalize version +(:class:`~sklearn.decomposition.PCA`) and its kernalized version (:class:`~sklearn.decomposition.KernelPCA`). 
On the one hand, we show that :class:`~sklearn.decomposition.KernelPCA` is able From 42feb3f27ceeb0b8ffc8c14b823e569368766282 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 25 Jun 2021 16:07:18 +0200 Subject: [PATCH 12/18] Update doc/modules/decomposition.rst Co-authored-by: Christos Aridas --- doc/modules/decomposition.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index fe05804a4158a..3f601b6be6f2c 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -286,7 +286,7 @@ prediction (kernel dependency estimation). :class:`KernelPCA` supports both .. note:: :meth:`KernelPCA.inverse_transform` relies on a kernel ridge to learn the function to map samples from the PCA basis into the original feature - spaces [Bakir2004]_. Thus, the reconstruction obtained with + space [Bakir2004]_. Thus, the reconstruction obtained with :meth:`KernelPCA.inverse_transform` is an approximation. See the example linked below to go more into details. From 03c9887b8f076b585551c067fd4c5dbc645ab384 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 25 Jun 2021 17:16:09 +0200 Subject: [PATCH 13/18] Update examples/decomposition/plot_kernel_pca.py Co-authored-by: Christos Aridas --- examples/decomposition/plot_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index 2f6d06213234f..1e9fc2eb36887 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -53,7 +53,7 @@ # %% # The samples from each class cannot be linearly separated: there is no -# straight line that can split the samples from the inner set from the outer +# straight line that can split the samples of the inner set from the outer # set. # # Now, we will use PCA with and without a kernel to see what is the effect of From f470dd9b82ea017ede3605aca09eb6bf0e0368cd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 29 Jun 2021 15:33:35 +0200 Subject: [PATCH 14/18] Update examples/decomposition/plot_kernel_pca.py Co-authored-by: Christos Aridas --- examples/decomposition/plot_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index 1e9fc2eb36887..88d745de303f3 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -102,7 +102,7 @@ # classifier to distinguish samples from both classes. # # Using a kernel allows to make a non-linear projection. Here, by using an RBF -# kernel, we expect that the projection to unfold the dataset while keeping +# kernel, we expect that the projection will unfold the dataset while keeping # approximately preserving the relative distances of pairs of data points that # are close to one another in the original space. 
# From 8237ea1376699b2924e12b1a3699da3d1a34f9e7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Nov 2021 18:09:33 +0100 Subject: [PATCH 15/18] Apply suggestion Julien Co-authored-by: Julien Jerphanion --- doc/modules/decomposition.rst | 4 +- examples/decomposition/plot_kernel_pca.py | 95 ++++++++++++----------- sklearn/decomposition/_kernel_pca.py | 5 +- 3 files changed, 52 insertions(+), 52 deletions(-) diff --git a/doc/modules/decomposition.rst b/doc/modules/decomposition.rst index 371cfdf21fc07..eac8f063be258 100644 --- a/doc/modules/decomposition.rst +++ b/doc/modules/decomposition.rst @@ -285,10 +285,10 @@ prediction (kernel dependency estimation). :class:`KernelPCA` supports both .. note:: :meth:`KernelPCA.inverse_transform` relies on a kernel ridge to learn the - function to map samples from the PCA basis into the original feature + function mapping samples from the PCA basis into the original feature space [Bakir2004]_. Thus, the reconstruction obtained with :meth:`KernelPCA.inverse_transform` is an approximation. See the example - linked below to go more into details. + linked below for more details. .. topic:: Examples: diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index fee3152a9687c..faeec45ed6689 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -8,11 +8,11 @@ (:class:`~sklearn.decomposition.KernelPCA`). On the one hand, we show that :class:`~sklearn.decomposition.KernelPCA` is able -to find a projection of the data that makes them linearly separable while it is -not the case with :class:`~sklearn.decomposition.PCA`. +to find a projection of the data which linearly separates them while it is not the case +with :class:`~sklearn.decomposition.PCA`. Finally, we show that inverting this projection is an approximation with -:class:`~sklearn.decomposition.KernelPCA`, while inverting is exact with +:class:`~sklearn.decomposition.KernelPCA`, while it is exact with :class:`~sklearn.decomposition.PCA`. """ @@ -38,16 +38,16 @@ # Let's have a quick first look at the generated dataset. 
import matplotlib.pyplot as plt -_, axs = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(8, 4)) +_, (train_ax, test_ax) = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(8, 4)) -axs[0].scatter(X_train[:, 0], X_train[:, 1], c=y_train) -axs[0].set_ylabel("Feature #1") -axs[0].set_xlabel("Feature #0") -axs[0].set_title("Training data") +train_ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train) +train_ax.set_ylabel("Feature #1") +train_ax.set_xlabel("Feature #0") +train_ax.set_title("Training data") -axs[1].scatter(X_test[:, 0], X_test[:, 1], c=y_test) -axs[1].set_xlabel("Feature #0") -_ = axs[1].set_title("Testing data") +test_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test) +test_ax.set_xlabel("Feature #0") +_ = test_ax.set_title("Testing data") # %% # The samples from each class cannot be linearly separated: there is no @@ -68,32 +68,34 @@ X_test_kernel_pca = kernel_pca.fit(X_train).transform(X_test) # %% -fig, axs = plt.subplots(ncols=3, figsize=(14, 4)) +fig, (orig_data_ax, pca_proj_ax, kernel_pca_proj_ax) = plt.subplots(ncols=3, figsize=(14, 4)) -axs[0].scatter(X_test[:, 0], X_test[:, 1], c=y_test) -axs[0].set_ylabel("Feature #1") -axs[0].set_xlabel("Feature #0") -axs[0].set_title("Testing data") +orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test) +orig_data_ax.set_ylabel("Feature #1") +orig_data_ax.set_xlabel("Feature #0") +orig_data_ax.set_title("Testing data") -axs[1].scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test) -axs[1].set_ylabel("Principal component #1") -axs[1].set_xlabel("Principal component #0") -axs[1].set_title("Projection of testing data\n using PCA") +pca_proj_ax.scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test) +pca_proj_ax.set_ylabel("Principal component #1") +pca_proj_ax.set_xlabel("Principal component #0") +pca_proj_ax.set_title("Projection of testing data\n using PCA") -axs[2].scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) -axs[2].set_ylabel("Principal component #1") -axs[2].set_xlabel("Principal component #0") -axs[2].set_title("Projection of testing data\n using KernelPCA") +kernel_pca_proj_ax.scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) +kernel_pca_proj_ax.set_ylabel("Principal component #1") +kernel_pca_proj_ax.set_xlabel("Principal component #0") +kernel_pca_proj_ax.set_title("Projection of testing data\n using KernelPCA") # %% -# We recall that PCA projects the data linearly. Intuitively, it means that -# the coordinate system will be rotated after centering and rescaling on each -# axis. This rescaling will depend on the variance of the data. +# We recall that PCA transforms the data linearly. Intuitively, it means that +# the coordinate system will be centered, rescaled on each component +# with respected to its variance and finally be rotated. +# The obtained data from this transformation is isotropic and can now be +# projected on its _principal components_. # # Thus, looking at the projection made using PCA (i.e. the middle figure), we # see that there is no change regarding the scaling; indeed the data being two -# concentric circles centered in zero, the variance of the original data was -# already maximized. However, we can see that the data have been rotated. As a +# concentric circles centered in zero, the original data is already isotropic. +# However, we can see that the data have been rotated. As a # conclusion, we see that such a projection would not help if define a linear # classifier to distinguish samples from both classes. 
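# %%
# A hedged aside (an editorial sketch, not a line of this patch): the claim made
# here and in the next paragraph can be checked by fitting the same linear
# classifier on both projections. The snippet regenerates its own data so that it
# runs on its own; `X_c`, `y_c` and the split names are local to this sketch and
# the printed accuracies are indicative only.
from sklearn.datasets import make_circles
from sklearn.decomposition import PCA, KernelPCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_c, y_c = make_circles(n_samples=1_000, factor=.3, noise=.05, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_c, y_c, stratify=y_c, random_state=0)

for name, proj in [("PCA", PCA(n_components=2)),
                   ("KernelPCA", KernelPCA(n_components=2, kernel="rbf", gamma=10))]:
    clf = LogisticRegression().fit(proj.fit_transform(X_tr), y_tr)
    print(f"{name:>9} + linear classifier, test accuracy: "
          f"{clf.score(proj.transform(X_te), y_te):.2f}")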
# @@ -114,9 +116,8 @@ # :class:`~sklearn.decomposition.KernelPCA` is related to the reconstruction # (i.e. the back projection in the original feature space). With # :class:`~sklearn.decomposition.PCA`, the reconstruction will be exact if -# `n_components` is the same than the number of original features as in this -# example. Thus, projecting the data on the PCA basis and projecting back will -# give the same dataset. +# `n_components` is the same than the number of original features. +# This is the case in this example. # # We can investigate if we get a similar outcome with # :class:`~sklearn.decomposition.KernelPCA`. @@ -124,34 +125,34 @@ X_reconstructed_kernel_pca = kernel_pca.inverse_transform(kernel_pca.transform(X_test)) # %% -fig, axs = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(13, 4)) +fig, (orig_data_ax, pca_back_proj_ax, kernel_pca_back_proj_ax) = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(13, 4)) -axs[0].scatter(X_test[:, 0], X_test[:, 1], c=y_test) -axs[0].set_ylabel("Feature #1") -axs[0].set_xlabel("Feature #0") -axs[0].set_title("Original test data") +orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test) +orig_data_ax.set_ylabel("Feature #1") +orig_data_ax.set_xlabel("Feature #0") +orig_data_ax.set_title("Original test data") -axs[1].scatter(X_reconstructed_pca[:, 0], X_reconstructed_pca[:, 1], c=y_test) -axs[1].set_xlabel("Feature #0") -axs[1].set_title("Reconstruction via PCA") +pca_back_proj_ax.scatter(X_reconstructed_pca[:, 0], X_reconstructed_pca[:, 1], c=y_test) +pca_back_proj_ax.set_xlabel("Feature #0") +pca_back_proj_ax.set_title("Reconstruction via PCA") -axs[2].scatter( +kernel_pca_back_proj_ax.scatter( X_reconstructed_kernel_pca[:, 0], X_reconstructed_kernel_pca[:, 1], c=y_test ) -axs[2].set_xlabel("Feature #0") -_ = axs[2].set_title("Reconstruction via KernelPCA") +kernel_pca_back_proj_ax.set_xlabel("Feature #0") +_ = kernel_pca_back_proj_ax.set_title("Reconstruction via KernelPCA") # %% # While we see a perfect reconstruction with -# :class:`~sklearn.decomposition.PCA` we observe a different results for +# :class:`~sklearn.decomposition.PCA` we observe a different result for # :class:`~sklearn.decomposition.KernelPCA`. # # Indeed, :meth:`~sklearn.decomposition.KernelPCA.inverse_transform` cannot # rely on an analytical back-projection and thus an extact reconstruction. # Instead, a :class:`~sklearn.kernel_ridge.KernelRidge` is internally trained -# to learn a mapping from the PCA basis to the original feature space. This -# method is therefore an approximation introducing small differences when -# attempting to reconstruct the original input. +# to learn a mapping from the kernalized PCA basis to the original feature +# space. This method therefore comes with an approximation introducing small +# differences when back projecting in the original feature space. # # To improve the reconstruction using # :meth:`~sklearn.decomposition.KernelPCA.inverse_transform`, one can tune diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index f8a789daf4add..0efcf2d4fd341 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -216,11 +216,10 @@ class KernelPCA(_ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimato Advances in neural information processing systems 16 (2004): 449-456. `_ - .. [3] `Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp. + .. [3] :arxiv:`Halko, Nathan, Per-Gunnar Martinsson, and Joel A. Tropp. 
"Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions." - SIAM review 53.2 (2011): 217-288. - `_ + SIAM review 53.2 (2011): 217-288. <0909.4061>` .. [4] `Martinsson, Per-Gunnar, Vladimir Rokhlin, and Mark Tygert. "A randomized algorithm for the decomposition of matrices." From 005e76219c053de2c95685f30161f2f5e6253cae Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Nov 2021 18:19:01 +0100 Subject: [PATCH 16/18] Update examples/decomposition/plot_kernel_pca.py Co-authored-by: Julien Jerphanion --- examples/decomposition/plot_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index faeec45ed6689..cf166bbcf71e3 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -119,7 +119,7 @@ # `n_components` is the same than the number of original features. # This is the case in this example. # -# We can investigate if we get a similar outcome with +# We can investigate if we get the original dataset when back projecting with # :class:`~sklearn.decomposition.KernelPCA`. X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test)) X_reconstructed_kernel_pca = kernel_pca.inverse_transform(kernel_pca.transform(X_test)) From 4f301db2c7cfcd71a316f716d327eb08c2b8cdf4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Nov 2021 18:21:10 +0100 Subject: [PATCH 17/18] black --- examples/decomposition/plot_kernel_pca.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index cf166bbcf71e3..60e40065b2843 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -68,7 +68,9 @@ X_test_kernel_pca = kernel_pca.fit(X_train).transform(X_test) # %% -fig, (orig_data_ax, pca_proj_ax, kernel_pca_proj_ax) = plt.subplots(ncols=3, figsize=(14, 4)) +fig, (orig_data_ax, pca_proj_ax, kernel_pca_proj_ax) = plt.subplots( + ncols=3, figsize=(14, 4) +) orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test) orig_data_ax.set_ylabel("Feature #1") @@ -125,7 +127,9 @@ X_reconstructed_kernel_pca = kernel_pca.inverse_transform(kernel_pca.transform(X_test)) # %% -fig, (orig_data_ax, pca_back_proj_ax, kernel_pca_back_proj_ax) = plt.subplots(ncols=3, sharex=True, sharey=True, figsize=(13, 4)) +fig, (orig_data_ax, pca_back_proj_ax, kernel_pca_back_proj_ax) = plt.subplots( + ncols=3, sharex=True, sharey=True, figsize=(13, 4) +) orig_data_ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test) orig_data_ax.set_ylabel("Feature #1") From 1a52093d37992582eb9b1a3fa4a628ad93579936 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Nov 2021 21:49:06 +0100 Subject: [PATCH 18/18] Update plot_kernel_pca.py --- examples/decomposition/plot_kernel_pca.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/decomposition/plot_kernel_pca.py b/examples/decomposition/plot_kernel_pca.py index 60e40065b2843..fe6d63240523e 100644 --- a/examples/decomposition/plot_kernel_pca.py +++ b/examples/decomposition/plot_kernel_pca.py @@ -85,7 +85,7 @@ kernel_pca_proj_ax.scatter(X_test_kernel_pca[:, 0], X_test_kernel_pca[:, 1], c=y_test) kernel_pca_proj_ax.set_ylabel("Principal component #1") kernel_pca_proj_ax.set_xlabel("Principal component #0") -kernel_pca_proj_ax.set_title("Projection of testing data\n using KernelPCA") +_ = 
kernel_pca_proj_ax.set_title("Projection of testing data\n using KernelPCA") # %% # We recall that PCA transforms the data linearly. Intuitively, it means that