From 8db148a0b79ab6e9020fccbd703d508fcfafa716 Mon Sep 17 00:00:00 2001
From: Christopher Yeh <chrisyeh96@users.noreply.github.com>
Date: Tue, 16 Mar 2021 01:23:14 -0600
Subject: [PATCH 1/4] Improve documentation consistency for
 GaussianProcessRegressor

---
 sklearn/gaussian_process/_gpr.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py
index b4ab0441efc71..e5f44c7d3a3d2 100644
--- a/sklearn/gaussian_process/_gpr.py
+++ b/sklearn/gaussian_process/_gpr.py
@@ -30,9 +30,9 @@ class GaussianProcessRegressor(MultiOutputMixin,
     GaussianProcessRegressor:
 
        * allows prediction without prior fitting (based on the GP prior)
-       * provides an additional method sample_y(X), which evaluates samples
+       * provides an additional method `sample_y(X)`, which evaluates samples
          drawn from the GPR (prior or posterior) at given inputs
-       * exposes a method log_marginal_likelihood(theta), which can be used
+       * exposes a method `log_marginal_likelihood(theta)`, which can be used
          externally for other ways of selecting hyperparameters, e.g., via
          Markov chain Monte Carlo.
 
@@ -68,8 +68,8 @@ class GaussianProcessRegressor(MultiOutputMixin,
         must have the signature::
 
             def optimizer(obj_func, initial_theta, bounds):
-                # * 'obj_func' is the objective function to be minimized, which
-                #   takes the hyperparameters theta as parameter and an
+                # * 'obj_func': the objective function to be minimized, which
+                #   takes the hyperparameters theta as a parameter and an
                 #   optional flag eval_gradient, which determines if the
                 #   gradient is returned additionally to the function value
                 # * 'initial_theta': the initial value for theta, which can be
@@ -80,7 +80,7 @@ def optimizer(obj_func, initial_theta, bounds):
                 # the corresponding value of the target function.
                 return theta_opt, func_min
 
-        Per default, the 'L-BGFS-B' algorithm from scipy.optimize.minimize
+        Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize
         is used. If None is passed, the kernel's parameters are kept fixed.
         Available internal optimizers are::
 
@@ -113,7 +113,7 @@ def optimizer(obj_func, initial_theta, bounds):
     random_state : int, RandomState instance or None, default=None
         Determines random number generation used to initialize the centers.
         Pass an int for reproducible results across multiple function calls.
-        See :term: `Glossary <random_state>`.
+        See :term:`Glossary <random_state>`.
 
     Attributes
     ----------
@@ -302,7 +302,7 @@ def predict(self, X, return_std=False, return_cov=False):
 
         Returns
         -------
-        y_mean : ndarray of shape (n_samples, [n_output_dims])
+        y_mean : ndarray of shape (n_samples, [n_targets])
             Mean of predictive distribution a query points.
 
         y_std : ndarray of shape (n_samples,), optional
@@ -399,11 +399,11 @@ def sample_y(self, X, n_samples=1, random_state=0):
             Determines random number generation to randomly draw samples.
             Pass an int for reproducible results across multiple function
             calls.
-            See :term: `Glossary <random_state>`.
+            See :term:`Glossary <random_state>`.
 
         Returns
         -------
-        y_samples : ndarray of shape (n_samples_X, [n_output_dims], n_samples)
+        y_samples : ndarray of shape (n_samples, [n_targets], n_samples)
             Values of n_samples samples drawn from Gaussian process and
             evaluated at query points.
         """

From 44c834f77f705c9e29f611aa79143fec541bb948 Mon Sep 17 00:00:00 2001
From: Christopher Yeh <chrisyeh96@users.noreply.github.com>
Date: Mon, 22 Mar 2021 15:45:07 -0600
Subject: [PATCH 2/4] Apply suggestions from code review

Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
---
 sklearn/gaussian_process/_gpr.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py
index e5f44c7d3a3d2..0f2021eb26690 100644
--- a/sklearn/gaussian_process/_gpr.py
+++ b/sklearn/gaussian_process/_gpr.py
@@ -302,7 +302,7 @@ def predict(self, X, return_std=False, return_cov=False):
 
         Returns
         -------
-        y_mean : ndarray of shape (n_samples, [n_targets])
+        y_mean : ndarray of shape (n_samples,) or (n_samples, n_targets)
             Mean of predictive distribution a query points.
 
         y_std : ndarray of shape (n_samples,), optional
@@ -403,7 +403,8 @@ def sample_y(self, X, n_samples=1, random_state=0):
 
         Returns
         -------
-        y_samples : ndarray of shape (n_samples, [n_targets], n_samples)
+        y_samples : ndarray of shape (n_samples, n_samples), or \
+            (n_samples, n_targets, n_samples)
             Values of n_samples samples drawn from Gaussian process and
             evaluated at query points.
         """

From 68b04ec22e8acf447a8c1223554abdd6f14c57e3 Mon Sep 17 00:00:00 2001
From: Christopher Yeh <chrisyeh96@users.noreply.github.com>
Date: Tue, 23 Mar 2021 14:52:00 -0600
Subject: [PATCH 3/4] More code / documentation improvements for clarity in GPR

---
 sklearn/gaussian_process/_gpr.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py
index 0f2021eb26690..8a80761cf3841 100644
--- a/sklearn/gaussian_process/_gpr.py
+++ b/sklearn/gaussian_process/_gpr.py
@@ -211,8 +211,8 @@ def fit(self, X, y):
             if self.alpha.shape[0] == 1:
                 self.alpha = self.alpha[0]
             else:
-                raise ValueError("alpha must be a scalar or an array"
-                                 " with same number of entries as y.(%d != %d)"
+                raise ValueError("alpha must be a scalar or an array "
+                                 "with same number of entries as y. (%d != %d)"
                                  % (self.alpha.shape[0], y.shape[0]))
 
         self.X_train_ = np.copy(X) if self.copy_X_train else X
@@ -283,9 +283,9 @@ def predict(self, X, return_std=False, return_cov=False):
         """Predict using the Gaussian process regression model
 
         We can also predict based on an unfitted model by using the GP prior.
-        In addition to the mean of the predictive distribution, also its
-        standard deviation (return_std=True) or covariance (return_cov=True).
-        Note that at most one of the two can be requested.
+        In addition to the mean of the predictive distribution, optionally also
+        returns its standard deviation (`return_std=True`) or covariance
+        (`return_cov=True`). Note that at most one of the two can be requested.
 
         Parameters
         ----------
@@ -315,8 +315,7 @@ def predict(self, X, return_std=False, return_cov=False):
         """
         if return_std and return_cov:
             raise RuntimeError(
-                "Not returning standard deviation of predictions when "
-                "returning full covariance.")
+                "At most one of return_std or return_cov can be requested.")
 
         if self.kernel is None or self.kernel.requires_vector_input:
             X = self._validate_data(X, ensure_2d=True, dtype="numeric",
@@ -342,14 +341,14 @@ def predict(self, X, return_std=False, return_cov=False):
                 return y_mean
         else:  # Predict based on GP posterior
             K_trans = self.kernel_(X, self.X_train_)
-            y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)
+            y_mean = K_trans @ self.alpha_  # Line 4 (y_mean = f_star)
 
             # undo normalisation
             y_mean = self._y_train_std * y_mean + self._y_train_mean
 
             if return_cov:
                 v = cho_solve((self.L_, True), K_trans.T)  # Line 5
-                y_cov = self.kernel_(X) - K_trans.dot(v)  # Line 6
+                y_cov = self.kernel_(X) - K_trans @ v  # Line 6
 
                 # undo normalisation
                 y_cov = y_cov * self._y_train_std**2
@@ -362,12 +361,12 @@ def predict(self, X, return_std=False, return_cov=False):
                     # decomposition L and its inverse L_inv
                     L_inv = solve_triangular(self.L_.T,
                                              np.eye(self.L_.shape[0]))
-                    self._K_inv = L_inv.dot(L_inv.T)
+                    self._K_inv = L_inv @ L_inv.T
 
                 # Compute variance of predictive distribution
                 y_var = self.kernel_.diag(X)
                 y_var -= np.einsum("ij,ij->i",
-                                   np.dot(K_trans, self._K_inv), K_trans)
+                                   K_trans @ self._K_inv, K_trans)
 
                 # Check if any of the variances is negative because of
                 # numerical issues. If yes: set the variance to 0.
@@ -389,11 +388,11 @@ def sample_y(self, X, n_samples=1, random_state=0):
 
         Parameters
         ----------
-        X : array-like of shape (n_samples, n_features) or list of object
+        X : array-like of shape (n_samples_X, n_features) or list of object
             Query points where the GP is evaluated.
 
         n_samples : int, default=1
-            The number of samples drawn from the Gaussian process
+            Number of samples drawn from the Gaussian process per query point.
 
         random_state : int, RandomState instance or None, default=0
             Determines random number generation to randomly draw samples.
@@ -403,8 +402,8 @@ def sample_y(self, X, n_samples=1, random_state=0):
 
         Returns
         -------
-        y_samples : ndarray of shape (n_samples, n_samples), or \
-            (n_samples, n_targets, n_samples)
+        y_samples : ndarray of shape (n_samples_X, n_samples), or \
+            (n_samples_X, n_targets, n_samples)
             Values of n_samples samples drawn from Gaussian process and
             evaluated at query points.
         """

From fe67c9e798657843ae228c9f6118e550a7fd5b99 Mon Sep 17 00:00:00 2001
From: Christopher Yeh <chrisyeh96@users.noreply.github.com>
Date: Tue, 23 Mar 2021 16:21:13 -0600
Subject: [PATCH 4/4] Undo np.dot -> @

---
 sklearn/gaussian_process/_gpr.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py
index 8a80761cf3841..4e8814dd69951 100644
--- a/sklearn/gaussian_process/_gpr.py
+++ b/sklearn/gaussian_process/_gpr.py
@@ -341,14 +341,14 @@ def predict(self, X, return_std=False, return_cov=False):
                 return y_mean
         else:  # Predict based on GP posterior
             K_trans = self.kernel_(X, self.X_train_)
-            y_mean = K_trans @ self.alpha_  # Line 4 (y_mean = f_star)
+            y_mean = K_trans.dot(self.alpha_)  # Line 4 (y_mean = f_star)
 
             # undo normalisation
             y_mean = self._y_train_std * y_mean + self._y_train_mean
 
             if return_cov:
                 v = cho_solve((self.L_, True), K_trans.T)  # Line 5
-                y_cov = self.kernel_(X) - K_trans @ v  # Line 6
+                y_cov = self.kernel_(X) - K_trans.dot(v)  # Line 6
 
                 # undo normalisation
                 y_cov = y_cov * self._y_train_std**2
@@ -361,12 +361,12 @@ def predict(self, X, return_std=False, return_cov=False):
                     # decomposition L and its inverse L_inv
                     L_inv = solve_triangular(self.L_.T,
                                              np.eye(self.L_.shape[0]))
-                    self._K_inv = L_inv @ L_inv.T
+                    self._K_inv = L_inv.dot(L_inv.T)
 
                 # Compute variance of predictive distribution
                 y_var = self.kernel_.diag(X)
                 y_var -= np.einsum("ij,ij->i",
-                                   K_trans @ self._K_inv, K_trans)
+                                   np.dot(K_trans, self._K_inv), K_trans)
 
                 # Check if any of the variances is negative because of
                 # numerical issues. If yes: set the variance to 0.