
DOC Rework plot_ard.py example #22747


Merged: 30 commits, Apr 21, 2022
Commits (30)
9778afe
First draft
Mar 8, 2022
714150b
Merge branch 'main' of https://github.com/scikit-learn/scikit-learn i…
Mar 10, 2022
1d75401
Rework plot_ard example to include comparison with Bayesian Ridge
Mar 10, 2022
2521a7b
Improve format
Mar 10, 2022
02f5d07
Simplify code
Mar 14, 2022
49f547c
Change descriptive text to subsection style
Mar 15, 2022
9ed30ca
Apply suggestions from code review
ArturoAmorQ Mar 16, 2022
af4fc6e
Change term weights in favor of coefficients
Mar 16, 2022
5e484e6
Change notation data-target to X-y
Mar 16, 2022
17d863c
Move interpretation after plot to improve notebook-style
Mar 16, 2022
71d127e
Improve gaussian intuition of noise
Mar 16, 2022
d20a71c
Correct ref
Mar 16, 2022
6fcd239
DOC Correct ARD section
jjerphan Mar 16, 2022
7a038ec
Apply suggestions from code review
Mar 16, 2022
ce92edb
Apply suggestions from code review
ArturoAmorQ Apr 6, 2022
8b3c450
Apply suggestions from code review
Apr 6, 2022
f577d98
Split example into sections and add explanations
Apr 6, 2022
55cd937
Fix labels in log likelihood plot
Apr 6, 2022
0ccfcbc
Improve descriptions
Apr 6, 2022
092aa0a
Redirect BayesianRidge example to ARD example
Apr 6, 2022
d2a8ca3
Delete BayesianRidge example
Apr 6, 2022
b0c4339
Solve conflict
Apr 7, 2022
e02f2fa
Remove images referencing to removed bayesian ridge example
Apr 7, 2022
841c6c5
Apply suggestions from code review
ArturoAmorQ Apr 7, 2022
4e0df6a
Apply suggestions from code review
Apr 7, 2022
9e4d759
Add extrapolation to polynomial plot
Apr 7, 2022
12bdb8c
Use Symmetric logarithmic to capture negative values
Apr 11, 2022
9ee28f8
Improve explanation
Apr 11, 2022
9c64c04
Add author
Apr 11, 2022
3818b33
Update examples/linear_model/plot_ard.py
ArturoAmorQ Apr 19, 2022
3 changes: 3 additions & 0 deletions doc/conf.py
@@ -253,6 +253,9 @@
"auto_examples/model_selection/plot_permutation_tests_for_classification"
),
"modules/model_persistence": "model_persistence",
"auto_examples/linear_model/plot_bayesian_ridge": (
"auto_examples/linear_model/plot_ard"
),
}
html_context["redirects"] = redirects
for old_link in redirects:
48 changes: 19 additions & 29 deletions doc/modules/linear_model.rst
@@ -746,13 +746,6 @@ There are four more hyperparameters, :math:`\alpha_1`, :math:`\alpha_2`,
:math:`\alpha` and :math:`\lambda`. These are usually chosen to be
*non-informative*. By default :math:`\alpha_1 = \alpha_2 = \lambda_1 = \lambda_2 = 10^{-6}`.
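
As an illustration only (a minimal sketch, assuming scikit-learn's public
constructor parameters), these hyperparameters can also be set explicitly::

    from sklearn.linear_model import BayesianRidge

    # The four gamma hyperparameters keep their non-informative defaults of
    # 1e-6 unless overridden explicitly.
    reg = BayesianRidge(alpha_1=1e-6, alpha_2=1e-6, lambda_1=1e-6, lambda_2=1e-6)
    reg.fit([[0.0, 0.0], [1.0, 1.0], [2.0, 2.0]], [0.0, 1.0, 2.0])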


.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_bayesian_ridge_001.png
:target: ../auto_examples/linear_model/plot_bayesian_ridge.html
:align: center
:scale: 50%


Bayesian Ridge Regression is used for regression::

>>> from sklearn import linear_model
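
As an illustration only (not the collapsed doctest above), a minimal
end-to-end sketch of the same usage::

    import numpy as np
    from sklearn.linear_model import BayesianRidge

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0.0, 0.9, 2.1, 2.9])
    reg = BayesianRidge().fit(X, y)
    # return_std=True also yields the standard deviation of the predictive
    # distribution at each query point
    mean, std = reg.predict(np.array([[4.0]]), return_std=True)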
@@ -778,7 +771,6 @@ is more robust to ill-posed problems.

.. topic:: Examples:

* :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge.py`
* :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`

.. topic:: References:
@@ -789,37 +781,35 @@

* Michael E. Tipping, `Sparse Bayesian Learning and the Relevance Vector Machine <http://www.jmlr.org/papers/volume1/tipping01a/tipping01a.pdf>`_, 2001.

.. _automatic_relevance_determination:

Automatic Relevance Determination - ARD
---------------------------------------

:class:`ARDRegression` is very similar to `Bayesian Ridge Regression`_,
but can lead to sparser coefficients :math:`w` [1]_ [2]_.
:class:`ARDRegression` poses a different prior over :math:`w`, by dropping the
assumption of the Gaussian being spherical.
Automatic Relevance Determination (as implemented in :class:`ARDRegression`)
is a linear model very similar to `Bayesian Ridge Regression`_, but one that
leads to sparser coefficients :math:`w` [1]_ [2]_.

Instead, the distribution over :math:`w` is assumed to be an axis-parallel,
elliptical Gaussian distribution.

This means each coefficient :math:`w_{i}` is drawn from a Gaussian distribution,
centered on zero and with a precision :math:`\lambda_{i}`:
:class:`ARDRegression` poses a different prior over :math:`w`: it replaces the
spherical Gaussian with a centered, axis-aligned elliptical Gaussian
distribution. This means each coefficient :math:`w_{i}` is drawn from a
Gaussian distribution centered on zero, with its own precision
:math:`\lambda_{i}`:

.. math:: p(w|\lambda) = \mathcal{N}(w|0,A^{-1})

with :math:`\text{diag}(A) = \lambda = \{\lambda_{1},...,\lambda_{p}\}`.

In contrast to `Bayesian Ridge Regression`_, each coordinate of :math:`w_{i}`
has its own standard deviation :math:`\lambda_i`. The prior over all
:math:`\lambda_i` is chosen to be the same gamma distribution given by
hyperparameters :math:`\lambda_1` and :math:`\lambda_2`.
with :math:`A` being a positive definite diagonal matrix and
:math:`\text{diag}(A) = \lambda = \{\lambda_{1},...,\lambda_{p}\}`.

.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_ard_001.png
:target: ../auto_examples/linear_model/plot_ard.html
:align: center
:scale: 50%
In contrast to `Bayesian Ridge Regression`_, each coordinate :math:`w_{i}` has
its own precision :math:`\lambda_i` (and hence its own standard deviation
:math:`\frac{1}{\sqrt{\lambda_i}}`). The prior over all :math:`\lambda_i` is
chosen to be the same gamma distribution given by the hyperparameters
:math:`\lambda_1` and :math:`\lambda_2`.
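
As an illustration only (a minimal sketch, assuming scikit-learn's public API),
the per-coefficient precisions of ARD can be contrasted with the single shared
precision of Bayesian Ridge::

    from sklearn.datasets import make_regression
    from sklearn.linear_model import ARDRegression, BayesianRidge

    X, y = make_regression(n_samples=50, n_features=5, n_informative=2,
                           random_state=0)
    ard = ARDRegression().fit(X, y)
    brr = BayesianRidge().fit(X, y)

    # ARD estimates one precision lambda_i per coefficient ...
    print(ard.lambda_.shape)   # (5,)
    # ... whereas Bayesian Ridge estimates a single precision for all of them
    print(float(brr.lambda_))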

ARD is also known in the literature as *Sparse Bayesian Learning* and
*Relevance Vector Machine* [3]_ [4]_.
ARD is also known in the literature as *Sparse Bayesian Learning* and *Relevance
Vector Machine* [3]_ [4]_. For a worked-out comparison between ARD and `Bayesian
Ridge Regression`_, see the example below.

.. topic:: Examples:

287 changes: 189 additions & 98 deletions examples/linear_model/plot_ard.py
@@ -1,119 +1,210 @@
"""
==================================================
Automatic Relevance Determination Regression (ARD)
==================================================
====================================
Comparing Linear Bayesian Regressors
====================================

Fit regression model with Bayesian Ridge Regression.
This example compares two different Bayesian regressors:

See :ref:`bayesian_ridge_regression` for more information on the regressor.
- an :ref:`automatic_relevance_determination`
- a :ref:`bayesian_ridge_regression`

Compared to the OLS (ordinary least squares) estimator, the coefficient
weights are slightly shifted toward zeros, which stabilises them.
In the first part, we use an :ref:`ordinary_least_squares` (OLS) model as a
baseline for comparing the models' coefficients with respect to the true
coefficients. Thereafter, we show that the estimation of such models is done by
iteratively maximizing the marginal log-likelihood of the observations.

The histogram of the estimated weights is very peaked, as a sparsity-inducing
prior is implied on the weights.
In the last section we plot predictions and uncertainties for the ARD and the
Bayesian Ridge regressions using a polynomial feature expansion to fit a
non-linear relationship between `X` and `y`.

The estimation of the model is done by iteratively maximizing the
marginal log-likelihood of the observations.
"""

We also plot predictions and uncertainties for ARD
for one dimensional regression using polynomial feature expansion.
Note the uncertainty starts going up on the right side of the plot.
This is because these test samples are outside of the range of the training
samples.
# Author: Arturo Amor <[email protected]>

"""
# %%
# Models' robustness to recover the ground truth weights
# ======================================================
#
# Generate synthetic dataset
# --------------------------
#
# We generate a dataset where `X` and `y` are linearly linked: 10 of the
# features of `X` will be used to generate `y`. The other features are not
# useful for predicting `y`. In addition, we generate a dataset where
# `n_samples == n_features`. Such a setting is challenging for an OLS model and
# potentially leads to arbitrarily large weights. Having a prior on the weights
# and a penalty alleviates the problem. Finally, Gaussian noise is added.

from sklearn.datasets import make_regression

X, y, true_weights = make_regression(
n_samples=100,
n_features=100,
n_informative=10,
noise=8,
coef=True,
random_state=42,
)

import numpy as np
# %%
# Fit the regressors
# ------------------
#
# We now fit both Bayesian models and the OLS to later compare the models'
# coefficients.

import pandas as pd
from sklearn.linear_model import ARDRegression, LinearRegression, BayesianRidge

olr = LinearRegression().fit(X, y)
brr = BayesianRidge(compute_score=True, n_iter=30).fit(X, y)
ard = ARDRegression(compute_score=True, n_iter=30).fit(X, y)
df = pd.DataFrame(
{
"Weights of true generative process": true_weights,
"ARDRegression": ard.coef_,
"BayesianRidge": brr.coef_,
"LinearRegression": olr.coef_,
}
)

# %%
# Plot the true and estimated coefficients
# ----------------------------------------
#
# Now we compare the coefficients of each model with the weights of
# the true generative model.
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
from matplotlib.colors import SymLogNorm

plt.figure(figsize=(10, 6))
ax = sns.heatmap(
df.T,
norm=SymLogNorm(linthresh=10e-4, vmin=-80, vmax=80),
cbar_kws={"label": "coefficients' values"},
cmap="seismic_r",
)
plt.ylabel("linear model")
plt.xlabel("coefficients")
plt.tight_layout(rect=(0, 0, 1, 0.95))
_ = plt.title("Models' coefficients")

from sklearn.linear_model import ARDRegression, LinearRegression
# %%
# Due to the added noise, none of the models recover the true weights. Indeed,
# all models always have more than 10 non-zero coefficients. Compared to the OLS
# estimator, the coefficients using a Bayesian Ridge regression are slightly
# shifted toward zero, which stabilises them. The ARD regression provides a
# sparser solution: some of the non-informative coefficients are set exactly to
# zero, while shifting others closer to zero. Some non-informative coefficients
# are still present and retain large values.
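
# %%
# As a quick check of the sparsity claim above, one can count the coefficients
# that are exactly zero (a minimal sketch reusing the estimators fitted above):
import numpy as np

for name, coef in [
    ("True generative process", true_weights),
    ("ARDRegression", ard.coef_),
    ("BayesianRidge", brr.coef_),
    ("LinearRegression", olr.coef_),
]:
    print(f"{name}: {np.sum(coef == 0)} coefficients exactly at zero")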

# %%
# Generating simulated data with Gaussian weights

# Parameters of the example
np.random.seed(0)
n_samples, n_features = 100, 100
# Create Gaussian data
X = np.random.randn(n_samples, n_features)
# Create weights with a precision lambda_ of 4.
lambda_ = 4.0
w = np.zeros(n_features)
# Only keep 10 weights of interest
relevant_features = np.random.randint(0, n_features, 10)
for i in relevant_features:
w[i] = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(lambda_))
# Create noise with a precision alpha of 50.
alpha_ = 50.0
noise = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(alpha_), size=n_samples)
# Create the target
y = np.dot(X, w) + noise
# Plot the marginal log-likelihood
# --------------------------------
import numpy as np

ard_scores = -np.array(ard.scores_)
brr_scores = -np.array(brr.scores_)
plt.plot(ard_scores, color="navy", label="ARD")
plt.plot(brr_scores, color="red", label="BayesianRidge")
plt.ylabel("Log-likelihood")
plt.xlabel("Iterations")
plt.xlim(1, 30)
plt.legend()
_ = plt.title("Models log-likelihood")

# %%
# Fit the ARD Regression
clf = ARDRegression(compute_score=True)
clf.fit(X, y)
# Indeed, both models minimize the negative log-likelihood up to an arbitrary
# cutoff defined by the `n_iter` parameter.
#
# Bayesian regressions with polynomial feature expansion
# ======================================================
# Generate synthetic dataset
# --------------------------
# We create a target that is a non-linear function of the input feature.
# Noise drawn from a standard normal distribution (then scaled) is added.

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

rng = np.random.RandomState(0)
n_samples = 110

# sort the data to make plotting easier later
X = np.sort(-10 * rng.rand(n_samples) + 10)
noise = rng.normal(0, 1, n_samples) * 1.35
y = np.sqrt(X) * np.sin(X) + noise
full_data = pd.DataFrame({"input_feature": X, "target": y})
X = X.reshape((-1, 1))

# extrapolation
X_plot = np.linspace(10, 10.4, 10)
y_plot = np.sqrt(X_plot) * np.sin(X_plot)
X_plot = np.concatenate((X, X_plot.reshape((-1, 1))))
y_plot = np.concatenate((y - noise, y_plot))

ols = LinearRegression()
ols.fit(X, y)
# %%
# Fit the regressors
# ------------------
#
# Here we try a degree 10 polynomial to potentially overfit, though the Bayesian
# linear models regularize the size of the polynomial coefficients. As
# `fit_intercept=True` by default for
# :class:`~sklearn.linear_model.ARDRegression` and
# :class:`~sklearn.linear_model.BayesianRidge`,
# :class:`~sklearn.preprocessing.PolynomialFeatures` should not introduce an
# additional bias feature. By setting `return_std=True`, the Bayesian regressors
# return the standard deviation of the predictive distribution at the query
# points.

ard_poly = make_pipeline(
PolynomialFeatures(degree=10, include_bias=False),
StandardScaler(),
ARDRegression(),
).fit(X, y)
brr_poly = make_pipeline(
PolynomialFeatures(degree=10, include_bias=False),
StandardScaler(),
BayesianRidge(),
).fit(X, y)

y_ard, y_ard_std = ard_poly.predict(X_plot, return_std=True)
y_brr, y_brr_std = brr_poly.predict(X_plot, return_std=True)

# %%
# Plot the true weights, the estimated weights, the histogram of the
# weights, and predictions with standard deviations
plt.figure(figsize=(6, 5))
plt.title("Weights of the model")
plt.plot(clf.coef_, color="darkblue", linestyle="-", linewidth=2, label="ARD estimate")
plt.plot(
ols.coef_, color="yellowgreen", linestyle=":", linewidth=2, label="OLS estimate"
# Plotting polynomial regressions with std errors of the scores
# -------------------------------------------------------------

ax = sns.scatterplot(
data=full_data, x="input_feature", y="target", color="black", alpha=0.75
)
plt.plot(w, color="orange", linestyle="-", linewidth=2, label="Ground truth")
plt.xlabel("Features")
plt.ylabel("Values of the weights")
plt.legend(loc=1)

plt.figure(figsize=(6, 5))
plt.title("Histogram of the weights")
plt.hist(clf.coef_, bins=n_features, color="navy", log=True)
plt.scatter(
clf.coef_[relevant_features],
np.full(len(relevant_features), 5.0),
color="gold",
marker="o",
label="Relevant features",
ax.plot(X_plot, y_plot, color="black", label="Ground Truth")
ax.plot(X_plot, y_brr, color="red", label="BayesianRidge with polynomial features")
ax.plot(X_plot, y_ard, color="navy", label="ARD with polynomial features")
ax.fill_between(
X_plot.ravel(),
y_ard - y_ard_std,
y_ard + y_ard_std,
color="navy",
alpha=0.3,
)
plt.ylabel("Features")
plt.xlabel("Values of the weights")
plt.legend(loc=1)

plt.figure(figsize=(6, 5))
plt.title("Marginal log-likelihood")
plt.plot(clf.scores_, color="navy", linewidth=2)
plt.ylabel("Score")
plt.xlabel("Iterations")

ax.fill_between(
X_plot.ravel(),
y_brr - y_brr_std,
y_brr + y_brr_std,
color="red",
alpha=0.3,
)
ax.legend()
_ = ax.set_title("Polynomial fit of a non-linear feature")

# Plotting some predictions for polynomial regression
def f(x, noise_amount):
y = np.sqrt(x) * np.sin(x)
noise = np.random.normal(0, 1, len(x))
return y + noise_amount * noise


degree = 10
X = np.linspace(0, 10, 100)
y = f(X, noise_amount=1)
clf_poly = ARDRegression(threshold_lambda=1e5)
clf_poly.fit(np.vander(X, degree), y)

X_plot = np.linspace(0, 11, 25)
y_plot = f(X_plot, noise_amount=0)
y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
plt.figure(figsize=(6, 5))
plt.errorbar(X_plot, y_mean, y_std, color="navy", label="Polynomial ARD", linewidth=2)
plt.plot(X_plot, y_plot, color="gold", linewidth=2, label="Ground Truth")
plt.ylabel("Output y")
plt.xlabel("Feature X")
plt.legend(loc="lower left")
plt.show()
# %%
# The shaded regions represent one standard deviation of the predicted Gaussian
# distribution at each query point. Notice that the ARD regression captures the
# ground truth best when using the default parameters in both models, but
# further reducing the `lambda_init` hyperparameter of the Bayesian Ridge can
# reduce its bias (see example
# :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`).
# Finally, due to the intrinsic limitations of a polynomial regression, both
# models fail when extrapolating.
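
# %%
# A minimal sketch of that tweak (the `lambda_init` value below is illustrative,
# not tuned):
brr_low_lambda = make_pipeline(
    PolynomialFeatures(degree=10, include_bias=False),
    StandardScaler(),
    BayesianRidge(lambda_init=1e-3),
).fit(X, y)
y_brr_low, y_brr_low_std = brr_low_lambda.predict(X_plot, return_std=True)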