diff --git a/doc/conf.py b/doc/conf.py
index 5430a9b78021d..6c79138230ff8 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -253,6 +253,9 @@
         "auto_examples/model_selection/plot_permutation_tests_for_classification"
     ),
     "modules/model_persistence": "model_persistence",
+    "auto_examples/linear_model/plot_bayesian_ridge": (
+        "auto_examples/linear_model/plot_ard"
+    ),
 }
 html_context["redirects"] = redirects
 for old_link in redirects:
diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst
index 24dfa901b1d42..125c78a5043b7 100644
--- a/doc/modules/linear_model.rst
+++ b/doc/modules/linear_model.rst
@@ -746,13 +746,6 @@ There are four more hyperparameters, :math:`\alpha_1`, :math:`\alpha_2`,
 :math:`\alpha` and :math:`\lambda`. These are usually chosen to be
 *non-informative*. By default :math:`\alpha_1 = \alpha_2 = \lambda_1 = \lambda_2
 = 10^{-6}`.
-
-.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_bayesian_ridge_001.png
-   :target: ../auto_examples/linear_model/plot_bayesian_ridge.html
-   :align: center
-   :scale: 50%
-
-
 Bayesian Ridge Regression is used for regression::
 
     >>> from sklearn import linear_model
@@ -778,7 +771,6 @@ is more robust to ill-posed problems.
 
 .. topic:: Examples:
 
-   * :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge.py`
    * :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`
 
 .. topic:: References:
@@ -789,37 +781,35 @@ is more robust to ill-posed problems.
 * Michael E. Tipping, `Sparse Bayesian Learning and the Relevance Vector Machine
   `_, 2001.
 
+.. _automatic_relevance_determination:
+
 Automatic Relevance Determination - ARD
 ---------------------------------------
 
-:class:`ARDRegression` is very similar to `Bayesian Ridge Regression`_,
-but can lead to sparser coefficients :math:`w` [1]_ [2]_.
-:class:`ARDRegression` poses a different prior over :math:`w`, by dropping the
-assumption of the Gaussian being spherical.
+Automatic Relevance Determination (as implemented in :class:`ARDRegression`)
+is a linear model very similar to `Bayesian Ridge Regression`_, but one that
+leads to sparser coefficients :math:`w` [1]_ [2]_.
 
-Instead, the distribution over :math:`w` is assumed to be an axis-parallel,
-elliptical Gaussian distribution.
-
-This means each coefficient :math:`w_{i}` is drawn from a Gaussian distribution,
-centered on zero and with a precision :math:`\lambda_{i}`:
+:class:`ARDRegression` poses a different prior over :math:`w`: it drops the
+assumption of a spherical Gaussian in favor of an axis-parallel, elliptical
+Gaussian distribution. This means each coefficient :math:`w_{i}` is drawn from
+a Gaussian distribution, centered on zero and with its own precision
+:math:`\lambda_{i}`:
 
 .. math:: p(w|\lambda) = \mathcal{N}(w|0,A^{-1})
 
-with :math:`\text{diag}(A) = \lambda = \{\lambda_{1},...,\lambda_{p}\}`.
-
-In contrast to `Bayesian Ridge Regression`_, each coordinate of :math:`w_{i}`
-has its own standard deviation :math:`\lambda_i`. The prior over all
-:math:`\lambda_i` is chosen to be the same gamma distribution given by
-hyperparameters :math:`\lambda_1` and :math:`\lambda_2`.
+with :math:`A` being a positive definite diagonal matrix and
+:math:`\text{diag}(A) = \lambda = \{\lambda_{1},...,\lambda_{p}\}`.
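+
+Since :math:`A` is diagonal, this prior factorizes over the individual
+coefficients, which makes the per-coefficient form explicit:
+
+.. math:: p(w_{i}|\lambda_{i}) = \mathcal{N}(w_{i}|0,\lambda_{i}^{-1})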
 
-.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_ard_001.png
-   :target: ../auto_examples/linear_model/plot_ard.html
-   :align: center
-   :scale: 50%
+In contrast to `Bayesian Ridge Regression`_, each coefficient :math:`w_{i}`
+has its own precision :math:`\lambda_i`, i.e. its own standard deviation
+:math:`\frac{1}{\sqrt{\lambda_i}}`. The prior over all :math:`\lambda_i` is
+chosen to be the same gamma distribution given by the hyperparameters
+:math:`\lambda_1` and :math:`\lambda_2`.
 
-ARD is also known in the literature as *Sparse Bayesian Learning* and
-*Relevance Vector Machine* [3]_ [4]_.
+ARD is also known in the literature as *Sparse Bayesian Learning* and *Relevance
+Vector Machine* [3]_ [4]_. For a worked-out comparison between ARD and `Bayesian
+Ridge Regression`_, see the example below.
 
 .. topic:: Examples:
diff --git a/examples/linear_model/plot_ard.py b/examples/linear_model/plot_ard.py
index 7b5f9143d5853..261fec8aeee3b 100644
--- a/examples/linear_model/plot_ard.py
+++ b/examples/linear_model/plot_ard.py
@@ -1,119 +1,210 @@
 """
-==================================================
-Automatic Relevance Determination Regression (ARD)
-==================================================
+====================================
+Comparing Linear Bayesian Regressors
+====================================
 
-Fit regression model with Bayesian Ridge Regression.
+This example compares two different Bayesian regressors:
 
-See :ref:`bayesian_ridge_regression` for more information on the regressor.
+ - an :ref:`automatic_relevance_determination`
+ - a :ref:`bayesian_ridge_regression`
 
-Compared to the OLS (ordinary least squares) estimator, the coefficient
-weights are slightly shifted toward zeros, which stabilises them.
+In the first part, we use an :ref:`ordinary_least_squares` (OLS) model as a
+baseline for comparing the models' coefficients with respect to the true
+coefficients. Thereafter, we show that the estimation of such models is done by
+iteratively maximizing the marginal log-likelihood of the observations.
 
-The histogram of the estimated weights is very peaked, as a sparsity-inducing
-prior is implied on the weights.
+In the last section we plot predictions and uncertainties for the ARD and the
+Bayesian Ridge regressions using a polynomial feature expansion to fit a
+non-linear relationship between `X` and `y`.
 
-The estimation of the model is done by iteratively maximizing the
-marginal log-likelihood of the observations.
+"""
 
-We also plot predictions and uncertainties for ARD
-for one dimensional regression using polynomial feature expansion.
-Note the uncertainty starts going up on the right side of the plot.
-This is because these test samples are outside of the range of the training
-samples.
+# Author: Arturo Amor
 
-"""
+# %%
+# Models' robustness to recover the ground truth weights
+# =======================================================
+#
+# Generate synthetic dataset
+# --------------------------
+#
+# We generate a dataset where `X` and `y` are linearly linked: 10 of the
+# features of `X` will be used to generate `y`. The other features are not
+# useful for predicting `y`. In addition, we generate a dataset where
+# `n_samples == n_features`. Such a setting is challenging for an OLS model and
+# can lead to arbitrarily large weights (see the short aside below). Having a
+# prior on the weights and a penalty alleviates the problem. Finally, Gaussian
+# noise is added.
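+
+# %%
+# As a short aside (an illustration added here, not needed for the rest of the
+# example), we can check the claim above: with as many features as samples,
+# OLS can fit even a pure noise target perfectly, so the learned weights are
+# driven entirely by the particular noise realization. The names below
+# (`rng_aside`, `X_aside`, `y_aside`) are hypothetical toy data for this check.
+
+import numpy as np
+from sklearn.linear_model import LinearRegression
+
+rng_aside = np.random.RandomState(0)
+X_aside = rng_aside.randn(30, 30)  # square random design matrix
+y_aside = rng_aside.randn(30)  # pure noise target
+ols_aside = LinearRegression().fit(X_aside, y_aside)
+print(f"OLS R^2 on pure noise: {ols_aside.score(X_aside, y_aside):.3f}")
+
+# %%
+# We now generate the dataset described above.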
+
+from sklearn.datasets import make_regression
+
+X, y, true_weights = make_regression(
+    n_samples=100,
+    n_features=100,
+    n_informative=10,
+    noise=8,
+    coef=True,
+    random_state=42,
+)
 
-import numpy as np
+# %%
+# Fit the regressors
+# ------------------
+#
+# We now fit both Bayesian models and the OLS to later compare the models'
+# coefficients.
+
+import pandas as pd
+from sklearn.linear_model import ARDRegression, LinearRegression, BayesianRidge
+
+olr = LinearRegression().fit(X, y)
+brr = BayesianRidge(compute_score=True, n_iter=30).fit(X, y)
+ard = ARDRegression(compute_score=True, n_iter=30).fit(X, y)
+df = pd.DataFrame(
+    {
+        "Weights of true generative process": true_weights,
+        "ARDRegression": ard.coef_,
+        "BayesianRidge": brr.coef_,
+        "LinearRegression": olr.coef_,
+    }
+)
+
+# %%
+# Plot the true and estimated coefficients
+# ----------------------------------------
+#
+# Now we compare the coefficients of each model with the weights of
+# the true generative model.
 import matplotlib.pyplot as plt
-from scipy import stats
+import seaborn as sns
+from matplotlib.colors import SymLogNorm
+
+plt.figure(figsize=(10, 6))
+ax = sns.heatmap(
+    df.T,
+    norm=SymLogNorm(linthresh=10e-4, vmin=-80, vmax=80),
+    cbar_kws={"label": "coefficients' values"},
+    cmap="seismic_r",
+)
+plt.ylabel("linear model")
+plt.xlabel("coefficients")
+plt.tight_layout(rect=(0, 0, 1, 0.95))
+_ = plt.title("Models' coefficients")
 
-from sklearn.linear_model import ARDRegression, LinearRegression
+# %%
+# Due to the added noise, none of the models recover the true weights. Indeed,
+# all models end up with more than 10 non-zero coefficients. Compared to the OLS
+# estimator, the coefficients using a Bayesian Ridge regression are slightly
+# shifted toward zero, which stabilises them. The ARD regression provides a
+# sparser solution: some of the non-informative coefficients are set exactly to
+# zero, while others are shifted closer to zero. Some non-informative
+# coefficients are nevertheless still present and retain large values.
 
 # %%
-# Generating simulated data with Gaussian weights
-
-# Parameters of the example
-np.random.seed(0)
-n_samples, n_features = 100, 100
-# Create Gaussian data
-X = np.random.randn(n_samples, n_features)
-# Create weights with a precision lambda_ of 4.
-lambda_ = 4.0
-w = np.zeros(n_features)
-# Only keep 10 weights of interest
-relevant_features = np.random.randint(0, n_features, 10)
-for i in relevant_features:
-    w[i] = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(lambda_))
-# Create noise with a precision alpha of 50.
-alpha_ = 50.0
-noise = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(alpha_), size=n_samples)
-# Create the target
-y = np.dot(X, w) + noise
+# Plot the marginal log-likelihood
+# --------------------------------
+import numpy as np
+
+ard_scores = -np.array(ard.scores_)
+brr_scores = -np.array(brr.scores_)
+plt.plot(ard_scores, color="navy", label="ARD")
+plt.plot(brr_scores, color="red", label="BayesianRidge")
+plt.ylabel("negative log-likelihood")
+plt.xlabel("Iterations")
+plt.xlim(1, 30)
+plt.legend()
+_ = plt.title("Models' negative log-likelihood")
 
 # %%
-# Fit the ARD Regression
-clf = ARDRegression(compute_score=True)
-clf.fit(X, y)
+# Indeed, both models minimize the negative log-likelihood up to an arbitrary
+# cutoff defined by the `n_iter` parameter.
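+
+# %%
+# As a quick, optional check (an addition to the example), we can also look at
+# the final value of the log marginal likelihood reached by each model, since
+# `scores_` stores one value per iteration of the optimization.
+
+print(f"ARD final log marginal likelihood: {ard.scores_[-1]:.2f}")
+print(f"Bayesian Ridge final log marginal likelihood: {brr.scores_[-1]:.2f}")
+
+# %%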
+#
+# Bayesian regressions with polynomial feature expansion
+# =======================================================
+# Generate synthetic dataset
+# --------------------------
+# We create a target that is a non-linear function of the input feature.
+# Gaussian noise is added (a standard normal sample scaled by a factor of 1.35).
+
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import PolynomialFeatures, StandardScaler
+
+rng = np.random.RandomState(0)
+n_samples = 110
+
+# sort the data to make plotting easier later
+X = np.sort(-10 * rng.rand(n_samples) + 10)
+noise = rng.normal(0, 1, n_samples) * 1.35
+y = np.sqrt(X) * np.sin(X) + noise
+full_data = pd.DataFrame({"input_feature": X, "target": y})
+X = X.reshape((-1, 1))
+
+# extrapolation
+X_plot = np.linspace(10, 10.4, 10)
+y_plot = np.sqrt(X_plot) * np.sin(X_plot)
+X_plot = np.concatenate((X, X_plot.reshape((-1, 1))))
+y_plot = np.concatenate((y - noise, y_plot))
 
-ols = LinearRegression()
-ols.fit(X, y)
+# %%
+# Fit the regressors
+# ------------------
+#
+# Here we try a degree 10 polynomial to potentially overfit, though the Bayesian
+# linear models regularize the size of the polynomial coefficients. Since
+# `fit_intercept=True` by default for
+# :class:`~sklearn.linear_model.ARDRegression` and
+# :class:`~sklearn.linear_model.BayesianRidge`,
+# :class:`~sklearn.preprocessing.PolynomialFeatures` should not introduce an
+# additional bias feature. By setting `return_std=True`, the Bayesian regressors
+# return the standard deviation of the predictive distribution of the query
+# points.
+
+ard_poly = make_pipeline(
+    PolynomialFeatures(degree=10, include_bias=False),
+    StandardScaler(),
+    ARDRegression(),
+).fit(X, y)
+brr_poly = make_pipeline(
+    PolynomialFeatures(degree=10, include_bias=False),
+    StandardScaler(),
+    BayesianRidge(),
+).fit(X, y)
+
+y_ard, y_ard_std = ard_poly.predict(X_plot, return_std=True)
+y_brr, y_brr_std = brr_poly.predict(X_plot, return_std=True)
 
 # %%
-# Plot the true weights, the estimated weights, the histogram of the
-# weights, and predictions with standard deviations
-plt.figure(figsize=(6, 5))
-plt.title("Weights of the model")
-plt.plot(clf.coef_, color="darkblue", linestyle="-", linewidth=2, label="ARD estimate")
-plt.plot(
-    ols.coef_, color="yellowgreen", linestyle=":", linewidth=2, label="OLS estimate"
+# Plotting polynomial regressions with standard deviations of the predictions
+# -----------------------------------------------------------------------------
+
+ax = sns.scatterplot(
+    data=full_data, x="input_feature", y="target", color="black", alpha=0.75
 )
-plt.plot(w, color="orange", linestyle="-", linewidth=2, label="Ground truth")
-plt.xlabel("Features")
-plt.ylabel("Values of the weights")
-plt.legend(loc=1)
-
-plt.figure(figsize=(6, 5))
-plt.title("Histogram of the weights")
-plt.hist(clf.coef_, bins=n_features, color="navy", log=True)
-plt.scatter(
-    clf.coef_[relevant_features],
-    np.full(len(relevant_features), 5.0),
-    color="gold",
-    marker="o",
-    label="Relevant features",
+ax.plot(X_plot, y_plot, color="black", label="Ground Truth")
+ax.plot(X_plot, y_brr, color="red", label="BayesianRidge with polynomial features")
+ax.plot(X_plot, y_ard, color="navy", label="ARD with polynomial features")
+ax.fill_between(
+    X_plot.ravel(),
+    y_ard - y_ard_std,
+    y_ard + y_ard_std,
+    color="navy",
+    alpha=0.3,
 )
-plt.ylabel("Features")
-plt.xlabel("Values of the weights")
-plt.legend(loc=1)
-
-plt.figure(figsize=(6, 5))
-plt.title("Marginal log-likelihood")
-plt.plot(clf.scores_, color="navy", linewidth=2)
-plt.ylabel("Score") -plt.xlabel("Iterations") - +ax.fill_between( + X_plot.ravel(), + y_brr - y_brr_std, + y_brr + y_brr_std, + color="red", + alpha=0.3, +) +ax.legend() +_ = ax.set_title("Polynomial fit of a non-linear feature") -# Plotting some predictions for polynomial regression -def f(x, noise_amount): - y = np.sqrt(x) * np.sin(x) - noise = np.random.normal(0, 1, len(x)) - return y + noise_amount * noise - - -degree = 10 -X = np.linspace(0, 10, 100) -y = f(X, noise_amount=1) -clf_poly = ARDRegression(threshold_lambda=1e5) -clf_poly.fit(np.vander(X, degree), y) - -X_plot = np.linspace(0, 11, 25) -y_plot = f(X_plot, noise_amount=0) -y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True) -plt.figure(figsize=(6, 5)) -plt.errorbar(X_plot, y_mean, y_std, color="navy", label="Polynomial ARD", linewidth=2) -plt.plot(X_plot, y_plot, color="gold", linewidth=2, label="Ground Truth") -plt.ylabel("Output y") -plt.xlabel("Feature X") -plt.legend(loc="lower left") -plt.show() +# %% +# The error bars represent one standard deviation of the predicted gaussian +# distribution of the query points. Notice that the ARD regression captures the +# ground truth the best when using the default parameters in both models, but +# further reducing the `lambda_init` hyperparameter of the Bayesian Ridge can +# reduce its bias (see example +# :ref:`sphx_glr_auto_examples_linear_model_plot_bayesian_ridge_curvefit.py`). +# Finally, due to the intrinsic limitations of a polynomial regression, both +# models fail when extrapolating. diff --git a/examples/linear_model/plot_bayesian_ridge.py b/examples/linear_model/plot_bayesian_ridge.py deleted file mode 100644 index 8b6b31133cc7c..0000000000000 --- a/examples/linear_model/plot_bayesian_ridge.py +++ /dev/null @@ -1,135 +0,0 @@ -""" -========================= -Bayesian Ridge Regression -========================= - -Computes a Bayesian Ridge Regression on a synthetic dataset. - -See :ref:`bayesian_ridge_regression` for more information on the regressor. - -Compared to the OLS (ordinary least squares) estimator, the coefficient -weights are slightly shifted toward zeros, which stabilises them. - -As the prior on the weights is a Gaussian prior, the histogram of the -estimated weights is Gaussian. - -The estimation of the model is done by iteratively maximizing the -marginal log-likelihood of the observations. - -We also plot predictions and uncertainties for Bayesian Ridge Regression -for one dimensional regression using polynomial feature expansion. -Note the uncertainty starts going up on the right side of the plot. -This is because these test samples are outside of the range of the training -samples. - -""" - -# %% -# Generate simulated data with Gaussian weights -# --------------------------------------------- -import numpy as np -from scipy import stats - -np.random.seed(0) -n_samples, n_features = 100, 100 -X = np.random.randn(n_samples, n_features) # Create Gaussian data -# Create weights with a precision lambda_ of 4. -lambda_ = 4.0 -w = np.zeros(n_features) -# Only keep 10 weights of interest -relevant_features = np.random.randint(0, n_features, 10) - -for i in relevant_features: - w[i] = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(lambda_)) -# Create noise with a precision alpha of 50. 
-alpha_ = 50.0 -noise = stats.norm.rvs(loc=0, scale=1.0 / np.sqrt(alpha_), size=n_samples) -# Create the target -y = np.dot(X, w) + noise - -# %% -# Fit the Bayesian Ridge Regression and an OLS for comparison -# ----------------------------------------------------------- -from sklearn.linear_model import BayesianRidge, LinearRegression - -clf = BayesianRidge(compute_score=True) -clf.fit(X, y) - -ols = LinearRegression() -ols.fit(X, y) - -# %% -# Plot true weights and estimated weights -# --------------------------------------- -import matplotlib.pyplot as plt - -lw = 2 -plt.figure(figsize=(6, 5)) -plt.title("Weights of the model") -plt.plot(clf.coef_, color="lightgreen", linewidth=lw, label="Bayesian Ridge estimate") -plt.plot(w, color="gold", linewidth=lw, label="Ground truth") -plt.plot(ols.coef_, color="navy", linestyle="--", label="OLS estimate") -plt.xlabel("Features") -plt.ylabel("Values of the weights") -_ = plt.legend(loc="best", prop=dict(size=12)) - -# %% -# Plot histogram of the weights -# ----------------------------- - -plt.figure(figsize=(6, 5)) -plt.title("Histogram of the weights") -plt.hist(clf.coef_, bins=n_features, color="gold", log=True, edgecolor="black") -plt.scatter( - clf.coef_[relevant_features], - np.full(len(relevant_features), 5.0), - color="navy", - label="Relevant features", -) -plt.ylabel("Features") -plt.xlabel("Values of the weights") -_ = plt.legend(loc="upper left") - -# %% -# Plot marginal log-likelihood -# ---------------------------- - -plt.figure(figsize=(6, 5)) -plt.title("Marginal log-likelihood") -plt.plot(clf.scores_, color="navy", linewidth=lw) -plt.ylabel("Score") -_ = plt.xlabel("Iterations") - -# %% -# Plot some predictions for polynomial regression with standard deviations -# ------------------------------------------------------------------------ - - -def f(x, noise_amount): - y = np.sqrt(x) * np.sin(x) - noise = np.random.normal(0, 1, len(x)) - return y + noise_amount * noise - - -degree = 10 -X = np.linspace(0, 10, 100) -y = f(X, noise_amount=0.1) -clf_poly = BayesianRidge() -clf_poly.fit(np.vander(X, degree), y) - -X_plot = np.linspace(0, 11, 25) -y_plot = f(X_plot, noise_amount=0) -y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True) -plt.figure(figsize=(6, 5)) -plt.errorbar( - X_plot, - y_mean, - y_std, - color="navy", - label="Polynomial Bayesian Ridge Regression", - linewidth=lw, -) -plt.plot(X_plot, y_plot, color="gold", linewidth=lw, label="Ground Truth") -plt.ylabel("Output y") -plt.xlabel("Feature X") -_ = plt.legend(loc="lower left")