From 78ecbe641c8c505f3cbb4c45e4db3ffd1709a760 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Sun, 29 Sep 2024 23:34:06 +0200
Subject: [PATCH 1/4] DOC remove example OLS with 3D plot

---
 doc/conf.py                          |   1 +
 examples/linear_model/plot_ols.py    | 113 +++++++++++++++------------
 examples/linear_model/plot_ols_3d.py |  83 --------------------
 3 files changed, 66 insertions(+), 131 deletions(-)
 delete mode 100644 examples/linear_model/plot_ols_3d.py

diff --git a/doc/conf.py b/doc/conf.py
index 9ab1966b70e73..903ea36b4dd18 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -475,6 +475,7 @@ def add_js_css_files(app, pagename, templatename, context, doctree):
     "auto_examples/linear_model/plot_lasso_coordinate_descent_path.py": (
         "auto_examples/linear_model/plot_lasso_lasso_lars_elasticnet_path.py"
     ),
+    "auto_examples/linear_model/plot_ols_3d": ("auto_examples/linear_model/plot_ols"),
 }
 html_context["redirects"] = redirects
 for old_link in redirects:
diff --git a/examples/linear_model/plot_ols.py b/examples/linear_model/plot_ols.py
index 8aaa35ed8d899..c19d97ad045c9 100644
--- a/examples/linear_model/plot_ols.py
+++ b/examples/linear_model/plot_ols.py
@@ -1,63 +1,80 @@
 """
-=========================================================
-Linear Regression Example
-=========================================================
-The example below uses only the first feature of the `diabetes` dataset,
-in order to illustrate the data points within the two-dimensional plot.
-The straight line can be seen in the plot, showing how linear regression
-attempts to draw a straight line that will best minimize the
-residual sum of squares between the observed responses in the dataset,
-and the responses predicted by the linear approximation.
-
-The coefficients, residual sum of squares and the coefficient of
-determination are also calculated.
+=====================================
+Simple Ordinary Least Squares Example
+=====================================
+
+This example shows how to use the simplest ordinary least squares (OLS) model
+called :class:`~sklearn.linear_model.LinearRegression` in scikit-learn.
+
+For this purpose, we use a single feature from the diabetes dataset and try to
+predict the diabetes progression using this linear model. We therefore load the
+diabetes dataset and split it into training and test sets.
+
+Then, we fit the model on the training set and evaluate its performance on the test
+set and finally visualize the results on the test set.
 """
 
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-import matplotlib.pyplot as plt
-import numpy as np
-
-from sklearn import datasets, linear_model
+# %%
+# Data Loading and Preparation
+# ----------------------------
+#
+# Load the diabetes dataset. For simplicity, we only keep a single feature in the data.
+# Then, we split the data and target into training and test sets.
+from sklearn.datasets import load_diabetes
+from sklearn.model_selection import train_test_split
+
+X, y = load_diabetes(return_X_y=True)
+X = X[:, [2]]  # Use only one feature
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=20, shuffle=False)
+
+# %%
+# Linear regression model
+# -----------------------
+#
+# We create a linear regression model and fit it on the training data.
+from sklearn.linear_model import LinearRegression
+
+regressor = LinearRegression().fit(X_train, y_train)
+
+# %%
+# Model evaluation
+# ----------------
+#
+# We evaluate the model's performance on the test set using the mean squared error
+# and the coefficient of determination.
 from sklearn.metrics import mean_squared_error, r2_score
 
-# Load the diabetes dataset
-diabetes_X, diabetes_y = datasets.load_diabetes(return_X_y=True)
-
-# Use only one feature
-diabetes_X = diabetes_X[:, np.newaxis, 2]
-
-# Split the data into training/testing sets
-diabetes_X_train = diabetes_X[:-20]
-diabetes_X_test = diabetes_X[-20:]
+y_pred = regressor.predict(X_test)
 
-# Split the targets into training/testing sets
-diabetes_y_train = diabetes_y[:-20]
-diabetes_y_test = diabetes_y[-20:]
+print(f"Mean squared error: {mean_squared_error(y_test, y_pred):.2f}")
+print(f"Coefficient of determination: {r2_score(y_test, y_pred):.2f}")
 
-# Create linear regression object
-regr = linear_model.LinearRegression()
-
-# Train the model using the training sets
-regr.fit(diabetes_X_train, diabetes_y_train)
-
-# Make predictions using the testing set
-diabetes_y_pred = regr.predict(diabetes_X_test)
-
-# The coefficients
-print("Coefficients: \n", regr.coef_)
-# The mean squared error
-print("Mean squared error: %.2f" % mean_squared_error(diabetes_y_test, diabetes_y_pred))
-# The coefficient of determination: 1 is perfect prediction
-print("Coefficient of determination: %.2f" % r2_score(diabetes_y_test, diabetes_y_pred))
+# %%
+# Plotting the results
+# --------------------
+#
+# Finally, we visualize the results on the test set.
 import matplotlib.pyplot as plt
 
-# Plot outputs
-plt.scatter(diabetes_X_test, diabetes_y_test, color="black")
-plt.plot(diabetes_X_test, diabetes_y_pred, color="blue", linewidth=3)
+fig, ax = plt.subplots()
 
-plt.xticks(())
-plt.yticks(())
+ax.scatter(X_test, y_test, label="Data points")
+ax.plot(X_test, y_pred, linewidth=3, color="tab:orange", label="Model predictions")
+ax.set(xlabel="Feature", ylabel="Target", title="Linear Regression")
+ax.legend()
 
 plt.show()
+
+# %%
+# Conclusion
+# ----------
+#
+# This example shows how to use the simplest linear model called
+# :class:`~sklearn.linear_model.LinearRegression` in scikit-learn. For this purpose, we
+# use a single feature from the diabetes dataset and try to predict the diabetes
+# progression using this linear model. We therefore load the diabetes dataset and split
+# it into training and test sets.
+#
diff --git a/examples/linear_model/plot_ols_3d.py b/examples/linear_model/plot_ols_3d.py
deleted file mode 100644
index cd848f659e8d8..0000000000000
--- a/examples/linear_model/plot_ols_3d.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""
-=========================================================
-Sparsity Example: Fitting only features 1 and 2
-=========================================================
-
-Features 1 and 2 of the diabetes-dataset are fitted and
-plotted below. It illustrates that although feature 2
-has a strong coefficient on the full model, it does not
-give us much regarding `y` when compared to just feature 1.
-"""
-
-# Authors: The scikit-learn developers
-# SPDX-License-Identifier: BSD-3-Clause
-
-# %%
-# First we load the diabetes dataset.
-
-import numpy as np
-
-from sklearn import datasets
-
-X, y = datasets.load_diabetes(return_X_y=True)
-indices = (0, 1)
-
-X_train = X[:-20, indices]
-X_test = X[-20:, indices]
-y_train = y[:-20]
-y_test = y[-20:]
-
-# %%
-# Next we fit a linear regression model.
-
-from sklearn import linear_model
-
-ols = linear_model.LinearRegression()
-_ = ols.fit(X_train, y_train)
-
-
-# %%
-# Finally we plot the figure from three different views.
-
-import matplotlib.pyplot as plt
-
-# unused but required import for doing 3d projections with matplotlib < 3.2
-import mpl_toolkits.mplot3d  # noqa: F401
-
-
-def plot_figs(fig_num, elev, azim, X_train, clf):
-    fig = plt.figure(fig_num, figsize=(4, 3))
-    plt.clf()
-    ax = fig.add_subplot(111, projection="3d", elev=elev, azim=azim)
-
-    ax.scatter(X_train[:, 0], X_train[:, 1], y_train, c="k", marker="+")
-    ax.plot_surface(
-        np.array([[-0.1, -0.1], [0.15, 0.15]]),
-        np.array([[-0.1, 0.15], [-0.1, 0.15]]),
-        clf.predict(
-            np.array([[-0.1, -0.1, 0.15, 0.15], [-0.1, 0.15, -0.1, 0.15]]).T
-        ).reshape((2, 2)),
-        alpha=0.5,
-    )
-    ax.set_xlabel("X_1")
-    ax.set_ylabel("X_2")
-    ax.set_zlabel("Y")
-    ax.xaxis.set_ticklabels([])
-    ax.yaxis.set_ticklabels([])
-    ax.zaxis.set_ticklabels([])
-
-
-# Generate the three different figures from different views
-elev = 43.5
-azim = -110
-plot_figs(1, elev, azim, X_train, ols)
-
-elev = -0.5
-azim = 0
-plot_figs(2, elev, azim, X_train, ols)
-
-elev = -0.5
-azim = 90
-plot_figs(3, elev, azim, X_train, ols)
-
-plt.show()

From 4477e50e0cb711db12e435487057dfc59a5f0e43 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 16 Oct 2024 11:00:19 +0200
Subject: [PATCH 2/4] improve wording conclusion

---
 examples/linear_model/plot_ols.py | 45 ++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/examples/linear_model/plot_ols.py b/examples/linear_model/plot_ols.py
index c19d97ad045c9..65968ae90a7b3 100644
--- a/examples/linear_model/plot_ols.py
+++ b/examples/linear_model/plot_ols.py
@@ -1,9 +1,9 @@
 """
-=====================================
-Simple Ordinary Least Squares Example
-=====================================
+==============================
+Ordinary Least Squares Example
+==============================
 
-This example shows how to use the simplest ordinary least squares (OLS) model
+This example shows how to use the ordinary least squares (OLS) model
 called :class:`~sklearn.linear_model.LinearRegression` in scikit-learn.
 
 For this purpose, we use a single feature from the diabetes dataset and try to
@@ -34,7 +34,9 @@
 # Linear regression model
 # -----------------------
 #
-# We create a linear regression model and fit it on the training data.
+# We create a linear regression model and fit it on the training data. Note that by
+# default, an intercept is added to the model. We can control this behavior by setting
+# the `fit_intercept` parameter.
 from sklearn.linear_model import LinearRegression
 
 regressor = LinearRegression().fit(X_train, y_train)
@@ -56,15 +58,22 @@
 # Plotting the results
 # --------------------
 #
-# Finally, we visualize the results on the test set.
+# Finally, we visualize the results on the train and test data.
 import matplotlib.pyplot as plt
 
-fig, ax = plt.subplots()
+fig, ax = plt.subplots(ncols=2, figsize=(10, 5), sharex=True, sharey=True)
 
-ax.scatter(X_test, y_test, label="Data points")
-ax.plot(X_test, y_pred, linewidth=3, color="tab:orange", label="Model predictions")
-ax.set(xlabel="Feature", ylabel="Target", title="Linear Regression")
-ax.legend()
+ax[0].scatter(X_train, y_train, label="Train data points")
+ax[0].plot(X_train, regressor.predict(X_train), linewidth=3, color="tab:orange", label="Model predictions")
+ax[0].set(xlabel="Feature", ylabel="Target", title="Train set")
+ax[0].legend()
+
+ax[1].scatter(X_test, y_test, label="Test data points")
+ax[1].plot(X_test, y_pred, linewidth=3, color="tab:orange", label="Model predictions")
+ax[1].set(xlabel="Feature", ylabel="Target", title="Test set")
+ax[1].legend()
+
+fig.suptitle("Linear Regression")
 plt.show()
 
 # %%
 # Conclusion
 # ----------
 #
-# This example shows how to use the simplest linear model called
-# :class:`~sklearn.linear_model.LinearRegression` in scikit-learn. For this purpose, we
-# use a single feature from the diabetes dataset and try to predict the diabetes
-# progression using this linear model. We therefore load the diabetes dataset and split
-# it into training and test sets.
+# The trained model corresponds to the estimator that minimizes the mean squared error
+# between the predicted and the true target values on the training data. This means that
+# there is no other model that fits the training data better in terms of squared error.
+# We therefore obtain an estimator of the conditional mean of the target given the
+# data.
 #
+# Note that in higher dimensions, minimizing only the squared error might lead to
+# overfitting. Therefore, regularization techniques are commonly used to prevent this
+# issue, such as those implemented in :class:`~sklearn.linear_model.Ridge` or
+# :class:`~sklearn.linear_model.Lasso`.

From 7f6bf9479c9092b34b42f6c7244dd69867758d39 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 16 Oct 2024 11:03:33 +0200
Subject: [PATCH 3/4] lint

---
 examples/linear_model/plot_ols.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/examples/linear_model/plot_ols.py b/examples/linear_model/plot_ols.py
index 65968ae90a7b3..b4cef3aaae918 100644
--- a/examples/linear_model/plot_ols.py
+++ b/examples/linear_model/plot_ols.py
@@ -64,7 +64,13 @@
 fig, ax = plt.subplots(ncols=2, figsize=(10, 5), sharex=True, sharey=True)
 
 ax[0].scatter(X_train, y_train, label="Train data points")
-ax[0].plot(X_train, regressor.predict(X_train), linewidth=3, color="tab:orange", label="Model predictions")
+ax[0].plot(
+    X_train,
+    regressor.predict(X_train),
+    linewidth=3,
+    color="tab:orange",
+    label="Model predictions",
+)
 ax[0].set(xlabel="Feature", ylabel="Target", title="Train set")
 ax[0].legend()

From d82277eabfa25909b6588be9848196a1c8ab42b7 Mon Sep 17 00:00:00 2001
From: Guillaume Lemaitre
Date: Wed, 16 Oct 2024 16:25:30 +0200
Subject: [PATCH 4/4] remove statement

---
 examples/linear_model/plot_ols.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/examples/linear_model/plot_ols.py b/examples/linear_model/plot_ols.py
index b4cef3aaae918..aeb8e986459fc 100644
--- a/examples/linear_model/plot_ols.py
+++ b/examples/linear_model/plot_ols.py
@@ -88,10 +88,8 @@
 # ----------
 #
 # The trained model corresponds to the estimator that minimizes the mean squared error
-# between the predicted and the true target values on the training data. This means that
-# there is no other model that fits the training data better in terms of squared error.
-# We therefore obtain an estimator of the conditional mean of the target given the
-# data.
+# between the predicted and the true target values on the training data. We therefore
+# obtain an estimator of the conditional mean of the target given the data.
 #
 # Note that in higher dimensions, minimizing only the squared error might lead to
 # overfitting. Therefore, regularization techniques are commonly used to prevent this
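
A note on the conclusion reworded in PATCH 4: the claim that the fitted model
minimizes the mean squared error on the training data can be checked numerically,
because OLS has a closed-form solution (the normal equations). The sketch below is
not part of the patches; it only re-creates the `X_train`/`y_train` split from the
example and compares `LinearRegression` against `numpy.linalg.lstsq`:

import numpy as np

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Re-create the data exactly as in the patched example.
X, y = load_diabetes(return_X_y=True)
X = X[:, [2]]  # Use only one feature
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=20, shuffle=False)

regressor = LinearRegression().fit(X_train, y_train)

# Closed-form OLS: solve min_w ||A @ w - y||^2, where A appends a column of
# ones so that the last entry of w plays the role of the intercept.
A = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
w, *_ = np.linalg.lstsq(A, y_train, rcond=None)

assert np.allclose(regressor.coef_, w[:-1])
assert np.allclose(regressor.intercept_, w[-1])

Any other coefficient/intercept pair has, by construction, a larger training mean
squared error, which is the sense in which no other linear model fits the training
data better.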
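
For the regularization remark, a minimal sketch of the
:class:`~sklearn.linear_model.Ridge` alternative mentioned in the conclusion: it
exposes the same fit/predict API, with an `alpha` parameter controlling the penalty
strength (the value below is an arbitrary choice for illustration, not a tuned one).
Continuing from the variables defined in the sketch above:

from sklearn.linear_model import Ridge

# Ridge minimizes ||y - X @ w||^2 + alpha * ||w||^2; the penalty shrinks the
# coefficients toward zero, and alpha=0 recovers ordinary least squares.
ridge = Ridge(alpha=1.0).fit(X_train, y_train)

print(f"OLS coefficient:   {regressor.coef_[0]:.2f}")
print(f"Ridge coefficient: {ridge.coef_[0]:.2f}")  # shrunk toward zero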