From c3839d9afa590cfb683d1c79d45731bb600a5b03 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 27 Oct 2021 18:10:08 +0200 Subject: [PATCH 01/35] FIX compute the AIC and BIC using MSE --- sklearn/linear_model/_least_angle.py | 36 +++++++++++++++------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 351aa20f549c2..1158c296a0d01 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -1961,8 +1961,8 @@ class LassoLarsIC(LassoLars): (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 - AIC is the Akaike information criterion and BIC is the Bayes - Information criterion. Such criteria are useful to select the value + AIC is the Akaike information criterion [2]_ and BIC is the Bayes + Information criterion [3]_. Such criteria are useful to select the value of the regularization parameter by making a trade-off between the goodness of fit and the complexity of the model. A good model should explain well the data while being simple. @@ -2078,14 +2078,19 @@ class LassoLarsIC(LassoLars): Notes ----- - The estimation of the number of degrees of freedom is given by: + The number of degrees of freedom is computed as in [1]_. - "On the degrees of freedom of the lasso" - Hui Zou, Trevor Hastie, and Robert Tibshirani - Ann. Statist. Volume 35, Number 5 (2007), 2173-2192. + References + ---------- + .. [1] "On the degrees of freedom of the lasso" + Hui Zou, Trevor Hastie, and Robert Tibshirani + Ann. Statist. Volume 35, Number 5 (2007), 2173-2192. + + .. [2] `Wikipedia entry on the Akaike information criterion + `_ - https://en.wikipedia.org/wiki/Akaike_information_criterion - https://en.wikipedia.org/wiki/Bayesian_information_criterion + .. [3] `Wikipedia entry on the Bayesian information criterion + `_ Examples -------- @@ -2183,11 +2188,10 @@ def fit(self, X, y, copy_X=None): else: raise ValueError("criterion should be either bic or aic") - R = y[:, np.newaxis] - np.dot(X, coef_path_) # residuals - mean_squared_error = np.mean(R ** 2, axis=0) - sigma2 = np.var(y) + residuals = y[:, np.newaxis] - np.dot(X, coef_path_) + mean_squared_error = np.mean(residuals ** 2, axis=0) - df = np.zeros(coef_path_.shape[1], dtype=int) # Degrees of freedom + degrees_freedom = np.zeros(coef_path_.shape[1], dtype=int) for k, coef in enumerate(coef_path_.T): mask = np.abs(coef) > np.finfo(coef.dtype).eps if not np.any(mask): @@ -2195,13 +2199,11 @@ def fit(self, X, y, copy_X=None): # get the number of degrees of freedom equal to: # Xc = X[:, mask] # Trace(Xc * inv(Xc.T, Xc) * Xc.T) ie the number of non-zero coefs - df[k] = np.sum(mask) + degrees_freedom[k] = np.sum(mask) self.alphas_ = alphas_ - eps64 = np.finfo("float64").eps - self.criterion_ = ( - n_samples * mean_squared_error / (sigma2 + eps64) + K * df - ) # Eqns. 2.15--16 in (Zou et al, 2007) + # Eqns. 
2.15--16 in (Zou et al, 2007) + self.criterion_ = n_samples * np.log(mean_squared_error) + K * degrees_freedom n_best = np.argmin(self.criterion_) self.alpha_ = alphas_[n_best] From b46e6eec8cdaa4686975e8bf39844da4c7dcd703 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 28 Oct 2021 00:31:25 +0200 Subject: [PATCH 02/35] iter --- sklearn/linear_model/_least_angle.py | 14 +++++++----- .../linear_model/tests/test_least_angle.py | 22 +++++++++++++++++++ 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 1158c296a0d01..ac5765698c94b 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -2049,8 +2049,7 @@ class LassoLarsIC(LassoLars): criterion_ : array-like of shape (n_alphas,) The value of the information criteria ('aic', 'bic') across all alphas. The alpha which has the smallest information criterion is - chosen. This value is larger by a factor of ``n_samples`` compared to - Eqns. 2.15 and 2.16 in (Zou et al, 2007). + chosen. n_features_in_ : int Number of features seen during :term:`fit`. @@ -2191,7 +2190,7 @@ def fit(self, X, y, copy_X=None): residuals = y[:, np.newaxis] - np.dot(X, coef_path_) mean_squared_error = np.mean(residuals ** 2, axis=0) - degrees_freedom = np.zeros(coef_path_.shape[1], dtype=int) + degrees_of_freedom = np.zeros(coef_path_.shape[1], dtype=int) for k, coef in enumerate(coef_path_.T): mask = np.abs(coef) > np.finfo(coef.dtype).eps if not np.any(mask): @@ -2199,11 +2198,14 @@ def fit(self, X, y, copy_X=None): # get the number of degrees of freedom equal to: # Xc = X[:, mask] # Trace(Xc * inv(Xc.T, Xc) * Xc.T) ie the number of non-zero coefs - degrees_freedom[k] = np.sum(mask) + degrees_of_freedom[k] = np.sum(mask) self.alphas_ = alphas_ - # Eqns. 2.15--16 in (Zou et al, 2007) - self.criterion_ = n_samples * np.log(mean_squared_error) + K * degrees_freedom + self.criterion_ = ( + n_samples * (np.log(2 * np.pi) + 1) # constant that could be neglected + + n_samples * np.log(mean_squared_error) + + K * degrees_of_freedom + ) n_best = np.argmin(self.criterion_) self.alpha_ = alphas_[n_best] diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 1e4c39cfe254d..643e50354024d 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -5,6 +5,8 @@ from scipy import linalg from sklearn.base import clone from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import ignore_warnings @@ -938,3 +940,23 @@ def test_lars_numeric_consistency(LARS, has_coef_path, args): if has_coef_path: assert_allclose(model_64.coef_path_, model_32.coef_path_, rtol=rtol, atol=atol) assert_allclose(model_64.intercept_, model_32.intercept_, rtol=rtol, atol=atol) + + +@pytest.mark.parametrize("criterion", ["aic", "bic"]) +def test_lassolarsic_alpha_selection(criterion): + """Check that we properly compute the AIC and BIC score. + + In this test, we reproduce the example of the Fig. 2 of Zou et al. + In this example, only 7 features should be selected. 
+ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/14566 + https://github.com/scikit-learn/scikit-learn/issues/17145 + """ + model = make_pipeline( + StandardScaler(), LassoLarsIC(criterion=criterion, normalize=False) + ) + model.fit(X, y) + + best_alpha_selected = np.argmin(model[-1].criterion_) + assert best_alpha_selected == 7 From e864611ac0df3234e2ac4df1e51ab93452b4dc48 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 28 Oct 2021 00:33:23 +0200 Subject: [PATCH 03/35] add changelog --- doc/whats_new/v1.0.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 1cc84b74198ce..139ca63c425c5 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -21,6 +21,12 @@ Changelog and :class:`decomposition.MiniBatchSparsePCA` to be convex and match the referenced article. :pr:`19210` by :user:`Jérémie du Boisberranger `. +:mod:`sklearn.linear_model` +........................... + +- |Fix| Fixed the computation of AIC and BIC in + :class:`linear_model.LassoLarsIC`. + :pr:`21481` by :user:`Guillaume Lemaitre `. .. _changes_1_0_1: From 0b61363e51ba965d956a1e9427757e6c757c12d6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 28 Oct 2021 00:35:13 +0200 Subject: [PATCH 04/35] acknowledge --- doc/whats_new/v1.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 139ca63c425c5..2890dcc225194 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -26,7 +26,8 @@ Changelog - |Fix| Fixed the computation of AIC and BIC in :class:`linear_model.LassoLarsIC`. - :pr:`21481` by :user:`Guillaume Lemaitre `. + :pr:`21481` by :user:`Guillaume Lemaitre ` and + :user:`Andrés Babino `. .. _changes_1_0_1: From 776490e27531523e1aa51036fbc2a071c20f60bd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Oct 2021 18:59:58 +0200 Subject: [PATCH 05/35] iter --- .../plot_lasso_model_selection.py | 383 ++++++++++-------- sklearn/linear_model/_least_angle.py | 79 +++- .../linear_model/tests/test_least_angle.py | 27 +- sklearn/utils/estimator_checks.py | 5 + 4 files changed, 323 insertions(+), 171 deletions(-) diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index b2792c92f15bd..bff167c8688a5 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -1,171 +1,238 @@ -""" -=================================================== -Lasso model selection: Cross-Validation / AIC / BIC -=================================================== - -Use the Akaike information criterion (AIC), the Bayes Information -criterion (BIC) and cross-validation to select an optimal value -of the regularization parameter alpha of the :ref:`lasso` estimator. - -Results obtained with LassoLarsIC are based on AIC/BIC criteria. - -Information-criterion based model selection is very fast, but it -relies on a proper estimation of degrees of freedom, are -derived for large samples (asymptotic results) and assume the model -is correct, i.e. that the data are actually generated by this model. -They also tend to break when the problem is badly conditioned -(more features than samples). - -For cross-validation, we use 20-fold with 2 algorithms to compute the -Lasso path: coordinate descent, as implemented by the LassoCV class, and -Lars (least angle regression) as implemented by the LassoLarsCV class. 
-Both algorithms give roughly the same results. They differ with regards -to their execution speed and sources of numerical errors. - -Lars computes a path solution only for each kink in the path. As a -result, it is very efficient when there are only of few kinks, which is -the case if there are few features or samples. Also, it is able to -compute the full path without setting any meta parameter. On the -opposite, coordinate descent compute the path points on a pre-specified -grid (here we use the default). Thus it is more efficient if the number -of grid points is smaller than the number of kinks in the path. Such a -strategy can be interesting if the number of features is really large -and there are enough samples to select a large amount. In terms of -numerical errors, for heavily correlated variables, Lars will accumulate -more errors, while the coordinate descent algorithm will only sample the -path on a grid. - -Note how the optimal value of alpha varies for each fold. This -illustrates why nested-cross validation is necessary when trying to -evaluate the performance of a method for which a parameter is chosen by -cross-validation: this choice of parameter may not be optimal for unseen -data. - -""" - -# Author: Olivier Grisel, Gael Varoquaux, Alexandre Gramfort -# License: BSD 3 clause - -import time - +# """ +# =================================================== +# Lasso model selection: Cross-Validation / AIC / BIC +# =================================================== + +# Use the Akaike information criterion (AIC), the Bayes Information +# criterion (BIC) and cross-validation to select an optimal value +# of the regularization parameter alpha of the :ref:`lasso` estimator. + +# Results obtained with LassoLarsIC are based on AIC/BIC criteria. + +# Information-criterion based model selection is very fast, but it +# relies on a proper estimation of degrees of freedom, are +# derived for large samples (asymptotic results) and assume the model +# is correct, i.e. that the data are actually generated by this model. +# They also tend to break when the problem is badly conditioned +# (more features than samples). + +# For cross-validation, we use 20-fold with 2 algorithms to compute the +# Lasso path: coordinate descent, as implemented by the LassoCV class, and +# Lars (least angle regression) as implemented by the LassoLarsCV class. +# Both algorithms give roughly the same results. They differ with regards +# to their execution speed and sources of numerical errors. + +# Lars computes a path solution only for each kink in the path. As a +# result, it is very efficient when there are only of few kinks, which is +# the case if there are few features or samples. Also, it is able to +# compute the full path without setting any meta parameter. On the +# opposite, coordinate descent compute the path points on a pre-specified +# grid (here we use the default). Thus it is more efficient if the number +# of grid points is smaller than the number of kinks in the path. Such a +# strategy can be interesting if the number of features is really large +# and there are enough samples to select a large amount. In terms of +# numerical errors, for heavily correlated variables, Lars will accumulate +# more errors, while the coordinate descent algorithm will only sample the +# path on a grid. + +# Note how the optimal value of alpha varies for each fold. 
This +# illustrates why nested-cross validation is necessary when trying to +# evaluate the performance of a method for which a parameter is chosen by +# cross-validation: this choice of parameter may not be optimal for unseen +# data. + +# """ + +# # Author: Olivier Grisel, Gael Varoquaux, Alexandre Gramfort +# # License: BSD 3 clause + +# import time + +# import numpy as np +# import matplotlib.pyplot as plt + +# from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC +# from sklearn import datasets + +# # This is to avoid division by zero while doing np.log10 +# EPSILON = 1e-4 + +# X, y = datasets.load_diabetes(return_X_y=True) + +# rng = np.random.RandomState(42) +# X = np.c_[X, rng.randn(X.shape[0], 14)] # add some bad features + +# # normalize data as done by Lars to allow for comparison +# X /= np.sqrt(np.sum(X ** 2, axis=0)) + +# # ############################################################################# +# # LassoLarsIC: least angle regression with BIC/AIC criterion + +# model_bic = LassoLarsIC(criterion="bic", normalize=False) +# t1 = time.time() +# model_bic.fit(X, y) +# t_bic = time.time() - t1 +# alpha_bic_ = model_bic.alpha_ + +# model_aic = LassoLarsIC(criterion="aic", normalize=False) +# model_aic.fit(X, y) +# alpha_aic_ = model_aic.alpha_ + + +# def plot_ic_criterion(model, name, color): +# criterion_ = model.criterion_ +# plt.semilogx( +# model.alphas_ + EPSILON, +# criterion_, +# "--", +# color=color, +# linewidth=3, +# label="%s criterion" % name, +# ) +# plt.axvline( +# model.alpha_ + EPSILON, +# color=color, +# linewidth=3, +# label="alpha: %s estimate" % name, +# ) +# plt.xlabel(r"$\alpha$") +# plt.ylabel("criterion") + + +# plt.figure() +# plot_ic_criterion(model_aic, "AIC", "b") +# plot_ic_criterion(model_bic, "BIC", "r") +# plt.legend() +# plt.title("Information-criterion for model selection (training time %.3fs)" % t_bic) + +# # ############################################################################# +# # LassoCV: coordinate descent + +# # Compute paths +# print("Computing regularization path using the coordinate descent lasso...") +# t1 = time.time() +# model = LassoCV(cv=20).fit(X, y) +# t_lasso_cv = time.time() - t1 + +# # Display results +# plt.figure() +# ymin, ymax = 2300, 3800 +# plt.semilogx(model.alphas_ + EPSILON, model.mse_path_, ":") +# plt.plot( +# model.alphas_ + EPSILON, +# model.mse_path_.mean(axis=-1), +# "k", +# label="Average across the folds", +# linewidth=2, +# ) +# plt.axvline( +# model.alpha_ + EPSILON, linestyle="--", color="k", label="alpha: CV estimate" +# ) + +# plt.legend() + +# plt.xlabel(r"$\alpha$") +# plt.ylabel("Mean square error") +# plt.title( +# "Mean square error on each fold: coordinate descent (train time: %.2fs)" +# % t_lasso_cv +# ) +# plt.axis("tight") +# plt.ylim(ymin, ymax) + +# # ############################################################################# +# # LassoLarsCV: least angle regression + +# # Compute paths +# print("Computing regularization path using the Lars lasso...") +# t1 = time.time() +# model = LassoLarsCV(cv=20, normalize=False).fit(X, y) +# t_lasso_lars_cv = time.time() - t1 + +# # Display results +# plt.figure() +# plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ":") +# plt.semilogx( +# model.cv_alphas_ + EPSILON, +# model.mse_path_.mean(axis=-1), +# "k", +# label="Average across the folds", +# linewidth=2, +# ) +# plt.axvline(model.alpha_, linestyle="--", color="k", label="alpha CV") +# plt.legend() + +# plt.xlabel(r"$\alpha$") +# plt.ylabel("Mean square error") +# 
plt.title(f"Mean square error on each fold: " +# "Lars (train time: {t_lasso_lars_cv:.2f}s)") +# plt.axis("tight") +# plt.ylim(ymin, ymax) + +# plt.show() + +# %% +from sklearn.datasets import load_diabetes + +X, y = load_diabetes(return_X_y=True) +n_samples = X.shape[0] + +# %% +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LassoLarsIC +from sklearn.pipeline import make_pipeline + +model_aic = make_pipeline( + StandardScaler(), + LassoLarsIC(criterion="aic", normalize=False), +).fit(X, y) +model_bic = make_pipeline( + StandardScaler(), + LassoLarsIC(criterion="bic", normalize=False), +).fit(X, y) + +# %% import numpy as np -import matplotlib.pyplot as plt -from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC -from sklearn import datasets - -# This is to avoid division by zero while doing np.log10 -EPSILON = 1e-4 - -X, y = datasets.load_diabetes(return_X_y=True) - -rng = np.random.RandomState(42) -X = np.c_[X, rng.randn(X.shape[0], 14)] # add some bad features - -# normalize data as done by Lars to allow for comparison -X /= np.sqrt(np.sum(X ** 2, axis=0)) - -# ############################################################################# -# LassoLarsIC: least angle regression with BIC/AIC criterion - -model_bic = LassoLarsIC(criterion="bic", normalize=False) -t1 = time.time() -model_bic.fit(X, y) -t_bic = time.time() - t1 -alpha_bic_ = model_bic.alpha_ - -model_aic = LassoLarsIC(criterion="aic", normalize=False) -model_aic.fit(X, y) -alpha_aic_ = model_aic.alpha_ - - -def plot_ic_criterion(model, name, color): - criterion_ = model.criterion_ - plt.semilogx( - model.alphas_ + EPSILON, - criterion_, - "--", - color=color, - linewidth=3, - label="%s criterion" % name, - ) - plt.axvline( - model.alpha_ + EPSILON, - color=color, - linewidth=3, - label="alpha: %s estimate" % name, - ) - plt.xlabel(r"$\alpha$") - plt.ylabel("criterion") - - -plt.figure() -plot_ic_criterion(model_aic, "AIC", "b") -plot_ic_criterion(model_bic, "BIC", "r") -plt.legend() -plt.title("Information-criterion for model selection (training time %.3fs)" % t_bic) +aic_criterion = ( + model_aic[-1].criterion_ + - (n_samples * np.log(2 * np.pi * model_aic[-1].variance_noise_)) + - n_samples +) +bic_criterion = ( + model_bic[-1].criterion_ + - (n_samples * np.log(2 * np.pi * model_bic[-1].variance_noise_)) + - n_samples +) -# ############################################################################# -# LassoCV: coordinate descent +# %% +aic_alpha_indice = np.argmin(model_aic[-1].criterion_) +bic_alpha_indice = np.argmin(model_bic[-1].criterion_) -# Compute paths -print("Computing regularization path using the coordinate descent lasso...") -t1 = time.time() -model = LassoCV(cv=20).fit(X, y) -t_lasso_cv = time.time() - t1 +# %% +import matplotlib.pyplot as plt -# Display results -plt.figure() -ymin, ymax = 2300, 3800 -plt.semilogx(model.alphas_ + EPSILON, model.mse_path_, ":") plt.plot( - model.alphas_ + EPSILON, - model.mse_path_.mean(axis=-1), - "k", - label="Average across the folds", - linewidth=2, + aic_criterion, + label="AIC criterion", ) -plt.axvline( - model.alpha_ + EPSILON, linestyle="--", color="k", label="alpha: CV estimate" +plt.plot( + bic_criterion, + label="BIC criterion", ) -plt.legend() - -plt.xlabel(r"$\alpha$") -plt.ylabel("Mean square error") -plt.title( - "Mean square error on each fold: coordinate descent (train time: %.2fs)" - % t_lasso_cv +plt.vlines( + aic_alpha_indice, + aic_criterion.min(), + aic_criterion.max(), ) -plt.axis("tight") 
-plt.ylim(ymin, ymax) - -# ############################################################################# -# LassoLarsCV: least angle regression - -# Compute paths -print("Computing regularization path using the Lars lasso...") -t1 = time.time() -model = LassoLarsCV(cv=20, normalize=False).fit(X, y) -t_lasso_lars_cv = time.time() - t1 - -# Display results -plt.figure() -plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ":") -plt.semilogx( - model.cv_alphas_ + EPSILON, - model.mse_path_.mean(axis=-1), - "k", - label="Average across the folds", - linewidth=2, +plt.vlines( + bic_alpha_indice, + bic_criterion.min(), + bic_criterion.max(), ) -plt.axvline(model.alpha_, linestyle="--", color="k", label="alpha CV") -plt.legend() -plt.xlabel(r"$\alpha$") -plt.ylabel("Mean square error") -plt.title("Mean square error on each fold: Lars (train time: %.2fs)" % t_lasso_lars_cv) -plt.axis("tight") -plt.ylim(ymin, ymax) +plt.legend() -plt.show() +# %% +# %% diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index ac5765698c94b..aba6b7e7b9087 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -19,6 +19,7 @@ from ._base import LinearModel from ._base import _deprecate_normalize +from ._ridge import ridge_regression from ..base import RegressorMixin, MultiOutputMixin # mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' @@ -1971,7 +1972,7 @@ class LassoLarsIC(LassoLars): Parameters ---------- - criterion : {'bic' , 'aic'}, default='aic' + criterion : {'aic', 'bic'}, default='aic' The type of criterion to use. fit_intercept : bool, default=True @@ -2025,6 +2026,13 @@ class LassoLarsIC(LassoLars): As a consequence using LassoLarsIC only makes sense for problems where a sparse solution is expected and/or reached. + noise_variance : float, default=None + The estimated noise variance of the data. If `None`, we will compute + an unbiased estimator using an OLS model. However, it is only possible + in the case `n_samples > n_features`. + + .. versionadded:: 1.1 + Attributes ---------- coef_ : array-like of shape (n_features,) @@ -2051,6 +2059,12 @@ class LassoLarsIC(LassoLars): alphas. The alpha which has the smallest information criterion is chosen. + noise_variance_ : float + The estimated noise variance from the data used to compute the + criterion. + + .. versionadded:: 1.1 + n_features_in_ : int Number of features seen during :term:`fit`. 
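As an illustration of the new `noise_variance` parameter and `noise_variance_` attribute documented
above, the following minimal sketch (not part of the diff) shows the intended usage when
`n_samples <= n_features`, assuming the patched `LassoLarsIC`:

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.linear_model import LassoLarsIC

    # Under-determined problem: more features than samples.
    X, y = make_regression(n_samples=10, n_features=100, random_state=0)

    # Fitting without `noise_variance` would raise a ValueError here, since the
    # OLS-based estimate of the noise variance needs n_samples > n_features.
    model = LassoLarsIC(criterion="bic", noise_variance=1.0).fit(X, y)
    print(model.alpha_)
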
@@ -2113,6 +2127,7 @@ def __init__( eps=np.finfo(float).eps, copy_X=True, positive=False, + noise_variance=None, ): self.criterion = criterion self.fit_intercept = fit_intercept @@ -2124,6 +2139,7 @@ def __init__( self.precompute = precompute self.eps = eps self.fit_path = True + self.noise_variance = noise_variance def _more_tags(self): return {"multioutput": False} @@ -2181,15 +2197,16 @@ def fit(self, X, y, copy_X=None): n_samples = X.shape[0] if self.criterion == "aic": - K = 2 # AIC + factor_criterion = 2 elif self.criterion == "bic": - K = log(n_samples) # BIC + factor_criterion = log(n_samples) else: - raise ValueError("criterion should be either bic or aic") + raise ValueError( + f"criterion should be either bic or aic, got {self.criterion!r}" + ) residuals = y[:, np.newaxis] - np.dot(X, coef_path_) - mean_squared_error = np.mean(residuals ** 2, axis=0) - + residuals_sum_squares = np.sum(residuals ** 2, axis=0) degrees_of_freedom = np.zeros(coef_path_.shape[1], dtype=int) for k, coef in enumerate(coef_path_.T): mask = np.abs(coef) > np.finfo(coef.dtype).eps @@ -2201,10 +2218,18 @@ def fit(self, X, y, copy_X=None): degrees_of_freedom[k] = np.sum(mask) self.alphas_ = alphas_ + + if self.noise_variance is None: + self.noise_variance_ = self._estimate_noise_variance( + X, y, positive=self.positive + ) + else: + self.noise_variance_ = self.noise_variance + self.criterion_ = ( - n_samples * (np.log(2 * np.pi) + 1) # constant that could be neglected - + n_samples * np.log(mean_squared_error) - + K * degrees_of_freedom + n_samples * np.log(2 * np.pi * self.noise_variance_) + + residuals_sum_squares / self.noise_variance_ + + factor_criterion * degrees_of_freedom ) n_best = np.argmin(self.criterion_) @@ -2212,3 +2237,39 @@ def fit(self, X, y, copy_X=None): self.coef_ = coef_path_[:, n_best] self._set_intercept(Xmean, ymean, Xstd) return self + + def _estimate_noise_variance(self, X, y, positive): + """Compute a variance estimator of an OLS model. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Data to be fitted by the OLS model. We expect the data to be + centered. + + y : ndarray of shape (n_samples,) + Associated target + + Returns + ------- + noise_variance : float + An estimator of the noise variance of an OLS model. + + Note + ---- + Instead of using a ordinary linear regression, we will use a ridge + model with a very low alpha for numerical stability in case of + collinearity features. + """ + if X.shape[0] <= X.shape[1]: + raise ValueError( + f"You are using {self.__class__.__name__} in the case where the number " + "of samples is smaller than the number of features. In this setting, " + "getting a good estimator for the variance of the noise is not " + "possible. Provide an estimate of the noise variance in the " + "constructor." 
+ ) + ols_coef = ridge_regression( + X, y, alpha=1e-12, positive=positive, check_input=False + ) + return np.sum((y - X @ ols_coef) ** 2) / (X.shape[0] - X.shape[1]) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 643e50354024d..774c5327bcc71 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -900,8 +900,8 @@ def test_copy_X_with_auto_gram(): def test_lars_dtype_match(LARS, has_coef_path, args, dtype): # The test ensures that the fit method preserves input dtype rng = np.random.RandomState(0) - X = rng.rand(6, 6).astype(dtype) - y = rng.rand(6).astype(dtype) + X = rng.rand(20, 6).astype(dtype) + y = rng.rand(20).astype(dtype) model = LARS(**args) model.fit(X, y) @@ -930,8 +930,8 @@ def test_lars_numeric_consistency(LARS, has_coef_path, args): atol = 1e-5 rng = np.random.RandomState(0) - X_64 = rng.rand(6, 6) - y_64 = rng.rand(6) + X_64 = rng.rand(10, 6) + y_64 = rng.rand(10) model_64 = LARS(**args).fit(X_64, y_64) model_32 = LARS(**args).fit(X_64.astype(np.float32), y_64.astype(np.float32)) @@ -960,3 +960,22 @@ def test_lassolarsic_alpha_selection(criterion): best_alpha_selected = np.argmin(model[-1].criterion_) assert best_alpha_selected == 7 + + +def test_lassolarsic_noise_variance(): + """Check the behaviour when `n_samples` < `n_features` and that one needs + to provide the noise variance.""" + rng = np.random.RandomState(0) + X, y = datasets.make_regression(n_samples=10, n_features=100, random_state=rng) + + model = make_pipeline(StandardScaler(), LassoLarsIC(normalize=False)) + + err_msg = ( + "You are using LassoLarsIC in the case where the number of samples is smaller" + " than the number of features" + ) + with pytest.raises(ValueError, match=err_msg): + model.fit(X, y) + + model.set_params(lassolarsic__noise_variance=1.0) + model.fit(X, y).predict(X) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index ccc6ff23ed8fc..b2e5527cbdf9f 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -638,6 +638,11 @@ def _set_checking_parameters(estimator): # This is ugly :-/ estimator.n_components = 1 + if name == "LassoLarsIC": + # Noise variance estimation does not work when `n_samples < n_features`. + # We need to provide the noise variance explicitely. + estimator.set_params(noise_variance=1.0) + if hasattr(estimator, "n_clusters"): estimator.n_clusters = min(estimator.n_clusters, 2) From caea26c464afdbffd94631aa9d820009c8396aff Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Oct 2021 12:13:05 +0200 Subject: [PATCH 06/35] iter --- .../plot_lasso_model_selection.py | 405 +++++++++--------- 1 file changed, 198 insertions(+), 207 deletions(-) diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index bff167c8688a5..cc0cdd0aa02de 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -1,238 +1,229 @@ -# """ -# =================================================== -# Lasso model selection: Cross-Validation / AIC / BIC -# =================================================== - -# Use the Akaike information criterion (AIC), the Bayes Information -# criterion (BIC) and cross-validation to select an optimal value -# of the regularization parameter alpha of the :ref:`lasso` estimator. - -# Results obtained with LassoLarsIC are based on AIC/BIC criteria. 
- -# Information-criterion based model selection is very fast, but it -# relies on a proper estimation of degrees of freedom, are -# derived for large samples (asymptotic results) and assume the model -# is correct, i.e. that the data are actually generated by this model. -# They also tend to break when the problem is badly conditioned -# (more features than samples). - -# For cross-validation, we use 20-fold with 2 algorithms to compute the -# Lasso path: coordinate descent, as implemented by the LassoCV class, and -# Lars (least angle regression) as implemented by the LassoLarsCV class. -# Both algorithms give roughly the same results. They differ with regards -# to their execution speed and sources of numerical errors. - -# Lars computes a path solution only for each kink in the path. As a -# result, it is very efficient when there are only of few kinks, which is -# the case if there are few features or samples. Also, it is able to -# compute the full path without setting any meta parameter. On the -# opposite, coordinate descent compute the path points on a pre-specified -# grid (here we use the default). Thus it is more efficient if the number -# of grid points is smaller than the number of kinks in the path. Such a -# strategy can be interesting if the number of features is really large -# and there are enough samples to select a large amount. In terms of -# numerical errors, for heavily correlated variables, Lars will accumulate -# more errors, while the coordinate descent algorithm will only sample the -# path on a grid. - -# Note how the optimal value of alpha varies for each fold. This -# illustrates why nested-cross validation is necessary when trying to -# evaluate the performance of a method for which a parameter is chosen by -# cross-validation: this choice of parameter may not be optimal for unseen -# data. 
- -# """ - -# # Author: Olivier Grisel, Gael Varoquaux, Alexandre Gramfort -# # License: BSD 3 clause - -# import time - -# import numpy as np -# import matplotlib.pyplot as plt - -# from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC -# from sklearn import datasets - -# # This is to avoid division by zero while doing np.log10 -# EPSILON = 1e-4 - -# X, y = datasets.load_diabetes(return_X_y=True) - -# rng = np.random.RandomState(42) -# X = np.c_[X, rng.randn(X.shape[0], 14)] # add some bad features - -# # normalize data as done by Lars to allow for comparison -# X /= np.sqrt(np.sum(X ** 2, axis=0)) - -# # ############################################################################# -# # LassoLarsIC: least angle regression with BIC/AIC criterion - -# model_bic = LassoLarsIC(criterion="bic", normalize=False) -# t1 = time.time() -# model_bic.fit(X, y) -# t_bic = time.time() - t1 -# alpha_bic_ = model_bic.alpha_ - -# model_aic = LassoLarsIC(criterion="aic", normalize=False) -# model_aic.fit(X, y) -# alpha_aic_ = model_aic.alpha_ - - -# def plot_ic_criterion(model, name, color): -# criterion_ = model.criterion_ -# plt.semilogx( -# model.alphas_ + EPSILON, -# criterion_, -# "--", -# color=color, -# linewidth=3, -# label="%s criterion" % name, -# ) -# plt.axvline( -# model.alpha_ + EPSILON, -# color=color, -# linewidth=3, -# label="alpha: %s estimate" % name, -# ) -# plt.xlabel(r"$\alpha$") -# plt.ylabel("criterion") - - -# plt.figure() -# plot_ic_criterion(model_aic, "AIC", "b") -# plot_ic_criterion(model_bic, "BIC", "r") -# plt.legend() -# plt.title("Information-criterion for model selection (training time %.3fs)" % t_bic) - -# # ############################################################################# -# # LassoCV: coordinate descent - -# # Compute paths -# print("Computing regularization path using the coordinate descent lasso...") -# t1 = time.time() -# model = LassoCV(cv=20).fit(X, y) -# t_lasso_cv = time.time() - t1 - -# # Display results -# plt.figure() -# ymin, ymax = 2300, 3800 -# plt.semilogx(model.alphas_ + EPSILON, model.mse_path_, ":") -# plt.plot( -# model.alphas_ + EPSILON, -# model.mse_path_.mean(axis=-1), -# "k", -# label="Average across the folds", -# linewidth=2, -# ) -# plt.axvline( -# model.alpha_ + EPSILON, linestyle="--", color="k", label="alpha: CV estimate" -# ) - -# plt.legend() - -# plt.xlabel(r"$\alpha$") -# plt.ylabel("Mean square error") -# plt.title( -# "Mean square error on each fold: coordinate descent (train time: %.2fs)" -# % t_lasso_cv -# ) -# plt.axis("tight") -# plt.ylim(ymin, ymax) - -# # ############################################################################# -# # LassoLarsCV: least angle regression - -# # Compute paths -# print("Computing regularization path using the Lars lasso...") -# t1 = time.time() -# model = LassoLarsCV(cv=20, normalize=False).fit(X, y) -# t_lasso_lars_cv = time.time() - t1 - -# # Display results -# plt.figure() -# plt.semilogx(model.cv_alphas_ + EPSILON, model.mse_path_, ":") -# plt.semilogx( -# model.cv_alphas_ + EPSILON, -# model.mse_path_.mean(axis=-1), -# "k", -# label="Average across the folds", -# linewidth=2, -# ) -# plt.axvline(model.alpha_, linestyle="--", color="k", label="alpha CV") -# plt.legend() - -# plt.xlabel(r"$\alpha$") -# plt.ylabel("Mean square error") -# plt.title(f"Mean square error on each fold: " -# "Lars (train time: {t_lasso_lars_cv:.2f}s)") -# plt.axis("tight") -# plt.ylim(ymin, ymax) - -# plt.show() +""" +================================================= +Lasso model selection: 
AIC-BIC / cross-validation +================================================= + +This example focuses on model selection when dealing Lasso models that are +linear models with a L1 penalty for regression problems. + +Indeed, several strategies can be used to select the value of the +regularization parameter: using an information criterion, namely AIC or BIC or +via cross-validation. + +In remainder, we will discuss in details the different strategies. +""" + +# Author: Olivier Grisel +# Gael Varoquaux +# Alexandre Gramfort +# Guillaume Lemaitre +# License: BSD 3 clause + +# %% +import sklearn + +sklearn.set_config(display="diagram") # %% +# Dataset +# ------- +# In this example, we will use the diabetes dataset. from sklearn.datasets import load_diabetes -X, y = load_diabetes(return_X_y=True) -n_samples = X.shape[0] +X, y = load_diabetes(return_X_y=True, as_frame=True) +X.head() # %% +# In addition, we will add some random features to the original data to +# make obvious the feature selection performed by the Lasso model. +import numpy as np +import pandas as pd + +rng = np.random.RandomState(42) +n_random_features = 14 +X_random = pd.DataFrame( + rng.randn(X.shape[0], n_random_features), + columns=[f"random_{i:02d}" for i in range(n_random_features)], +) +X = pd.concat([X, X_random], axis=1) +X.head() + +# %% +# Selecting Lasso via an information criterion +# -------------------------------------------- +# :class:`~sklearn.linear_model.LassoLarsIC` provides an Lasso estimator that +# uses the Akaike information criterion (AIC) or the Bayes information +# criterion (BIC) to select the optimal value of the regularization +# parameter alpha. +# +# Before fitting the model, we will standardize the data with a +# :class:`~sklearn.preprocessing.StandardardScaler`. In addition, we will +# measure the time to fit and tune the hyperparameter alpha in order to +# compare with the cross-validation strategy. +# +# We will first fit a Lasso model with the AIC criterion. +import time from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LassoLarsIC from sklearn.pipeline import make_pipeline -model_aic = make_pipeline( - StandardScaler(), - LassoLarsIC(criterion="aic", normalize=False), -).fit(X, y) -model_bic = make_pipeline( - StandardScaler(), - LassoLarsIC(criterion="bic", normalize=False), +start_time = time.time() +lasso_lars_ic = make_pipeline( + StandardScaler(), LassoLarsIC(criterion="aic", normalize=False) ).fit(X, y) +fit_time = time.time() - start_time # %% -import numpy as np +# We store the AIC metric for each value of alpha used during `fit`. +results = pd.DataFrame( + { + "alphas": lasso_lars_ic[-1].alphas_, + "AIC criterion": lasso_lars_ic[-1].criterion_, + } +).set_index("alphas") +alpha_aic = lasso_lars_ic[-1].alpha_ -aic_criterion = ( - model_aic[-1].criterion_ - - (n_samples * np.log(2 * np.pi * model_aic[-1].variance_noise_)) - - n_samples +# %% +# Now, we perform the same analysis using the BIC criterion. +lasso_lars_ic.set_params(lassolarsic__criterion="bic").fit(X, y) +results["BIC criterion"] = lasso_lars_ic[-1].criterion_ +alpha_bic = lasso_lars_ic[-1].alpha_ + +# %% +# Finally, we can plot the AIC and BIC values for the different alpha values. +# The vertical lines in the plot corresponds to the alpha chosen for each +# criterion. The selected alpha corresponds to the minimum of the AIC or BIC +# criterion. 
+ax = results.plot() +ax.vlines( + alpha_aic, + results["AIC criterion"].min(), + results["AIC criterion"].max(), + label="alpha: AIC estimate", + linestyles="--", + color="tab:blue", +) +ax.vlines( + alpha_bic, + results["BIC criterion"].min(), + results["BIC criterion"].max(), + label="alpha: BIC estimate", + linestyle="--", + color="tab:orange", ) -bic_criterion = ( - model_bic[-1].criterion_ - - (n_samples * np.log(2 * np.pi * model_bic[-1].variance_noise_)) - - n_samples +ax.set_xlabel(r"$\alpha$") +ax.set_ylabel("criterion") +ax.set_xscale("log") +ax.legend() +_ = ax.set_title( + f"Information-criterion for model selection (training time {fit_time:.2f}s)" ) # %% -aic_alpha_indice = np.argmin(model_aic[-1].criterion_) -bic_alpha_indice = np.argmin(model_bic[-1].criterion_) +# Information-criterion based model selection is very fast. It relies on +# computing the criterion on the in-sample set provided at `fit`. Both criteria +# are computed estimate the model error on the full training set and penalize +# this over optimistic error. However, this penalty relies on a proper +# estimation of degrees of freedom and noise variance, are derived for large +# samples (asymptotic results) and assume the model is correct, i.e. that the +# data are actually generated by this model. +# +# These models also tend to break when the problem is badly conditioned (more +# features than samples) and it is then required to provide a estimate of the +# noise variance. +# +# Selecting Lasso via cross-validation +# ------------------------------------ +# Lasso estimator can be implemented with different solvers: coordinate descent +# and least angle regression. They differ with regards to their execution speed +# and sources of numerical errors. +# +# In scikit-learn, two different estimators are available with integrated +# cross-validation: :class:`~sklearn.linear_model.LassoCV` and +# :class:`~sklearn.linear_model.LassoLarsCV` that respectively solve the +# problem with coordinate descent and least angle regression. +# +# In the remainder of this section, we will present both approaches. For both +# algorithms, we will use a 20-fold cross-validation strategy. +# +# Lasso via coordinate descent +# ............................ +# Lets' start to make the hyperparameter tuning using +# :class:`~sklearn.linear_model.LassoCV`. +from sklearn.linear_model import LassoCV + +start_time = time.time() +model = make_pipeline(StandardScaler(), LassoCV(cv=20)).fit(X, y) +fit_time = time.time() - start_time # %% import matplotlib.pyplot as plt +ymin, ymax = 2300, 3800 +plt.semilogx(model[-1].alphas_, model[-1].mse_path_, linestyle=":") plt.plot( - aic_criterion, - label="AIC criterion", -) -plt.plot( - bic_criterion, - label="BIC criterion", + model[-1].alphas_, + model[-1].mse_path_.mean(axis=-1), + color="black", + label="Average across the folds", + linewidth=2, ) +plt.axvline(model[-1].alpha_, linestyle="--", color="black", label="alpha: CV estimate") -plt.vlines( - aic_alpha_indice, - aic_criterion.min(), - aic_criterion.max(), +plt.ylim(ymin, ymax) +plt.xlabel(r"$\alpha$") +plt.ylabel("Mean square error") +plt.legend() +_ = plt.title( + f"Mean square error on each fold: coordinate descent (train time: {fit_time:.2f}s)" ) -plt.vlines( - bic_alpha_indice, - bic_criterion.min(), - bic_criterion.max(), + +# %% +# Lasso via least angle regression +# ................................ +# Lets' start to make the hyperparameter tuning using +# :class:`~sklearn.linear_model.LassoLarsCV`. 
+from sklearn.linear_model import LassoLarsCV + +start_time = time.time() +model = make_pipeline(StandardScaler(), LassoLarsCV(cv=20, normalize=False)).fit(X, y) +fit_time = time.time() - start_time + +# %% +plt.semilogx(model[-1].cv_alphas_, model[-1].mse_path_, ":") +plt.semilogx( + model[-1].cv_alphas_, + model[-1].mse_path_.mean(axis=-1), + color="black", + label="Average across the folds", + linewidth=2, ) +plt.axvline(model[-1].alpha_, linestyle="--", color="black", label="alpha CV") +plt.ylim(ymin, ymax) +plt.xlabel(r"$\alpha$") +plt.ylabel("Mean square error") plt.legend() +_ = plt.title(f"Mean square error on each fold: Lars (train time: {fit_time:.2f}s)") # %% -# %% +# Summary of cross-validation approach +# .................................... +# Both algorithms give roughly the same results. +# +# Lars computes a path solution only for each kink in the path. As a result, it +# is very efficient when there are only of few kinks, which is the case if +# there are few features or samples. Also, it is able to compute the full path +# without setting any meta parameter. On the opposite, coordinate descent +# compute the path points on a pre-specified grid (here we use the default). +# Thus it is more efficient if the number of grid points is smaller than the +# number of kinks in the path. Such a strategy can be interesting if the number +# of features is really large and there are enough samples to select a large +# amount. In terms of numerical errors, for heavily correlated variables, Lars +# will accumulate more errors, while the coordinate descent algorithm will only +# sample the path on a grid. +# +# Note how the optimal value of alpha varies for each fold. This illustrates +# why nested-cross validation is necessary when trying to evaluate the +# performance of a method for which a parameter is chosen by cross-validation: +# this choice of parameter may not be optimal for unseen data. From 979f629966c7ee39c3b591abfffc630ae97038db Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Oct 2021 13:15:41 +0200 Subject: [PATCH 07/35] iter --- doc/modules/linear_model.rst | 63 ++++++++++++++++++- .../plot_lasso_model_selection.py | 2 +- sklearn/linear_model/_least_angle.py | 12 ++-- 3 files changed, 70 insertions(+), 7 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 18e1bf468dc62..b7b3b4e2ade3c 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -298,6 +298,7 @@ features, it is often faster than :class:`LassoCV`. .. centered:: |lasso_cv_1| |lasso_cv_2| +.. _lasso_lars_ic: Information-criteria based model selection ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -318,11 +319,69 @@ They also tend to break when the problem is badly conditioned :align: center :scale: 50% +Mathematical details +"""""""""""""""""""" + +The definition of AIC (and thus BIC) might differ in the literature. In this +section, we give more information regarding the criterion computed in +scikit-learn. The AIC criterion is defined as: + +.. math:: + -2 \ln(\hat{L}) + 2 d \, + +where :math:`\hat{L}` is the maximum likelihood of the model and +:math:`d` is the number of parameters (as well referred as degrees of +freedom in the previous section). + +For a linear Gaussian model, the maximum log-likelihood is defined as: + +.. 
math:: + - \frac{n}{2} \ln(2 \pi) - \frac{n}{2} \ln(\hat{\sigma^2}) - \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{2\hat{\sigma^2}}\, + +where :math:`\hat{\sigma^2}` is an estimator of the noise variance, +:math:`y_i` and :math:`\hat{y_i}` are respectively the true and predicted +targets, and :math:`n` is the number of samples. + +Plugging the maximum log-likelihood in the AIC formula yields: + +.. math:: + n \ln(2 \pi \hat{\sigma^2}) + \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{\hat{\sigma^2}} + 2d \. + +The left-hand term of the above expression is sometimes discarded since it is a +constant and does not change the rank of the models evaluated. In addition, +it is sometimes stated that the AIC is equivalent to the :math:`C_p` statistic +[12]_. However, this is important to note that it is up to some constant +and factor term. + +At last, we mentioned above that :math:`\hat{\sigma^2}` is an estimator of the +noise variance. In :class:`LassoLarsIC` when the parameter `noise_variance` is +not provided (default), the noise variance is estimated via the unbiased +estimator [13]_ defined as: + +.. math:: + \hat{\sigma^2} = \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{n - p} \, + +where :math:`p` is the number of parameters in the model and :math:`\hat{y_i}` +is the predicted target using an ordinary least squares regression. In +scikit-learn, we used a ridge model with a very small regularization in case +of ill-conditioned design matrix. .. topic:: Examples: * :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_model_selection.py` +.. topic:: References + + .. [12] `Zou, Hui, Trevor Hastie, and Robert Tibshirani. + "On the degrees of freedom of the lasso." + The Annals of Statistics 35.5 (2007): 2173-2192. + `_ + + .. [13] `Cherkassky, Vladimir, and Yunqian Ma. + "Comparison of model selection for regression." + Neural computation 15.7 (2003): 1691-1714. + `_ + Comparison with the regularization parameter of SVM ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -934,8 +993,8 @@ to warm-starting (see :term:`Glossary `). .. [6] Mark Schmidt, Nicolas Le Roux, and Francis Bach: `Minimizing Finite Sums with the Stochastic Average Gradient. `_ - .. [7] Aaron Defazio, Francis Bach, Simon Lacoste-Julien: - :arxiv:`SAGA: A Fast Incremental Gradient Method With Support for + .. [7] Aaron Defazio, Francis Bach, Simon Lacoste-Julien: + :arxiv:`SAGA: A Fast Incremental Gradient Method With Support for Non-Strongly Convex Composite Objectives. <1407.0202>` .. [8] https://en.wikipedia.org/wiki/Broyden%E2%80%93Fletcher%E2%80%93Goldfarb%E2%80%93Shanno_algorithm diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index cc0cdd0aa02de..e36590ff024fc 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -57,7 +57,7 @@ # parameter alpha. # # Before fitting the model, we will standardize the data with a -# :class:`~sklearn.preprocessing.StandardardScaler`. In addition, we will +# :class:`~sklearn.preprocessing.StandardScaler`. In addition, we will # measure the time to fit and tune the hyperparameter alpha in order to # compare with the cross-validation strategy. # diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index aba6b7e7b9087..05f3df8108d0e 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -1968,7 +1968,7 @@ class LassoLarsIC(LassoLars): goodness of fit and the complexity of the model. 
A good model should explain well the data while being simple. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- @@ -2093,11 +2093,15 @@ class LassoLarsIC(LassoLars): ----- The number of degrees of freedom is computed as in [1]_. + To have more details regarding the mathematical formulation of the + AIC and BIC criteria, please refer to :ref:`User Guide `. + References ---------- - .. [1] "On the degrees of freedom of the lasso" - Hui Zou, Trevor Hastie, and Robert Tibshirani - Ann. Statist. Volume 35, Number 5 (2007), 2173-2192. + .. [1] `Zou, Hui, Trevor Hastie, and Robert Tibshirani. + "On the degrees of freedom of the lasso." + The Annals of Statistics 35.5 (2007): 2173-2192. + `_ .. [2] `Wikipedia entry on the Akaike information criterion `_ From 61a6b6cf67c785e48deecedcaee959095ae5dbca Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Oct 2021 13:54:07 +0200 Subject: [PATCH 08/35] DOC add a new example --- doc/modules/linear_model.rst | 5 +- examples/linear_model/plot_lasso_lars_ic.py | 114 ++++++++++++++++++++ 2 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 examples/linear_model/plot_lasso_lars_ic.py diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index b7b3b4e2ade3c..9c9d32b2bfe51 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -314,8 +314,8 @@ is correct, i.e. that the data are generated by this model. They also tend to break when the problem is badly conditioned (more features than samples). -.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_model_selection_001.png - :target: ../auto_examples/linear_model/plot_lasso_model_selection.html +.. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_lars_ic_001.png + :target: ../auto_examples/linear_model/plot_lasso_lars_ic.html :align: center :scale: 50% @@ -369,6 +369,7 @@ of ill-conditioned design matrix. .. topic:: Examples: * :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_model_selection.py` + * :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_lars_ic.py` .. topic:: References diff --git a/examples/linear_model/plot_lasso_lars_ic.py b/examples/linear_model/plot_lasso_lars_ic.py new file mode 100644 index 0000000000000..66dbcfa27e42a --- /dev/null +++ b/examples/linear_model/plot_lasso_lars_ic.py @@ -0,0 +1,114 @@ +""" +============================================== +Lasso model selection via information criteria +============================================== + +This example reproduces the example of Fig. 2 of [1]_. A +:class:`~sklearn.linear_model.LassoLarsIC` estimator is fit on a +diabetes dataset and uses the the AIC and the BIC criteria to select +the best model. + +.. topic:: References + + .. [ZHT2007] `Zou, Hui, Trevor Hastie, and Robert Tibshirani. + "On the degrees of freedom of the lasso." + The Annals of Statistics 35.5 (2007): 2173-2192. + `_ +""" + +# Author: Alexandre Gramfort +# Guillaume Lemaitre +# License: BSD 3 clause + +# %% +import sklearn + +sklearn.set_config(display="diagram") + +# %% +# We will use the diabetes dataset. +from sklearn.datasets import load_diabetes + +X, y = load_diabetes(return_X_y=True, as_frame=True) +n_samples = X.shape[0] +X.head() + +# %% +# Scikit-learn provides an estimator called +# :class:`~sklearn.linear_model.LinearLarsIC` that we will use an information +# criterion, namely the AIC or BIC, to select the best model. Before to fit +# this model, we will scale the dataset. 
+# +# In the following, we are going to fit two models to compare the values +# reported by the AIC and the BIC. +from sklearn.preprocessing import StandardScaler +from sklearn.linear_model import LassoLarsIC +from sklearn.pipeline import make_pipeline + +lasso_lars_ic = make_pipeline( + StandardScaler(), LassoLarsIC(criterion="aic", normalize=False) +).fit(X, y) + + +# %% +# To be in line with the defintion in [ZHT2007]_, we need to rescale the +# AIC and the BIC. Indeed, Zou et al. are neglecting a some constant terms +# compared to the true definition of AIC derivated from the maximum +# log-likelihood of a linear model. You can refer to +# :ref:`mathematical detail section for the User Guide `. +def zou_et_al_criterion_rescaling(criterion, n_samples, noise_variance): + """Rescale the information criterion to follow Zou et al. definition.""" + return criterion - n_samples * np.log(2 * np.pi * noise_variance) - n_samples + + +# %% +import numpy as np + +aic_criterion = zou_et_al_criterion_rescaling( + lasso_lars_ic[-1].criterion_, + n_samples, + lasso_lars_ic[-1].noise_variance_, +) + +index_alpha_path_aic = np.flatnonzero( + lasso_lars_ic[-1].alphas_ == lasso_lars_ic[-1].alpha_ +)[0] + +# %% +lasso_lars_ic.set_params(lassolarsic__criterion="bic").fit(X, y) + +bic_criterion = zou_et_al_criterion_rescaling( + lasso_lars_ic[-1].criterion_, + n_samples, + lasso_lars_ic[-1].noise_variance_, +) + +index_alpha_path_bic = np.flatnonzero( + lasso_lars_ic[-1].alphas_ == lasso_lars_ic[-1].alpha_ +)[0] + +# %% +# Now that we collected the AIC and BIC, we can as well check that the minimum +# of both criteria happens at the same alpha. Then, we can simplify the +# following plot. +index_alpha_path_aic == index_alpha_path_bic + +# %% +# Now, we can plot the AIC and BIC criterion and the subsequent selected +# regularization parameter. +import matplotlib.pyplot as plt + +plt.plot(aic_criterion, color="tab:blue", marker="o", label="AIC criterion") +plt.plot(bic_criterion, color="tab:orange", marker="o", label="BIC criterion") +plt.vlines( + index_alpha_path_bic, + aic_criterion.min(), + aic_criterion.max(), + color="black", + linestyle="--", + label="Selected alpha", +) +plt.legend() +plt.ylabel("Information criterion") +plt.xlabel("Lasso model sequence") +_ = plt.title("Lasso model selection via AIC and BIC") From 668461a5bcac8bf72c7b374f37c788e2a83b767b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Oct 2021 13:59:19 +0200 Subject: [PATCH 09/35] iter --- doc/whats_new/v1.1.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 372f47e0c7c4b..acfaee38643a3 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -94,6 +94,21 @@ Changelog multilabel classification. :pr:`19689` by :user:`Guillaume Lemaitre `. +:mod:`sklearn.linear_model` +........................... + +- |API| Add a parameter `noise_variance` to :class:`linear_model.LassoLarsIC` + in order to provide an estimate of the noise variance. This is particularly + relevant when `n_features > n_samples` and the the estimator of the noise + variance cannot be computed. + :pr:`21481` by :user:`Guillaume Lemaitre ` + +- |Fix| Fixed the computation of AIC and BIC in + :class:`linear_model.LassoLarsIC`. An error is now raised when + `n_features > n_samples` and that not noise variance is provided. + :pr:`21481` by :user:`Guillaume Lemaitre ` and + :user:`Andrés Babino `. + :mod:`sklearn.metrics` ...................... 
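As a hedged check of the changelog entry above (and of the criterion rewritten in this patch
series), the sketch below recomputes the AIC of the selected model by hand,
AIC = n * log(2 * pi * sigma^2) + RSS / sigma^2 + 2 * df, and compares it with the reported
`criterion_`; it assumes the patched `LassoLarsIC` exposing the `noise_variance_` attribute:

    import numpy as np
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import LassoLarsIC

    X, y = load_diabetes(return_X_y=True)
    n_samples = X.shape[0]

    model = LassoLarsIC(criterion="aic").fit(X, y)

    # Residual sum of squares and degrees of freedom of the selected model.
    rss = np.sum((y - model.predict(X)) ** 2)
    df = np.sum(np.abs(model.coef_) > np.finfo(np.float64).eps)

    aic_by_hand = (
        n_samples * np.log(2 * np.pi * model.noise_variance_)
        + rss / model.noise_variance_
        + 2 * df
    )
    # Both values are expected to agree up to numerical precision.
    print(aic_by_hand, model.criterion_.min())
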
From 880edbf03b121b9fabbdb8bf0cbaaa7af04a91f3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Oct 2021 14:00:15 +0200 Subject: [PATCH 10/35] iter --- doc/whats_new/v1.0.rst | 8 -------- 1 file changed, 8 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 2890dcc225194..dc77c8bc70a82 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -21,14 +21,6 @@ Changelog and :class:`decomposition.MiniBatchSparsePCA` to be convex and match the referenced article. :pr:`19210` by :user:`Jérémie du Boisberranger `. -:mod:`sklearn.linear_model` -........................... - -- |Fix| Fixed the computation of AIC and BIC in - :class:`linear_model.LassoLarsIC`. - :pr:`21481` by :user:`Guillaume Lemaitre ` and - :user:`Andrés Babino `. - .. _changes_1_0_1: Version 1.0.1 From e3ef92bca9a92d6778947e4dc4b32eaf29b9625a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Oct 2021 14:29:23 +0200 Subject: [PATCH 11/35] iter --- doc/modules/linear_model.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 9c9d32b2bfe51..ccb28919a69f1 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -327,7 +327,7 @@ section, we give more information regarding the criterion computed in scikit-learn. The AIC criterion is defined as: .. math:: - -2 \ln(\hat{L}) + 2 d \, + AIC = -2 \ln(\hat{L}) + 2 d where :math:`\hat{L}` is the maximum likelihood of the model and :math:`d` is the number of parameters (as well referred as degrees of @@ -336,7 +336,7 @@ freedom in the previous section). For a linear Gaussian model, the maximum log-likelihood is defined as: .. math:: - - \frac{n}{2} \ln(2 \pi) - \frac{n}{2} \ln(\hat{\sigma^2}) - \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{2\hat{\sigma^2}}\, + \ln(\hat{L}) = - \frac{n}{2} \ln(2 \pi) - \frac{n}{2} \ln(\hat{\sigma^2}) - \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{2\hat{\sigma^2}} where :math:`\hat{\sigma^2}` is an estimator of the noise variance, :math:`y_i` and :math:`\hat{y_i}` are respectively the true and predicted @@ -345,7 +345,7 @@ targets, and :math:`n` is the number of samples. Plugging the maximum log-likelihood in the AIC formula yields: .. math:: - n \ln(2 \pi \hat{\sigma^2}) + \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{\hat{\sigma^2}} + 2d \. + n \ln(2 \pi \hat{\sigma^2}) + \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{\hat{\sigma^2}} + 2d The left-hand term of the above expression is sometimes discarded since it is a constant and does not change the rank of the models evaluated. In addition, @@ -359,7 +359,7 @@ not provided (default), the noise variance is estimated via the unbiased estimator [13]_ defined as: .. math:: - \hat{\sigma^2} = \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{n - p} \, + \hat{\sigma^2} = \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{n - p} where :math:`p` is the number of parameters in the model and :math:`\hat{y_i}` is the predicted target using an ordinary least squares regression. In From d81bca98333cbad18e3a53f23542dc1f391d99a6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Oct 2021 15:47:35 +0200 Subject: [PATCH 12/35] iter --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index ccb28919a69f1..c2d67f7328e34 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -345,7 +345,7 @@ targets, and :math:`n` is the number of samples. 
Plugging the maximum log-likelihood in the AIC formula yields: .. math:: - n \ln(2 \pi \hat{\sigma^2}) + \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{\hat{\sigma^2}} + 2d + AIC = n \ln(2 \pi \hat{\sigma^2}) + \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{\hat{\sigma^2}} + 2d The left-hand term of the above expression is sometimes discarded since it is a constant and does not change the rank of the models evaluated. In addition, From 97c3259de9fb9fe5ae9088eb58e7ec1800f889ed Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Oct 2021 22:05:03 +0200 Subject: [PATCH 13/35] Apply suggestions from code review Co-authored-by: Alexandre Gramfort --- doc/modules/linear_model.rst | 23 ++++++++++--------- doc/whats_new/v1.1.rst | 4 ++-- examples/linear_model/plot_lasso_lars_ic.py | 10 ++++---- .../plot_lasso_model_selection.py | 22 +++++++++--------- 4 files changed, 30 insertions(+), 29 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index c2d67f7328e34..167309d56724e 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -327,33 +327,33 @@ section, we give more information regarding the criterion computed in scikit-learn. The AIC criterion is defined as: .. math:: - AIC = -2 \ln(\hat{L}) + 2 d + AIC = -2 \log(\hat{L}) + 2 d where :math:`\hat{L}` is the maximum likelihood of the model and -:math:`d` is the number of parameters (as well referred as degrees of +:math:`d` is the number of parameters (as well referred to as degrees of freedom in the previous section). For a linear Gaussian model, the maximum log-likelihood is defined as: .. math:: - \ln(\hat{L}) = - \frac{n}{2} \ln(2 \pi) - \frac{n}{2} \ln(\hat{\sigma^2}) - \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{2\hat{\sigma^2}} + \log(\hat{L}) = - \frac{n}{2} \log(2 \pi) - \frac{n}{2} \ln(\hat{\sigma}^2) - \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{2\hat{\sigma}^2} -where :math:`\hat{\sigma^2}` is an estimator of the noise variance, +where :math:`\hat{\sigma}^2` is an estimate of the noise variance, :math:`y_i` and :math:`\hat{y_i}` are respectively the true and predicted targets, and :math:`n` is the number of samples. Plugging the maximum log-likelihood in the AIC formula yields: .. math:: - AIC = n \ln(2 \pi \hat{\sigma^2}) + \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{\hat{\sigma^2}} + 2d + AIC = n \log(2 \pi \hat{\sigma}^2) + \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{\hat{\sigma}^2} + 2d -The left-hand term of the above expression is sometimes discarded since it is a -constant and does not change the rank of the models evaluated. In addition, +The first term of the above expression is sometimes discarded since it is a +constant when :math:`\hat{\sigma}^2` is provided. In addition, it is sometimes stated that the AIC is equivalent to the :math:`C_p` statistic [12]_. However, this is important to note that it is up to some constant and factor term. -At last, we mentioned above that :math:`\hat{\sigma^2}` is an estimator of the +At last, we mentioned above that :math:`\hat{\sigma}^2` is an estimate of the noise variance. In :class:`LassoLarsIC` when the parameter `noise_variance` is not provided (default), the noise variance is estimated via the unbiased estimator [13]_ defined as: @@ -361,10 +361,11 @@ estimator [13]_ defined as: .. 
math:: \hat{\sigma^2} = \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{n - p} -where :math:`p` is the number of parameters in the model and :math:`\hat{y_i}` +where :math:`p` is the number of features and :math:`\hat{y_i}` is the predicted target using an ordinary least squares regression. In -scikit-learn, we used a ridge model with a very small regularization in case -of ill-conditioned design matrix. +scikit-learn, we use a ridge model with a very small regularization in case +of ill-conditioned design matrix. Note, that this formula is valid only when +`n_samples > n_features`. .. topic:: Examples: diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 1980a304ad765..f3b0a64a57ef7 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -106,13 +106,13 @@ Changelog - |API| Add a parameter `noise_variance` to :class:`linear_model.LassoLarsIC` in order to provide an estimate of the noise variance. This is particularly - relevant when `n_features > n_samples` and the the estimator of the noise + relevant when `n_features > n_samples` and the estimator of the noise variance cannot be computed. :pr:`21481` by :user:`Guillaume Lemaitre ` - |Fix| Fixed the computation of AIC and BIC in :class:`linear_model.LassoLarsIC`. An error is now raised when - `n_features > n_samples` and that not noise variance is provided. + `n_features > n_samples` and the noise variance is not provided. :pr:`21481` by :user:`Guillaume Lemaitre ` and :user:`Andrés Babino `. diff --git a/examples/linear_model/plot_lasso_lars_ic.py b/examples/linear_model/plot_lasso_lars_ic.py index 66dbcfa27e42a..c9353836d5e75 100644 --- a/examples/linear_model/plot_lasso_lars_ic.py +++ b/examples/linear_model/plot_lasso_lars_ic.py @@ -3,9 +3,9 @@ Lasso model selection via information criteria ============================================== -This example reproduces the example of Fig. 2 of [1]_. A +This example reproduces the example of Fig. 2 of [ZHT2007]_. A :class:`~sklearn.linear_model.LassoLarsIC` estimator is fit on a -diabetes dataset and uses the the AIC and the BIC criteria to select +diabetes dataset and the AIC and the BIC criteria are used to select the best model. .. topic:: References @@ -35,8 +35,8 @@ # %% # Scikit-learn provides an estimator called -# :class:`~sklearn.linear_model.LinearLarsIC` that we will use an information -# criterion, namely the AIC or BIC, to select the best model. Before to fit +# :class:`~sklearn.linear_model.LinearLarsIC` that uses an information +# criterion, namely the AIC or BIC, to select the best model. Before fitting # this model, we will scale the dataset. # # In the following, we are going to fit two models to compare the values @@ -52,7 +52,7 @@ # %% # To be in line with the defintion in [ZHT2007]_, we need to rescale the -# AIC and the BIC. Indeed, Zou et al. are neglecting a some constant terms +# AIC and the BIC. Indeed, Zou et al. are ignoring some constant terms # compared to the true definition of AIC derivated from the maximum # log-likelihood of a linear model. You can refer to # :ref:`mathematical detail section for the User Guide `. 
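For reference (an editorial aside, not part of the patch), combining the AIC expression from the user guide with the `zou_et_al_criterion_rescaling` helper defined earlier in this example makes explicit what the rescaling removes:

.. math::
    AIC - n \log(2 \pi \hat{\sigma}^2) - n
    = \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\hat{\sigma}^2} + 2 d - n

that is, only the residual term and the degrees-of-freedom penalty remain, shifted by the constant :math:`n`, which does not change where the minimum is attained.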
diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index e36590ff024fc..74e4de9b06107 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -3,14 +3,14 @@ Lasso model selection: AIC-BIC / cross-validation ================================================= -This example focuses on model selection when dealing Lasso models that are +This example focuses on model selection for Lasso models that are linear models with a L1 penalty for regression problems. Indeed, several strategies can be used to select the value of the regularization parameter: using an information criterion, namely AIC or BIC or via cross-validation. -In remainder, we will discuss in details the different strategies. +In what follows, we will discuss in details the different strategies. """ # Author: Olivier Grisel @@ -93,7 +93,7 @@ # Finally, we can plot the AIC and BIC values for the different alpha values. # The vertical lines in the plot corresponds to the alpha chosen for each # criterion. The selected alpha corresponds to the minimum of the AIC or BIC -# criterion. +# criteria. ax = results.plot() ax.vlines( alpha_aic, @@ -120,16 +120,16 @@ ) # %% -# Information-criterion based model selection is very fast. It relies on +# Model selection with an information-criterion is very fast. It relies on # computing the criterion on the in-sample set provided at `fit`. Both criteria # are computed estimate the model error on the full training set and penalize -# this over optimistic error. However, this penalty relies on a proper -# estimation of degrees of freedom and noise variance, are derived for large +# this over optimistic error. However, this penalty relies on a the proper +# estimation of the degrees of freedom and the noise variance. There also are derived for large # samples (asymptotic results) and assume the model is correct, i.e. that the # data are actually generated by this model. # # These models also tend to break when the problem is badly conditioned (more -# features than samples) and it is then required to provide a estimate of the +# features than samples). It is then required to provide a estimate of the # noise variance. # # Selecting Lasso via cross-validation @@ -148,7 +148,7 @@ # # Lasso via coordinate descent # ............................ -# Lets' start to make the hyperparameter tuning using +# Let's start by making the hyperparameter tuning using # :class:`~sklearn.linear_model.LassoCV`. from sklearn.linear_model import LassoCV @@ -181,7 +181,7 @@ # %% # Lasso via least angle regression # ................................ -# Lets' start to make the hyperparameter tuning using +# Let's start by making the hyperparameter tuning using # :class:`~sklearn.linear_model.LassoLarsCV`. from sklearn.linear_model import LassoLarsCV @@ -214,8 +214,8 @@ # Lars computes a path solution only for each kink in the path. As a result, it # is very efficient when there are only of few kinks, which is the case if # there are few features or samples. Also, it is able to compute the full path -# without setting any meta parameter. On the opposite, coordinate descent -# compute the path points on a pre-specified grid (here we use the default). +# without setting any hyperparameter. On the opposite, coordinate descent +# computes the path points on a pre-specified grid (here we use the default). 
# Thus it is more efficient if the number of grid points is smaller than the # number of kinks in the path. Such a strategy can be interesting if the number # of features is really large and there are enough samples to select a large From ee7b6a416e669ab39a134a72c83db9bbb5a6e85b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Oct 2021 22:42:12 +0200 Subject: [PATCH 14/35] iter --- .../linear_model/plot_lasso_model_selection.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index 74e4de9b06107..66f2f5004358f 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -124,9 +124,9 @@ # computing the criterion on the in-sample set provided at `fit`. Both criteria # are computed estimate the model error on the full training set and penalize # this over optimistic error. However, this penalty relies on a the proper -# estimation of the degrees of freedom and the noise variance. There also are derived for large -# samples (asymptotic results) and assume the model is correct, i.e. that the -# data are actually generated by this model. +# estimation of the degrees of freedom and the noise variance. There also are +# derived for large samples (asymptotic results) and assume the model is +# correct, i.e. that the data are actually generated by this model. # # These models also tend to break when the problem is badly conditioned (more # features than samples). It is then required to provide a estimate of the @@ -218,10 +218,10 @@ # computes the path points on a pre-specified grid (here we use the default). # Thus it is more efficient if the number of grid points is smaller than the # number of kinks in the path. Such a strategy can be interesting if the number -# of features is really large and there are enough samples to select a large -# amount. In terms of numerical errors, for heavily correlated variables, Lars -# will accumulate more errors, while the coordinate descent algorithm will only -# sample the path on a grid. +# of features is really large and there are enough samples to be selected in +# each of the cross-validation fold. In terms of numerical errors, for heavily +# correlated variables, Lars will accumulate more errors, while the coordinate +# descent algorithm will only sample the path on a grid. # # Note how the optimal value of alpha varies for each fold. This illustrates # why nested-cross validation is necessary when trying to evaluate the From fada91c5d998d6754089583f5adf5ec6fdb5dc0e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Oct 2021 23:37:38 +0200 Subject: [PATCH 15/35] Apply suggestions from code review Co-authored-by: Alexandre Gramfort --- sklearn/linear_model/_least_angle.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 05f3df8108d0e..ab6cc04d99263 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -2243,7 +2243,7 @@ def fit(self, X, y, copy_X=None): return self def _estimate_noise_variance(self, X, y, positive): - """Compute a variance estimator of an OLS model. + """Compute an estimate of the variance with an OLS model. 
Parameters ---------- @@ -2263,13 +2263,13 @@ def _estimate_noise_variance(self, X, y, positive): ---- Instead of using a ordinary linear regression, we will use a ridge model with a very low alpha for numerical stability in case of - collinearity features. + collinear features. """ if X.shape[0] <= X.shape[1]: raise ValueError( f"You are using {self.__class__.__name__} in the case where the number " "of samples is smaller than the number of features. In this setting, " - "getting a good estimator for the variance of the noise is not " + "getting a good estimate for the variance of the noise is not " "possible. Provide an estimate of the noise variance in the " "constructor." ) From 25e3e6ea5686baf625c01a91f0c7b29cbc6ac4e8 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 31 Oct 2021 13:01:00 +0100 Subject: [PATCH 16/35] Update sklearn/linear_model/_least_angle.py Co-authored-by: Alexandre Gramfort --- sklearn/linear_model/_least_angle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index ab6cc04d99263..c5f3720a852b6 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -2028,7 +2028,7 @@ class LassoLarsIC(LassoLars): noise_variance : float, default=None The estimated noise variance of the data. If `None`, we will compute - an unbiased estimator using an OLS model. However, it is only possible + an unbiased estimate using an OLS model. However, it is only possible in the case `n_samples > n_features`. .. versionadded:: 1.1 From 794418aab6a9c27a1510fbc56219a2fdbef43b07 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 2 Nov 2021 13:29:20 +0100 Subject: [PATCH 17/35] iter --- doc/modules/linear_model.rst | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 167309d56724e..aed6fc9fd5680 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -307,20 +307,24 @@ Alternatively, the estimator :class:`LassoLarsIC` proposes to use the Akaike information criterion (AIC) and the Bayes Information criterion (BIC). It is a computationally cheaper alternative to find the optimal value of alpha as the regularization path is computed only once instead of k+1 times -when using k-fold cross-validation. However, such criteria needs a -proper estimation of the degrees of freedom of the solution, are -derived for large samples (asymptotic results) and assume the model -is correct, i.e. that the data are generated by this model. -They also tend to break when the problem is badly conditioned -(more features than samples). +when using k-fold cross-validation. + +Indeed, these criteria are computed on the in-sample training set. In short, +they penalized the over-optimistic scores of the different Lasso models by +their flexibility (cf. to "Mathematical details" section below). + +However, such criteria needs a proper estimation of the degrees of freedom of +the solution, are derived for large samples (asymptotic results) and assume the +model is correct, i.e. that the data are generated by this model. They also +tend to break when the problem is badly conditioned (more features than +samples). .. 
figure:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_lars_ic_001.png :target: ../auto_examples/linear_model/plot_lasso_lars_ic.html :align: center :scale: 50% -Mathematical details -"""""""""""""""""""" +**Mathematical details** The definition of AIC (and thus BIC) might differ in the literature. In this section, we give more information regarding the criterion computed in From 30386c4688df2b3e5db1d6a8fe3b69af1aba9b2b Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 2 Nov 2021 13:31:02 +0100 Subject: [PATCH 18/35] iter --- doc/modules/linear_model.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index aed6fc9fd5680..a66de8801ea3a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -340,16 +340,16 @@ freedom in the previous section). For a linear Gaussian model, the maximum log-likelihood is defined as: .. math:: - \log(\hat{L}) = - \frac{n}{2} \log(2 \pi) - \frac{n}{2} \ln(\hat{\sigma}^2) - \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{2\hat{\sigma}^2} + \log(\hat{L}) = - \frac{n}{2} \log(2 \pi) - \frac{n}{2} \ln(\hat{\sigma}^2) - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{2\hat{\sigma}^2} where :math:`\hat{\sigma}^2` is an estimate of the noise variance, -:math:`y_i` and :math:`\hat{y_i}` are respectively the true and predicted +:math:`y_i` and :math:`\hat{y}_i` are respectively the true and predicted targets, and :math:`n` is the number of samples. Plugging the maximum log-likelihood in the AIC formula yields: .. math:: - AIC = n \log(2 \pi \hat{\sigma}^2) + \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{\hat{\sigma}^2} + 2d + AIC = n \log(2 \pi \hat{\sigma}^2) + \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\hat{\sigma}^2} + 2d The first term of the above expression is sometimes discarded since it is a constant when :math:`\hat{\sigma}^2` is provided. In addition, @@ -363,9 +363,9 @@ not provided (default), the noise variance is estimated via the unbiased estimator [13]_ defined as: .. math:: - \hat{\sigma^2} = \frac{\sum_{i=1}^{n} (y_i - \hat{y_i})^2}{n - p} + \hat{\sigma^2} = \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{n - p} -where :math:`p` is the number of features and :math:`\hat{y_i}` +where :math:`p` is the number of features and :math:`\hat{y}_i` is the predicted target using an ordinary least squares regression. In scikit-learn, we use a ridge model with a very small regularization in case of ill-conditioned design matrix. Note, that this formula is valid only when From 228ca79758b1303626450770197f6f60c967528e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 2 Nov 2021 13:32:54 +0100 Subject: [PATCH 19/35] Apply suggestions from code review Co-authored-by: Christian Lorentzen --- doc/modules/linear_model.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index a66de8801ea3a..454c163688801 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -354,8 +354,8 @@ Plugging the maximum log-likelihood in the AIC formula yields: The first term of the above expression is sometimes discarded since it is a constant when :math:`\hat{\sigma}^2` is provided. In addition, it is sometimes stated that the AIC is equivalent to the :math:`C_p` statistic -[12]_. However, this is important to note that it is up to some constant -and factor term. +[12]_. In a strict sense, however, it is equivalent only up to some constant +and a multiplicative factor. 
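To make the expression above concrete, here is a minimal sketch (our own helper, not the estimator's actual code path) that evaluates the criterion from a vector of residuals, a noise variance estimate and a number of non-zero coefficients, mirroring the formula adopted by :class:`LassoLarsIC` in this pull request:

import numpy as np

def information_criterion(residuals, sigma2, n_params, kind="aic"):
    """AIC/BIC as written above: n*log(2*pi*sigma2) + RSS/sigma2 + factor*d."""
    n = residuals.shape[0]
    factor = 2.0 if kind == "aic" else np.log(n)
    return (
        n * np.log(2 * np.pi * sigma2)
        + np.sum(residuals**2) / sigma2
        + factor * n_params
    )

rng = np.random.RandomState(0)
residuals = rng.normal(scale=0.5, size=100)  # placeholder residuals
print(information_criterion(residuals, sigma2=0.25, n_params=3, kind="aic"))
print(information_criterion(residuals, sigma2=0.25, n_params=3, kind="bic"))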
At last, we mentioned above that :math:`\hat{\sigma}^2` is an estimate of the noise variance. In :class:`LassoLarsIC` when the parameter `noise_variance` is From 10ececf879a930b6eeb3f0be419c6642e79e2fbd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 2 Nov 2021 14:10:16 +0100 Subject: [PATCH 20/35] iter --- doc/modules/linear_model.rst | 8 +++----- sklearn/linear_model/_least_angle.py | 15 ++++----------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 454c163688801..6bab49c996644 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -365,11 +365,9 @@ estimator [13]_ defined as: .. math:: \hat{\sigma^2} = \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{n - p} -where :math:`p` is the number of features and :math:`\hat{y}_i` -is the predicted target using an ordinary least squares regression. In -scikit-learn, we use a ridge model with a very small regularization in case -of ill-conditioned design matrix. Note, that this formula is valid only when -`n_samples > n_features`. +where :math:`p` is the number of features and :math:`\hat{y}_i` is the +predicted target using an ordinary least squares regression. Note, that this +formula is valid only when `n_samples > n_features`. .. topic:: Examples: diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index c5f3720a852b6..edcb38d89632d 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -19,7 +19,7 @@ from ._base import LinearModel from ._base import _deprecate_normalize -from ._ridge import ridge_regression +from ._base import LinearRegression from ..base import RegressorMixin, MultiOutputMixin # mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' @@ -2258,12 +2258,6 @@ def _estimate_noise_variance(self, X, y, positive): ------- noise_variance : float An estimator of the noise variance of an OLS model. - - Note - ---- - Instead of using a ordinary linear regression, we will use a ridge - model with a very low alpha for numerical stability in case of - collinear features. """ if X.shape[0] <= X.shape[1]: raise ValueError( @@ -2273,7 +2267,6 @@ def _estimate_noise_variance(self, X, y, positive): "possible. Provide an estimate of the noise variance in the " "constructor." ) - ols_coef = ridge_regression( - X, y, alpha=1e-12, positive=positive, check_input=False - ) - return np.sum((y - X @ ols_coef) ** 2) / (X.shape[0] - X.shape[1]) + ols_model = LinearRegression(positive=positive, fit_intercept=False) + y_pred = ols_model.fit(X, y).predict(X) + return np.sum((y - y_pred) ** 2) / (X.shape[0] - X.shape[1]) From e353b6aa313c32e83be392ae49cd9974f40a776d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Nov 2021 15:25:10 +0100 Subject: [PATCH 21/35] Apply suggestions from code review Co-authored-by: Christian Lorentzen --- doc/modules/linear_model.rst | 6 +++--- sklearn/linear_model/_least_angle.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 6bab49c996644..d35d9a1cf8168 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -310,13 +310,13 @@ as the regularization path is computed only once instead of k+1 times when using k-fold cross-validation. Indeed, these criteria are computed on the in-sample training set. 
In short, -they penalized the over-optimistic scores of the different Lasso models by +they penalize the over-optimistic scores of the different Lasso models by their flexibility (cf. to "Mathematical details" section below). -However, such criteria needs a proper estimation of the degrees of freedom of +However, such criteria need a proper estimation of the degrees of freedom of the solution, are derived for large samples (asymptotic results) and assume the model is correct, i.e. that the data are generated by this model. They also -tend to break when the problem is badly conditioned (more features than +tend to break when the problem is badly conditioned (e.g. more features than samples). .. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_lars_ic_001.png diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index edcb38d89632d..3d93bda7a33d1 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -2029,7 +2029,7 @@ class LassoLarsIC(LassoLars): noise_variance : float, default=None The estimated noise variance of the data. If `None`, we will compute an unbiased estimate using an OLS model. However, it is only possible - in the case `n_samples > n_features`. + in the case `n_samples > n_features + fit_intercept`. .. versionadded:: 1.1 @@ -2259,7 +2259,7 @@ def _estimate_noise_variance(self, X, y, positive): noise_variance : float An estimator of the noise variance of an OLS model. """ - if X.shape[0] <= X.shape[1]: + if X.shape[0] <= X.shape[1] + self.fit_intercept: raise ValueError( f"You are using {self.__class__.__name__} in the case where the number " "of samples is smaller than the number of features. In this setting, " @@ -2269,4 +2269,4 @@ def _estimate_noise_variance(self, X, y, positive): ) ols_model = LinearRegression(positive=positive, fit_intercept=False) y_pred = ols_model.fit(X, y).predict(X) - return np.sum((y - y_pred) ** 2) / (X.shape[0] - X.shape[1]) + return np.sum((y - y_pred) ** 2) / (X.shape[0] - X.shape[1] - self.fit_intercept) From 1c0744733b8579c13c85ced29b49122bd01199b3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 3 Nov 2021 15:26:09 +0100 Subject: [PATCH 22/35] Apply suggestions from code review Co-authored-by: Christian Lorentzen --- sklearn/linear_model/tests/test_least_angle.py | 6 +----- sklearn/utils/estimator_checks.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 774c5327bcc71..72f1796726986 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -946,12 +946,8 @@ def test_lars_numeric_consistency(LARS, has_coef_path, args): def test_lassolarsic_alpha_selection(criterion): """Check that we properly compute the AIC and BIC score. - In this test, we reproduce the example of the Fig. 2 of Zou et al. + In this test, we reproduce the example of the Fig. 2 of Zou et al. (reference [1] in LassoLarsIC) In this example, only 7 features should be selected. 
- - Non-regression test for: - https://github.com/scikit-learn/scikit-learn/issues/14566 - https://github.com/scikit-learn/scikit-learn/issues/17145 """ model = make_pipeline( StandardScaler(), LassoLarsIC(criterion=criterion, normalize=False) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index dc5ef1df707b6..1b81214d961b8 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -660,7 +660,7 @@ def _set_checking_parameters(estimator): if name == "LassoLarsIC": # Noise variance estimation does not work when `n_samples < n_features`. - # We need to provide the noise variance explicitely. + # We need to provide the noise variance explicitly. estimator.set_params(noise_variance=1.0) if hasattr(estimator, "n_clusters"): From 47be241cfa0e10cea3f9c69b155dad49bdea77f7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 9 Nov 2021 11:18:04 +0100 Subject: [PATCH 23/35] iter --- doc/modules/linear_model.rst | 5 ++--- sklearn/linear_model/_least_angle.py | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d35d9a1cf8168..66246010ab040 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -315,9 +315,8 @@ their flexibility (cf. to "Mathematical details" section below). However, such criteria need a proper estimation of the degrees of freedom of the solution, are derived for large samples (asymptotic results) and assume the -model is correct, i.e. that the data are generated by this model. They also -tend to break when the problem is badly conditioned (e.g. more features than -samples). +correct model is candidates under investigation. They also tend to break when +the problem is badly conditioned (e.g. more features than samples). .. figure:: ../auto_examples/linear_model/images/sphx_glr_plot_lasso_lars_ic_001.png :target: ../auto_examples/linear_model/plot_lasso_lars_ic.html diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 3d93bda7a33d1..347824afeb2d9 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -2113,7 +2113,9 @@ class LassoLarsIC(LassoLars): -------- >>> from sklearn import linear_model >>> reg = linear_model.LassoLarsIC(criterion='bic', normalize=False) - >>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1.1111, 0, -1.1111]) + >>> X = [[-2, 2], [-1, 1], [0, 0], [1, 1], [2, 2]] + >>> y = [-2.2222, -1.1111, 0, -1.1111, -2.2222] + >>> reg.fit(X, y) LassoLarsIC(criterion='bic', normalize=False) >>> print(reg.coef_) [ 0. -1.11...] @@ -2252,7 +2254,11 @@ def _estimate_noise_variance(self, X, y, positive): centered. y : ndarray of shape (n_samples,) - Associated target + Associated target. + + positive : bool, default=False + Restrict coefficients to be >= 0. This should be inline with + the `positive` parameter from `LassoLarsIC`. Returns ------- @@ -2267,6 +2273,9 @@ def _estimate_noise_variance(self, X, y, positive): "possible. Provide an estimate of the noise variance in the " "constructor." 
) + # X is supposed to be centered and we don't need to fit with an intercept ols_model = LinearRegression(positive=positive, fit_intercept=False) y_pred = ols_model.fit(X, y).predict(X) - return np.sum((y - y_pred) ** 2) / (X.shape[0] - X.shape[1] - self.fit_intercept) + return np.sum((y - y_pred) ** 2) / ( + X.shape[0] - X.shape[1] - self.fit_intercept + ) From 121b48f20b83f22f18a48d5d2e923d063ba4c19d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 9 Nov 2021 11:22:57 +0100 Subject: [PATCH 24/35] iter --- doc/modules/linear_model.rst | 12 ++++++------ sklearn/linear_model/tests/test_least_angle.py | 12 ++++++++---- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 66246010ab040..fffec70724514 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -339,30 +339,30 @@ freedom in the previous section). For a linear Gaussian model, the maximum log-likelihood is defined as: .. math:: - \log(\hat{L}) = - \frac{n}{2} \log(2 \pi) - \frac{n}{2} \ln(\hat{\sigma}^2) - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{2\hat{\sigma}^2} + \log(\hat{L}) = - \frac{n}{2} \log(2 \pi) - \frac{n}{2} \ln(\sigma^2) - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{2\sigma^2} -where :math:`\hat{\sigma}^2` is an estimate of the noise variance, +where :math:`\sigma^2` is an estimate of the noise variance, :math:`y_i` and :math:`\hat{y}_i` are respectively the true and predicted targets, and :math:`n` is the number of samples. Plugging the maximum log-likelihood in the AIC formula yields: .. math:: - AIC = n \log(2 \pi \hat{\sigma}^2) + \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\hat{\sigma}^2} + 2d + AIC = n \log(2 \pi \sigma^2) + \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sigma^2} + 2d The first term of the above expression is sometimes discarded since it is a -constant when :math:`\hat{\sigma}^2` is provided. In addition, +constant when :math:`\sigma^2` is provided. In addition, it is sometimes stated that the AIC is equivalent to the :math:`C_p` statistic [12]_. In a strict sense, however, it is equivalent only up to some constant and a multiplicative factor. -At last, we mentioned above that :math:`\hat{\sigma}^2` is an estimate of the +At last, we mentioned above that :math:`\sigma^2` is an estimate of the noise variance. In :class:`LassoLarsIC` when the parameter `noise_variance` is not provided (default), the noise variance is estimated via the unbiased estimator [13]_ defined as: .. math:: - \hat{\sigma^2} = \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{n - p} + \sigma^2 = \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{n - p} where :math:`p` is the number of features and :math:`\hat{y}_i` is the predicted target using an ordinary least squares regression. Note, that this diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 72f1796726986..7f02d75a644b5 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -946,8 +946,9 @@ def test_lars_numeric_consistency(LARS, has_coef_path, args): def test_lassolarsic_alpha_selection(criterion): """Check that we properly compute the AIC and BIC score. - In this test, we reproduce the example of the Fig. 2 of Zou et al. (reference [1] in LassoLarsIC) - In this example, only 7 features should be selected. + In this test, we reproduce the example of the Fig. 2 of Zou et al. + (reference [1] in LassoLarsIC) In this example, only 7 features should be + selected. 
""" model = make_pipeline( StandardScaler(), LassoLarsIC(criterion=criterion, normalize=False) @@ -958,13 +959,16 @@ def test_lassolarsic_alpha_selection(criterion): assert best_alpha_selected == 7 -def test_lassolarsic_noise_variance(): +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_lassolarsic_noise_variance(fit_intercept): """Check the behaviour when `n_samples` < `n_features` and that one needs to provide the noise variance.""" rng = np.random.RandomState(0) X, y = datasets.make_regression(n_samples=10, n_features=100, random_state=rng) - model = make_pipeline(StandardScaler(), LassoLarsIC(normalize=False)) + model = make_pipeline( + StandardScaler(), LassoLarsIC(fit_intercept=fit_intercept, normalize=False) + ) err_msg = ( "You are using LassoLarsIC in the case where the number of samples is smaller" From 50c862dc296c38dc342d4086a56e7fbbe5ed456e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 9 Nov 2021 11:32:27 +0100 Subject: [PATCH 25/35] iter --- doc/modules/linear_model.rst | 9 +++++++++ sklearn/mixture/_gaussian_mixture.py | 6 ++++++ 2 files changed, 15 insertions(+) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index fffec70724514..32e7350d4cf6e 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -323,6 +323,8 @@ the problem is badly conditioned (e.g. more features than samples). :align: center :scale: 50% +.. _aic_bic: + **Mathematical details** The definition of AIC (and thus BIC) might differ in the literature. In this @@ -336,6 +338,13 @@ where :math:`\hat{L}` is the maximum likelihood of the model and :math:`d` is the number of parameters (as well referred to as degrees of freedom in the previous section). +The definition of BIC replace the constant `2` by `log(N)`: + +.. math:: + BIC = -2 \log(\hat{L}) + \log(N) d + +where :math:`N` is the number of samples. + For a linear Gaussian model, the maximum log-likelihood is defined as: .. math:: diff --git a/sklearn/mixture/_gaussian_mixture.py b/sklearn/mixture/_gaussian_mixture.py index 995366b247778..42b76e05de6ae 100644 --- a/sklearn/mixture/_gaussian_mixture.py +++ b/sklearn/mixture/_gaussian_mixture.py @@ -813,6 +813,9 @@ def _n_parameters(self): def bic(self, X): """Bayesian information criterion for the current model on the input X. + You can refer to this :ref:`mathematical section ` for more + details regarding the formulation of the BIC used. + Parameters ---------- X : array of shape (n_samples, n_dimensions) @@ -830,6 +833,9 @@ def bic(self, X): def aic(self, X): """Akaike information criterion for the current model on the input X. + You can refer to this :ref:`mathematical section ` for more + details regarding the formulation of the AIC used. 
+ Parameters ---------- X : array of shape (n_samples, n_dimensions) From 37c2c938e41ee3977e170aa93f27580ecc17d733 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 9 Nov 2021 14:11:00 +0100 Subject: [PATCH 26/35] iter --- examples/linear_model/plot_lasso_model_selection.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index 66f2f5004358f..0bdff01943a0c 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -89,6 +89,9 @@ results["BIC criterion"] = lasso_lars_ic[-1].criterion_ alpha_bic = lasso_lars_ic[-1].alpha_ +# %% +results + # %% # Finally, we can plot the AIC and BIC values for the different alpha values. # The vertical lines in the plot corresponds to the alpha chosen for each From 73a9f5ebc4d4d3f631f7c4f3b98e3929539d8c64 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 22 Nov 2021 11:02:23 +0100 Subject: [PATCH 27/35] christian improvements Co-authored-by: Christian Lorentzen --- examples/linear_model/plot_lasso_lars_ic.py | 17 ++++++------- .../plot_lasso_model_selection.py | 24 +++++++++---------- sklearn/linear_model/_least_angle.py | 2 +- .../linear_model/tests/test_least_angle.py | 2 +- 4 files changed, 23 insertions(+), 22 deletions(-) diff --git a/examples/linear_model/plot_lasso_lars_ic.py b/examples/linear_model/plot_lasso_lars_ic.py index c9353836d5e75..cb2387dbab517 100644 --- a/examples/linear_model/plot_lasso_lars_ic.py +++ b/examples/linear_model/plot_lasso_lars_ic.py @@ -35,12 +35,13 @@ # %% # Scikit-learn provides an estimator called -# :class:`~sklearn.linear_model.LinearLarsIC` that uses an information -# criterion, namely the AIC or BIC, to select the best model. Before fitting +# :class:`~sklearn.linear_model.LinearLarsIC` that uses either Akaike's +# information criterion (AIC) or the Bayesian information criterion (BIC) to +# select the best model. Before fitting # this model, we will scale the dataset. # # In the following, we are going to fit two models to compare the values -# reported by the AIC and the BIC. +# reported by AIC and BIC. from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LassoLarsIC from sklearn.pipeline import make_pipeline @@ -53,11 +54,11 @@ # %% # To be in line with the defintion in [ZHT2007]_, we need to rescale the # AIC and the BIC. Indeed, Zou et al. are ignoring some constant terms -# compared to the true definition of AIC derivated from the maximum +# compared to the original definition of AIC derivated from the maximum # log-likelihood of a linear model. You can refer to # :ref:`mathematical detail section for the User Guide `. def zou_et_al_criterion_rescaling(criterion, n_samples, noise_variance): - """Rescale the information criterion to follow Zou et al. definition.""" + """Rescale the information criterion to follow the definition of Zou et al.""" return criterion - n_samples * np.log(2 * np.pi * noise_variance) - n_samples @@ -88,13 +89,13 @@ def zou_et_al_criterion_rescaling(criterion, n_samples, noise_variance): )[0] # %% -# Now that we collected the AIC and BIC, we can as well check that the minimum -# of both criteria happens at the same alpha. Then, we can simplify the +# Now that we collected the AIC and BIC, we can as well check that the minima +# of both criteria happen at the same alpha. Then, we can simplify the # following plot. 
index_alpha_path_aic == index_alpha_path_bic # %% -# Now, we can plot the AIC and BIC criterion and the subsequent selected +# Finally, we can plot the AIC and BIC criterion and the subsequent selected # regularization parameter. import matplotlib.pyplot as plt diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index 0bdff01943a0c..80549725a066f 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -4,11 +4,11 @@ ================================================= This example focuses on model selection for Lasso models that are -linear models with a L1 penalty for regression problems. +linear models with an L1 penalty for regression problems. Indeed, several strategies can be used to select the value of the -regularization parameter: using an information criterion, namely AIC or BIC or -via cross-validation. +regularization parameter: via cross-validation or using an information criterion, namely AIC or BIC. +. In what follows, we will discuss in details the different strategies. """ @@ -34,8 +34,8 @@ X.head() # %% -# In addition, we will add some random features to the original data to -# make obvious the feature selection performed by the Lasso model. +# In addition, we add some random features to the original data to +# better illustrate the feature selection performed by the Lasso model. import numpy as np import pandas as pd @@ -51,7 +51,7 @@ # %% # Selecting Lasso via an information criterion # -------------------------------------------- -# :class:`~sklearn.linear_model.LassoLarsIC` provides an Lasso estimator that +# :class:`~sklearn.linear_model.LassoLarsIC` provides a Lasso estimator that # uses the Akaike information criterion (AIC) or the Bayes information # criterion (BIC) to select the optimal value of the regularization # parameter alpha. @@ -94,9 +94,9 @@ # %% # Finally, we can plot the AIC and BIC values for the different alpha values. -# The vertical lines in the plot corresponds to the alpha chosen for each +# The vertical lines in the plot correspond to the alpha chosen for each # criterion. The selected alpha corresponds to the minimum of the AIC or BIC -# criteria. +# criterion. ax = results.plot() ax.vlines( alpha_aic, @@ -126,18 +126,18 @@ # Model selection with an information-criterion is very fast. It relies on # computing the criterion on the in-sample set provided at `fit`. Both criteria # are computed estimate the model error on the full training set and penalize -# this over optimistic error. However, this penalty relies on a the proper -# estimation of the degrees of freedom and the noise variance. There also are +# this overly optimistic error. However, this penalty relies on a the proper +# estimation of the degrees of freedom and the noise variance. Both are # derived for large samples (asymptotic results) and assume the model is # correct, i.e. that the data are actually generated by this model. # # These models also tend to break when the problem is badly conditioned (more -# features than samples). It is then required to provide a estimate of the +# features than samples). It is then required to provide an estimate of the # noise variance. # # Selecting Lasso via cross-validation # ------------------------------------ -# Lasso estimator can be implemented with different solvers: coordinate descent +# The Lasso estimator can be implemented with different solvers: coordinate descent # and least angle regression. 
They differ with regards to their execution speed # and sources of numerical errors. # diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 347824afeb2d9..5127aace2c68e 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -2273,7 +2273,7 @@ def _estimate_noise_variance(self, X, y, positive): "possible. Provide an estimate of the noise variance in the " "constructor." ) - # X is supposed to be centered and we don't need to fit with an intercept + # X and y are already centered and we don't need to fit with an intercept ols_model = LinearRegression(positive=positive, fit_intercept=False) y_pred = ols_model.fit(X, y).predict(X) return np.sum((y - y_pred) ** 2) / ( diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index 7f02d75a644b5..e24176a1c4f29 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -964,7 +964,7 @@ def test_lassolarsic_noise_variance(fit_intercept): """Check the behaviour when `n_samples` < `n_features` and that one needs to provide the noise variance.""" rng = np.random.RandomState(0) - X, y = datasets.make_regression(n_samples=10, n_features=100, random_state=rng) + X, y = datasets.make_regression(n_samples=10, n_features=11 - fit_intercept, random_state=rng) model = make_pipeline( StandardScaler(), LassoLarsIC(fit_intercept=fit_intercept, normalize=False) From e3dc24443b654be5e666190ceffef55a6769dd19 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 22 Nov 2021 11:23:51 +0100 Subject: [PATCH 28/35] last review --- examples/linear_model/plot_lasso_lars_ic.py | 8 +++++ .../plot_lasso_model_selection.py | 31 ++++++++++++++++--- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_lasso_lars_ic.py b/examples/linear_model/plot_lasso_lars_ic.py index cb2387dbab517..6bb37ac2c487d 100644 --- a/examples/linear_model/plot_lasso_lars_ic.py +++ b/examples/linear_model/plot_lasso_lars_ic.py @@ -8,6 +8,14 @@ diabetes dataset and the AIC and the BIC criteria are used to select the best model. +.. note:: + It is important to note the optimization to find `alpha` with + :class:`~sklearn.linear_model.LassoLarsIC` relies on the AIC or BIC + criterion that are computed insample, thus on the training set directly. + This approach differs from the cross-validation procedure. For a comparison + of the two approaches, you can refer to the following example: + :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_model_selection.py`. + .. topic:: References .. [ZHT2007] `Zou, Hui, Trevor Hastie, and Robert Tibshirani. diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index 80549725a066f..ba8fda9b325a7 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -7,8 +7,8 @@ linear models with an L1 penalty for regression problems. Indeed, several strategies can be used to select the value of the -regularization parameter: via cross-validation or using an information criterion, namely AIC or BIC. -. +regularization parameter: via cross-validation or using an information +criterion, namely AIC or BIC. In what follows, we will discuss in details the different strategies. 
""" @@ -46,7 +46,8 @@ columns=[f"random_{i:02d}" for i in range(n_random_features)], ) X = pd.concat([X, X_random], axis=1) -X.head() +# Show only a subset of the columns +X[X.columns[::3]].head() # %% # Selecting Lasso via an information criterion @@ -89,8 +90,15 @@ results["BIC criterion"] = lasso_lars_ic[-1].criterion_ alpha_bic = lasso_lars_ic[-1].alpha_ + # %% -results +# We can check which value of `alpha` lead to the minimum AIC and BIC. +def highlight_min(x): + x_min = x.min() + return ["font-weight: bold" if v == x_min else "" for v in x] + + +results.style.apply(highlight_min) # %% # Finally, we can plot the AIC and BIC values for the different alpha values. @@ -230,3 +238,18 @@ # why nested-cross validation is necessary when trying to evaluate the # performance of a method for which a parameter is chosen by cross-validation: # this choice of parameter may not be optimal for unseen data. +# +# Conclusion +# ---------- +# In this tutorial, we presented to approaches for selecting the best +# hyperparameter `alpha`: one strategy that find the optimal value of `alpha` +# only using the training set and some information criterion and another based +# on cross-validation. +# +# In this example, both approaches are working similarly. The insample +# hyperparameter selection even shows its efficacy in terms of computational +# performance. However, it should be noted that it can only be used when the +# number of samples is large enough in regards to the number of features. +# +# That's why, hyperparameter optimization via cross-validation is a safe +# strategy since it would work is the different settings. From 227f2a1367b04b5e040fd209d1ceae84e43bd69e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 22 Nov 2021 11:24:27 +0100 Subject: [PATCH 29/35] last review --- examples/linear_model/plot_lasso_model_selection.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index ba8fda9b325a7..999dcee1c4193 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -145,9 +145,9 @@ def highlight_min(x): # # Selecting Lasso via cross-validation # ------------------------------------ -# The Lasso estimator can be implemented with different solvers: coordinate descent -# and least angle regression. They differ with regards to their execution speed -# and sources of numerical errors. +# The Lasso estimator can be implemented with different solvers: coordinate +# descent and least angle regression. They differ with regards to their +# execution speed and sources of numerical errors. 
# # In scikit-learn, two different estimators are available with integrated # cross-validation: :class:`~sklearn.linear_model.LassoCV` and From 19f903227c16721fc593f768746fa7c522e116c7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 22 Nov 2021 11:31:43 +0100 Subject: [PATCH 30/35] black --- sklearn/linear_model/tests/test_least_angle.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index e24176a1c4f29..0db0a2fbb29ff 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -964,7 +964,9 @@ def test_lassolarsic_noise_variance(fit_intercept): """Check the behaviour when `n_samples` < `n_features` and that one needs to provide the noise variance.""" rng = np.random.RandomState(0) - X, y = datasets.make_regression(n_samples=10, n_features=11 - fit_intercept, random_state=rng) + X, y = datasets.make_regression( + n_samples=10, n_features=11 - fit_intercept, random_state=rng + ) model = make_pipeline( StandardScaler(), LassoLarsIC(fit_intercept=fit_intercept, normalize=False) From d282d94f5d03e55cb1cf79f0063fd76ab1c90994 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 22 Nov 2021 15:36:38 +0100 Subject: [PATCH 31/35] Apply suggestions from code review Co-authored-by: Christian Lorentzen --- examples/linear_model/plot_lasso_lars_ic.py | 6 ++--- .../plot_lasso_model_selection.py | 26 +++++++++---------- sklearn/linear_model/_least_angle.py | 4 +-- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/examples/linear_model/plot_lasso_lars_ic.py b/examples/linear_model/plot_lasso_lars_ic.py index 6bb37ac2c487d..cf7706962f91b 100644 --- a/examples/linear_model/plot_lasso_lars_ic.py +++ b/examples/linear_model/plot_lasso_lars_ic.py @@ -9,9 +9,9 @@ the best model. .. note:: - It is important to note the optimization to find `alpha` with + It is important to note that the optimization to find `alpha` with :class:`~sklearn.linear_model.LassoLarsIC` relies on the AIC or BIC - criterion that are computed insample, thus on the training set directly. + criterion that are computed in-sample, thus on the training set directly. This approach differs from the cross-validation procedure. For a comparison of the two approaches, you can refer to the following example: :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_model_selection.py`. @@ -62,7 +62,7 @@ # %% # To be in line with the defintion in [ZHT2007]_, we need to rescale the # AIC and the BIC. Indeed, Zou et al. are ignoring some constant terms -# compared to the original definition of AIC derivated from the maximum +# compared to the original definition of AIC derived from the maximum # log-likelihood of a linear model. You can refer to # :ref:`mathematical detail section for the User Guide `. def zou_et_al_criterion_rescaling(criterion, n_samples, noise_variance): diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index 999dcee1c4193..1546175b0ebb8 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -92,7 +92,7 @@ # %% -# We can check which value of `alpha` lead to the minimum AIC and BIC. +# We can check which value of `alpha` leads to the minimum AIC and BIC. 
def highlight_min(x): x_min = x.min() return ["font-weight: bold" if v == x_min else "" for v in x] @@ -132,9 +132,9 @@ def highlight_min(x): # %% # Model selection with an information-criterion is very fast. It relies on -# computing the criterion on the in-sample set provided at `fit`. Both criteria -# are computed estimate the model error on the full training set and penalize -# this overly optimistic error. However, this penalty relies on a the proper +# computing the criterion on the in-sample set provided to `fit`. Both criteria +# estimate the model generalization error based on the training set error and penalize +# this overly optimistic error. However, this penalty relies on a proper # estimation of the degrees of freedom and the noise variance. Both are # derived for large samples (asymptotic results) and assume the model is # correct, i.e. that the data are actually generated by this model. @@ -222,7 +222,7 @@ def highlight_min(x): # .................................... # Both algorithms give roughly the same results. # -# Lars computes a path solution only for each kink in the path. As a result, it +# Lars computes a solution path only for each kink in the path. As a result, it # is very efficient when there are only of few kinks, which is the case if # there are few features or samples. Also, it is able to compute the full path # without setting any hyperparameter. On the opposite, coordinate descent @@ -241,15 +241,15 @@ def highlight_min(x): # # Conclusion # ---------- -# In this tutorial, we presented to approaches for selecting the best -# hyperparameter `alpha`: one strategy that find the optimal value of `alpha` -# only using the training set and some information criterion and another based -# on cross-validation. +# In this tutorial, we presented two approaches for selecting the best +# hyperparameter `alpha`: one strategy finds the optimal value of `alpha` +# by only using the training set and some information criterion, and another +# strategy is based on cross-validation. # -# In this example, both approaches are working similarly. The insample +# In this example, both approaches are working similarly. The in-sample # hyperparameter selection even shows its efficacy in terms of computational # performance. However, it should be noted that it can only be used when the -# number of samples is large enough in regards to the number of features. +# number of samples is large enough compared to the number of features. # -# That's why, hyperparameter optimization via cross-validation is a safe -# strategy since it would work is the different settings. +# That's why hyperparameter optimization via cross-validation is a safe +# strategy: it works in different settings. diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 5127aace2c68e..b7b5683fc0f26 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -2027,8 +2027,8 @@ class LassoLarsIC(LassoLars): a sparse solution is expected and/or reached. noise_variance : float, default=None - The estimated noise variance of the data. If `None`, we will compute - an unbiased estimate using an OLS model. However, it is only possible + The estimated noise variance of the data. If `None`, an unbiased + estimate is computed by an OLS model. However, it is only possible in the case `n_samples > n_features + fit_intercept`. .. 
versionadded:: 1.1 From 369f6a79f00e10f5593cafffe07861f2b3c3357f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 22 Nov 2021 15:40:26 +0100 Subject: [PATCH 32/35] iter --- examples/linear_model/plot_lasso_model_selection.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index 1546175b0ebb8..8954465b2e45d 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -133,9 +133,9 @@ def highlight_min(x): # %% # Model selection with an information-criterion is very fast. It relies on # computing the criterion on the in-sample set provided to `fit`. Both criteria -# estimate the model generalization error based on the training set error and penalize -# this overly optimistic error. However, this penalty relies on a proper -# estimation of the degrees of freedom and the noise variance. Both are +# estimate the model generalization error based on the training set error and +# penalize this overly optimistic error. However, this penalty relies on a +# proper estimation of the degrees of freedom and the noise variance. Both are # derived for large samples (asymptotic results) and assume the model is # correct, i.e. that the data are actually generated by this model. # @@ -237,7 +237,8 @@ def highlight_min(x): # Note how the optimal value of alpha varies for each fold. This illustrates # why nested-cross validation is necessary when trying to evaluate the # performance of a method for which a parameter is chosen by cross-validation: -# this choice of parameter may not be optimal for unseen data. +# this choice of parameter may not be optimal for a final evaluation on +# unseen test set only. # # Conclusion # ---------- From e426eb4acdecf7f67c19ccd25cb81931cd408201 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 23 Nov 2021 11:09:57 +0100 Subject: [PATCH 33/35] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- doc/modules/linear_model.rst | 4 ++-- doc/whats_new/v1.1.rst | 14 +++++++------- examples/linear_model/plot_lasso_lars_ic.py | 2 +- .../linear_model/plot_lasso_model_selection.py | 4 ++-- sklearn/linear_model/_least_angle.py | 10 +++++----- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 32e7350d4cf6e..a58ebc8ed03af 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -338,7 +338,7 @@ where :math:`\hat{L}` is the maximum likelihood of the model and :math:`d` is the number of parameters (as well referred to as degrees of freedom in the previous section). -The definition of BIC replace the constant `2` by `log(N)`: +The definition of BIC replace the constant :math:`2` by :math:`\log(N)`: .. math:: BIC = -2 \log(\hat{L}) + \log(N) d @@ -357,7 +357,7 @@ targets, and :math:`n` is the number of samples. Plugging the maximum log-likelihood in the AIC formula yields: .. math:: - AIC = n \log(2 \pi \sigma^2) + \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sigma^2} + 2d + AIC = n \log(2 \pi \sigma^2) + \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sigma^2} + 2 d The first term of the above expression is sometimes discarded since it is a constant when :math:`\sigma^2` is provided. 
In addition, diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 36498fd823fdc..94fc0ca160396 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -144,15 +144,15 @@ Changelog :mod:`sklearn.linear_model` ........................... -- |API| Add a parameter `noise_variance` to :class:`linear_model.LassoLarsIC` - in order to provide an estimate of the noise variance. This is particularly - relevant when `n_features > n_samples` and the estimator of the noise - variance cannot be computed. +- |API| :class:`linear_model.LassoLarsIC` now exposes `noise_variance` as + a parameter in order to provide an estimate of the noise variance. + This is particularly relevant when `n_features > n_samples` and the + estimator of the noise variance cannot be computed. :pr:`21481` by :user:`Guillaume Lemaitre ` -- |Fix| Fixed the computation of AIC and BIC in - :class:`linear_model.LassoLarsIC`. An error is now raised when - `n_features > n_samples` and the noise variance is not provided. +- |Fix| :class:`linear_model.LassoLarsIC` now correctly computes AIC + and BIC. An error is now raised when `n_features > n_samples` and + when the noise variance is not provided. :pr:`21481` by :user:`Guillaume Lemaitre ` and :user:`Andrés Babino `. diff --git a/examples/linear_model/plot_lasso_lars_ic.py b/examples/linear_model/plot_lasso_lars_ic.py index cf7706962f91b..511c7ae0b7e10 100644 --- a/examples/linear_model/plot_lasso_lars_ic.py +++ b/examples/linear_model/plot_lasso_lars_ic.py @@ -11,7 +11,7 @@ .. note:: It is important to note that the optimization to find `alpha` with :class:`~sklearn.linear_model.LassoLarsIC` relies on the AIC or BIC - criterion that are computed in-sample, thus on the training set directly. + criteria that are computed in-sample, thus on the training set directly. This approach differs from the cross-validation procedure. For a comparison of the two approaches, you can refer to the following example: :ref:`sphx_glr_auto_examples_linear_model_plot_lasso_model_selection.py`. diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index 8954465b2e45d..cced653450380 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -249,8 +249,8 @@ def highlight_min(x): # # In this example, both approaches are working similarly. The in-sample # hyperparameter selection even shows its efficacy in terms of computational -# performance. However, it should be noted that it can only be used when the -# number of samples is large enough compared to the number of features. +# performance. However, it can only be used when the number of samples is large +# enough compared to the number of features. # # That's why hyperparameter optimization via cross-validation is a safe # strategy: it works in different settings. diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index b7b5683fc0f26..302d1026c8ccc 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -2029,7 +2029,7 @@ class LassoLarsIC(LassoLars): noise_variance : float, default=None The estimated noise variance of the data. If `None`, an unbiased estimate is computed by an OLS model. However, it is only possible - in the case `n_samples > n_features + fit_intercept`. + in the case where `n_samples > n_features + fit_intercept`. .. 
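The unbiased OLS estimate referred to in this docstring can be sketched as below; this is only an illustration of the formula RSS / (n_samples - n_features - fit_intercept), which is well defined exactly when n_samples > n_features + fit_intercept. The private helper actually added in this PR may differ in its details.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

X, y = make_regression(n_samples=100, n_features=10, noise=2.0, random_state=0)

ols = LinearRegression(fit_intercept=True).fit(X, y)
rss = np.sum((y - ols.predict(X)) ** 2)
# Unbiased estimate: divide by n_samples - n_features - fit_intercept,
# which is only positive when n_samples > n_features + fit_intercept.
noise_variance = rss / (X.shape[0] - X.shape[1] - 1)
print(noise_variance)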
versionadded:: 1.1 @@ -2057,7 +2057,7 @@ class LassoLarsIC(LassoLars): criterion_ : array-like of shape (n_alphas,) The value of the information criteria ('aic', 'bic') across all alphas. The alpha which has the smallest information criterion is - chosen. + chosen, as specified in [1]_. noise_variance_ : float The estimated noise variance from the data used to compute the @@ -2203,9 +2203,9 @@ def fit(self, X, y, copy_X=None): n_samples = X.shape[0] if self.criterion == "aic": - factor_criterion = 2 + criterion_factor = 2 elif self.criterion == "bic": - factor_criterion = log(n_samples) + criterion_factor = log(n_samples) else: raise ValueError( f"criterion should be either bic or aic, got {self.criterion!r}" @@ -2235,7 +2235,7 @@ def fit(self, X, y, copy_X=None): self.criterion_ = ( n_samples * np.log(2 * np.pi * self.noise_variance_) + residuals_sum_squares / self.noise_variance_ - + factor_criterion * degrees_of_freedom + + criterion_factor * degrees_of_freedom ) n_best = np.argmin(self.criterion_) From 6f08ddfc7228eed06ae94c3661be0bb0be097c5a Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 23 Nov 2021 11:24:52 +0100 Subject: [PATCH 34/35] reviews --- doc/modules/linear_model.rst | 4 ++-- examples/linear_model/plot_lasso_lars_ic.py | 4 ++-- .../linear_model/plot_lasso_model_selection.py | 18 ++++++++++-------- sklearn/linear_model/_least_angle.py | 4 ++-- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index a58ebc8ed03af..7243990bb5ffe 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -384,10 +384,10 @@ formula is valid only when `n_samples > n_features`. .. topic:: References - .. [12] `Zou, Hui, Trevor Hastie, and Robert Tibshirani. + .. [12] :arxiv:`Zou, Hui, Trevor Hastie, and Robert Tibshirani. "On the degrees of freedom of the lasso." The Annals of Statistics 35.5 (2007): 2173-2192. - `_ + <0712.0881.pdf>` .. [13] `Cherkassky, Vladimir, and Yunqian Ma. "Comparison of model selection for regression." diff --git a/examples/linear_model/plot_lasso_lars_ic.py b/examples/linear_model/plot_lasso_lars_ic.py index 511c7ae0b7e10..2f5392696ecc9 100644 --- a/examples/linear_model/plot_lasso_lars_ic.py +++ b/examples/linear_model/plot_lasso_lars_ic.py @@ -18,10 +18,10 @@ .. topic:: References - .. [ZHT2007] `Zou, Hui, Trevor Hastie, and Robert Tibshirani. + .. [ZHT2007] :arxiv:`Zou, Hui, Trevor Hastie, and Robert Tibshirani. "On the degrees of freedom of the lasso." The Annals of Statistics 35.5 (2007): 2173-2192. 
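To make the criterion assembly in the hunk above concrete, the value minimized over the path of candidate alphas can be sketched in isolation as follows. The per-alpha residual sums of squares and degrees of freedom below are placeholder arrays, not real LARS output.

import numpy as np

n_samples = 100
noise_variance = 2.5
residuals_sum_squares = np.array([400.0, 320.0, 290.0, 285.0])  # one per alpha
degrees_of_freedom = np.array([2, 4, 6, 9])                     # non-zero coefs

for criterion, criterion_factor in [("aic", 2), ("bic", np.log(n_samples))]:
    criterion_ = (
        n_samples * np.log(2 * np.pi * noise_variance)
        + residuals_sum_squares / noise_variance
        + criterion_factor * degrees_of_freedom
    )
    print(criterion, "-> index of best alpha:", np.argmin(criterion_))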
- `_ + <0712.0881>` """ # Author: Alexandre Gramfort diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index cced653450380..0bcabcea66a15 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -171,15 +171,16 @@ def highlight_min(x): import matplotlib.pyplot as plt ymin, ymax = 2300, 3800 -plt.semilogx(model[-1].alphas_, model[-1].mse_path_, linestyle=":") +lasso = model[-1] +plt.semilogx(lasso.alphas_, lasso.mse_path_, linestyle=":") plt.plot( - model[-1].alphas_, - model[-1].mse_path_.mean(axis=-1), + lasso.alphas_, + lasso.mse_path_.mean(axis=-1), color="black", label="Average across the folds", linewidth=2, ) -plt.axvline(model[-1].alpha_, linestyle="--", color="black", label="alpha: CV estimate") +plt.axvline(lasso.alpha_, linestyle="--", color="black", label="alpha: CV estimate") plt.ylim(ymin, ymax) plt.xlabel(r"$\alpha$") @@ -201,15 +202,16 @@ def highlight_min(x): fit_time = time.time() - start_time # %% -plt.semilogx(model[-1].cv_alphas_, model[-1].mse_path_, ":") +lasso = model[-1] +plt.semilogx(lasso.cv_alphas_, lasso.mse_path_, ":") plt.semilogx( - model[-1].cv_alphas_, - model[-1].mse_path_.mean(axis=-1), + lasso.cv_alphas_, + lasso.mse_path_.mean(axis=-1), color="black", label="Average across the folds", linewidth=2, ) -plt.axvline(model[-1].alpha_, linestyle="--", color="black", label="alpha CV") +plt.axvline(lasso.alpha_, linestyle="--", color="black", label="alpha CV") plt.ylim(ymin, ymax) plt.xlabel(r"$\alpha$") diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 302d1026c8ccc..1780cdf2ccf48 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -2098,10 +2098,10 @@ class LassoLarsIC(LassoLars): References ---------- - .. [1] `Zou, Hui, Trevor Hastie, and Robert Tibshirani. + .. [1] :arxiv:`Zou, Hui, Trevor Hastie, and Robert Tibshirani. "On the degrees of freedom of the lasso." The Annals of Statistics 35.5 (2007): 2173-2192. - `_ + <0712.0881>` .. [2] `Wikipedia entry on the Akaike information criterion `_ From e3474785c8c0e51b81e6f063b80529ab10f3b0d0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 23 Nov 2021 19:25:57 +0100 Subject: [PATCH 35/35] Update examples/linear_model/plot_lasso_model_selection.py Co-authored-by: Christian Lorentzen --- examples/linear_model/plot_lasso_model_selection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_lasso_model_selection.py b/examples/linear_model/plot_lasso_model_selection.py index 0bcabcea66a15..7cc05055b22d9 100644 --- a/examples/linear_model/plot_lasso_model_selection.py +++ b/examples/linear_model/plot_lasso_model_selection.py @@ -237,7 +237,7 @@ def highlight_min(x): # descent algorithm will only sample the path on a grid. # # Note how the optimal value of alpha varies for each fold. This illustrates -# why nested-cross validation is necessary when trying to evaluate the +# why nested-cross validation is a good strategy when trying to evaluate the # performance of a method for which a parameter is chosen by cross-validation: # this choice of parameter may not be optimal for a final evaluation on # unseen test set only.
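For reference, the plotting pattern that these example hunks refactor (binding `model[-1]` to `lasso` once before reusing it) looks roughly like this in isolation for the LassoCV case; the synthetic data and the omitted y-limits are assumptions, not the example's exact figure.

import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.linear_model import LassoCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_regression(n_samples=200, n_features=30, noise=4.0, random_state=0)
model = make_pipeline(StandardScaler(), LassoCV(cv=5)).fit(X, y)

lasso = model[-1]
# One dotted curve per fold, plus the average across folds in black.
plt.semilogx(lasso.alphas_, lasso.mse_path_, linestyle=":")
plt.plot(
    lasso.alphas_,
    lasso.mse_path_.mean(axis=-1),
    color="black",
    label="Average across the folds",
    linewidth=2,
)
plt.axvline(lasso.alpha_, linestyle="--", color="black", label="alpha: CV estimate")
plt.xlabel(r"$\alpha$")
plt.ylabel("Mean square error")
plt.legend()
plt.show()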