
FIX scoring != None for RidgeCV should use unscaled y for evaluation #29842


Merged · 8 commits · Sep 18, 2024
9 changes: 9 additions & 0 deletions doc/whats_new/v1.6.rst
@@ -257,6 +257,15 @@ Changelog
for the calculation of test scores.
:pr:`29419` by :user:`Shruti Nath <snath-xoc>`.

- |Fix| :class:`linear_model.RidgeCV` now properly uses predictions on the same
scale as the target seen during `fit`. Those predictions are stored in
`cv_results_` when `scoring != None`. Previously, the predictions were left
scaled by the square root of the sample weights and offset by the mean of the
target, leading to an incorrect estimate of the score.
:pr:`29842` by :user:`Guillaume Lemaitre <glemaitre>`,
:user:`Jérôme Dockes <jeromedockes>` and
:user:`Hanmin Qin <qinhanmin2014>`.

- |API| Deprecates `copy_X` in :class:`linear_model.TheilSenRegressor` as the parameter
has no effect. `copy_X` will be removed in 1.8.
:pr:`29105` by :user:`Adam Li <adam2392>`.
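To make the rescaling described in the changelog entry concrete, here is a minimal numpy sketch (not part of the PR; `sqrt_sw` and `y_offset` mimic the internal quantities used by the GCV path, which appear in the `_ridge.py` diff below). The internal LOO predictions live on the preprocessed scale, so dividing by the square root of the sample weights and adding back the target offset recovers predictions comparable to the original `y`:

import numpy as np

rng = np.random.default_rng(0)
y = rng.normal(size=5)
sample_weight = rng.uniform(0.5, 2.0, size=5)
sqrt_sw = np.sqrt(sample_weight)
y_offset = np.average(y, weights=sample_weight)

# What the GCV path operates on: a centered, sqrt-weighted target.
scaled_y = (y - y_offset) * sqrt_sw
# Pretend the internal LOO predictions were perfect on that scale.
scaled_predictions = scaled_y.copy()

# The fix: undo the scaling before comparing against the original `y`.
predictions = scaled_predictions / sqrt_sw + y_offset
assert np.allclose(predictions, y)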
11 changes: 10 additions & 1 deletion sklearn/linear_model/_ridge.py
@@ -2129,6 +2129,7 @@ def fit(self, X, y, sample_weight=None, score_params=None):

self.alphas = np.asarray(self.alphas)

unscaled_y = y
X, y, X_offset, y_offset, X_scale = _preprocess_data(
X,
y,
@@ -2178,13 +2179,21 @@ def fit(self, X, y, sample_weight=None, score_params=None):
self.cv_results_[:, i] = squared_errors.ravel()
else:
predictions = y - (c / G_inverse_diag)
# Rescale predictions back to original scale
if sample_weight is not None:  # avoid the unnecessary division by ones
if predictions.ndim > 1:
predictions /= sqrt_sw[:, None]
else:
predictions /= sqrt_sw
predictions += y_offset

if self.store_cv_results:
self.cv_results_[:, i] = predictions.ravel()

score_params = score_params or {}
alpha_score = self._score(
predictions=predictions,
y=y,
y=unscaled_y,
n_y=n_y,
scorer=scorer,
score_params=score_params,
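For background on the line `predictions = y - (c / G_inverse_diag)` in the diff above: GCV exploits the leave-one-out shortcut for ridge regression, which yields every held-out prediction from a single fit. A minimal numpy sketch of that identity under simplifying assumptions (no intercept, no sample weights; an illustration, not the library's actual code path):

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 3))
y = rng.normal(size=20)
alpha = 1.0

# Hat matrix of ridge regression: H = X (X^T X + alpha I)^{-1} X^T.
H = X @ np.linalg.solve(X.T @ X + alpha * np.eye(X.shape[1]), X.T)
residuals = y - H @ y
# LOO shortcut: the held-out prediction for sample i is
# y_i - residual_i / (1 - H_ii), without refitting n models.
loo_predictions = y - residuals / (1.0 - np.diag(H))

# Cross-check by actually refitting without each sample.
for i in range(len(y)):
    mask = np.arange(len(y)) != i
    ridge = Ridge(alpha=alpha, fit_intercept=False).fit(X[mask], y[mask])
    assert np.isclose(loo_predictions[i], ridge.predict(X[i : i + 1])[0])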
50 changes: 49 additions & 1 deletion sklearn/linear_model/tests/test_ridge.py
@@ -859,7 +859,9 @@ def test_ridge_loo_cv_asym_scoring():
loo_ridge.fit(X, y)
gcv_ridge.fit(X, y)

assert gcv_ridge.alpha_ == pytest.approx(loo_ridge.alpha_)
assert gcv_ridge.alpha_ == pytest.approx(
loo_ridge.alpha_
), f"{gcv_ridge.alpha_=}, {loo_ridge.alpha_=}"
assert_allclose(gcv_ridge.coef_, loo_ridge.coef_, rtol=1e-3)
assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)

@@ -2252,6 +2254,52 @@ def test_ridge_cv_values_deprecated():
ridge.cv_values_


@pytest.mark.parametrize("with_sample_weight", [False, True])
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("n_targets", [1, 2])
def test_ridge_cv_results_predictions(with_sample_weight, fit_intercept, n_targets):
"""Check that the predictions stored in `cv_results_` are on the original scale.

The GCV approach works on scaled data: centered by an offset and scaled by the
square root of the sample weights. Thus, before computing scores, the
predictions need to be scaled back to the original scale. Those predictions are
the ones stored in `cv_results_` in `RidgeCV`.

In this test, we check that the internal predictions stored in `cv_results_`
are equivalent to those from a naive LOO-CV grid search with a `Ridge`
estimator.

Non-regression test for:
https://github.com/scikit-learn/scikit-learn/issues/13998
"""
X, y = make_regression(
n_samples=100, n_features=10, n_targets=n_targets, random_state=0
)
sample_weight = np.ones(shape=(X.shape[0],))
if with_sample_weight:
sample_weight[::2] = 0.5

alphas = (0.1, 1.0, 10.0)

# scoring should be set to store predictions and not the squared error
ridge_cv = RidgeCV(
alphas=alphas,
scoring="neg_mean_squared_error",
fit_intercept=fit_intercept,
store_cv_results=True,
)
ridge_cv.fit(X, y, sample_weight=sample_weight)

# manual grid-search with a `Ridge` estimator
predictions = np.empty(shape=(*y.shape, len(alphas)))
cv = LeaveOneOut()
for alpha_idx, alpha in enumerate(alphas):
for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
ridge = Ridge(alpha=alpha, fit_intercept=fit_intercept)
ridge.fit(X[train_idx], y[train_idx], sample_weight[train_idx])
predictions[idx, ..., alpha_idx] = ridge.predict(X[test_idx])
assert_allclose(ridge_cv.cv_results_, predictions)


def test_ridge_cv_multioutput_sample_weight(global_random_seed):
"""Check that `RidgeCV` works properly with multioutput and sample_weight
when `scoring != None`.
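Finally, a usage sketch (not part of the PR) showing what the fix means for users: with `scoring != None`, each column of `cv_results_` now holds LOO predictions on the original scale of `y`, so it can be scored directly against the untouched target:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error

X, y = make_regression(n_samples=100, n_features=10, random_state=0)
alphas = (0.1, 1.0, 10.0)
ridge_cv = RidgeCV(
    alphas=alphas,
    scoring="neg_mean_squared_error",
    store_cv_results=True,
).fit(X, y)

# For a single target, `cv_results_` has shape (n_samples, n_alphas) and,
# when `scoring != None`, stores LOO predictions rather than squared errors.
for i, alpha in enumerate(alphas):
    print(alpha, mean_squared_error(y, ridge_cv.cv_results_[:, i]))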