diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py
index 9e1dd7f22085d..a1d86eb797444 100644
--- a/sklearn/linear_model/_ridge.py
+++ b/sklearn/linear_model/_ridge.py
@@ -1054,6 +1054,16 @@ def _matmat(self, v):
         return res
 
 
+class _IdentityEstimator:
+    """Hack to call a scorer when we already have the predictions."""
+
+    def decision_function(self, y_predict):
+        return y_predict
+
+    def predict(self, y_predict):
+        return y_predict
+
+
 class _RidgeGCV(LinearModel):
     """Ridge regression with built-in Generalized Cross-Validation
 
@@ -1087,6 +1097,10 @@ class _RidgeGCV(LinearModel):
 
     looe = y - loov = c / diag(G^-1)
 
+    The best score (negative mean squared error or user-provided scoring) is
+    stored in the `best_score_` attribute, and the selected hyperparameter in
+    `alpha_`.
+
     References
     ----------
     http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf
@@ -1460,45 +1474,59 @@ def fit(self, X, y, sample_weight=None):
             X, y = _rescale_data(X, y, sample_weight)
             sqrt_sw = np.sqrt(sample_weight)
         else:
-            sqrt_sw = np.ones(X.shape[0], dtype=X.dtype)
+            sample_weight = sqrt_sw = np.ones(X.shape[0], dtype=X.dtype)
+
+        X_mean, *decomposition = decompose(X, y, sqrt_sw)
 
         scorer = check_scoring(self, scoring=self.scoring, allow_none=True)
         error = scorer is None
 
         n_y = 1 if len(y.shape) == 1 else y.shape[1]
-        cv_values = np.zeros((n_samples * n_y, len(self.alphas)),
-                             dtype=X.dtype)
-        C = []
-        X_mean, *decomposition = decompose(X, y, sqrt_sw)
+
+        if self.store_cv_values:
+            self.cv_values_ = np.empty(
+                (n_samples * n_y, len(self.alphas)), dtype=X.dtype)
+
+        best_coef, best_score, best_alpha = None, None, None
+
         for i, alpha in enumerate(self.alphas):
             G_inverse_diag, c = solve(
                 float(alpha), y, sqrt_sw, X_mean, *decomposition)
             if error:
                 squared_errors = (c / G_inverse_diag) ** 2
-                cv_values[:, i] = squared_errors.ravel()
+                # undo the sample_weight rescaling so that the squared
+                # errors are expressed in the original (unweighted)
+                # space before computing the score
+                if y.ndim == 2:
+                    squared_errors /= sample_weight[:, np.newaxis]
+                else:
+                    squared_errors /= sample_weight
+                # consistent with default multioutput of mean_squared_error
+                alpha_score = -squared_errors.mean()
+                if self.store_cv_values:
+                    self.cv_values_[:, i] = squared_errors.ravel()
             else:
                 predictions = y - (c / G_inverse_diag)
-                cv_values[:, i] = predictions.ravel()
-            C.append(c)
-
-        if error:
-            best = cv_values.mean(axis=0).argmin()
-        else:
-            # The scorer want an object that will make the predictions but
-            # they are already computed efficiently by _RidgeGCV. This
-            # identity_estimator will just return them
-            def identity_estimator():
-                pass
-            identity_estimator.decision_function = lambda y_predict: y_predict
-            identity_estimator.predict = lambda y_predict: y_predict
-
-            # signature of scorer is (estimator, X, y)
-            out = [scorer(identity_estimator, cv_values[:, i], y.ravel())
-                   for i in range(len(self.alphas))]
-            best = np.argmax(out)
-
-        self.alpha_ = self.alphas[best]
-        self.dual_coef_ = C[best]
+                # undo the sample_weight rescaling and re-add the
+                # target offset so that the scorer compares predictions
+                # and targets in the original space
+                if y.ndim == 2:
+                    y_true = y / sqrt_sw[:, np.newaxis] + y_offset
+                    y_pred = predictions / sqrt_sw[:, np.newaxis] + y_offset
+                else:
+                    y_true = y / sqrt_sw + y_offset
+                    y_pred = predictions / sqrt_sw + y_offset
+                # let the underlying scorer handle multioutput
+                alpha_score = scorer(
+                    _IdentityEstimator(), y_pred, y_true)
+                if self.store_cv_values:
+                    self.cv_values_[:, i] = y_pred.ravel()
+            if (best_score is None) or (alpha_score > best_score):
+                best_coef, best_score, best_alpha = c, alpha_score, alpha
+
+        self.alpha_ = best_alpha
+        self.best_score_ = best_score
+        self.dual_coef_ = best_coef
         self.coef_ = safe_sparse_dot(self.dual_coef_.T, X)
 
         X_offset += X_mean * X_scale
@@ -1509,7 +1537,7 @@ def identity_estimator():
                 cv_values_shape = n_samples, len(self.alphas)
             else:
                 cv_values_shape = n_samples, n_y, len(self.alphas)
-            self.cv_values_ = cv_values.reshape(cv_values_shape)
+            self.cv_values_ = self.cv_values_.reshape(cv_values_shape)
 
         return self
 
@@ -1565,6 +1593,7 @@ def fit(self, X, y, sample_weight=None):
                                   store_cv_values=self.store_cv_values)
             estimator.fit(X, y, sample_weight=sample_weight)
             self.alpha_ = estimator.alpha_
+            self.best_score_ = estimator.best_score_
             if self.store_cv_values:
                 self.cv_values_ = estimator.cv_values_
         else:
@@ -1580,6 +1609,7 @@ def fit(self, X, y, sample_weight=None):
             gs.fit(X, y, sample_weight=sample_weight)
             estimator = gs.best_estimator_
             self.alpha_ = gs.best_estimator_.alpha
+            self.best_score_ = gs.best_score_
 
         self.coef_ = estimator.coef_
         self.intercept_ = estimator.intercept_
diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py
index c786b154fcb85..43252103e27b6 100644
--- a/sklearn/linear_model/tests/test_ridge.py
+++ b/sklearn/linear_model/tests/test_ridge.py
@@ -21,6 +21,7 @@
 from sklearn.metrics import mean_squared_error
 from sklearn.metrics import make_scorer
 from sklearn.metrics import get_scorer
+from sklearn.metrics import r2_score
 
 from sklearn.linear_model import LinearRegression
 from sklearn.linear_model import ridge_regression
@@ -36,7 +37,9 @@
 from sklearn.datasets import make_regression
 
 from sklearn.model_selection import GridSearchCV
-from sklearn.model_selection import KFold, GroupKFold, cross_val_predict
+from sklearn.model_selection import KFold
+from sklearn.model_selection import LeaveOneOut
+from sklearn.model_selection import cross_val_predict
 
 from sklearn.utils import check_random_state
 from sklearn.datasets import make_multilabel_classification
@@ -493,66 +496,6 @@ def test_ridge_loo_cv_asym_scoring():
     assert_allclose(gcv_ridge.intercept_, loo_ridge.intercept_, rtol=1e-3)
 
 
-@pytest.mark.parametrize('gcv_mode', ['svd', 'eigen'])
-@pytest.mark.parametrize('X_constructor', [np.asarray, sp.csr_matrix])
-@pytest.mark.parametrize('n_features', [8, 20])
-@pytest.mark.parametrize('y_shape, fit_intercept, noise',
-                         [((11,), True, 1.),
-                          ((11, 1), True, 20.),
-                          ((11, 3), True, 150.),
-                          ((11, 3), False, 30.)])
-def test_ridge_gcv_sample_weights(
-        gcv_mode, X_constructor, fit_intercept, n_features, y_shape, noise):
-    alphas = [1e-3, .1, 1., 10., 1e3]
-    rng = np.random.RandomState(0)
-    n_targets = y_shape[-1] if len(y_shape) == 2 else 1
-    X, y = _make_sparse_offset_regression(
-        n_samples=11, n_features=n_features, n_targets=n_targets,
-        random_state=0, shuffle=False, noise=noise)
-    y = y.reshape(y_shape)
-
-    sample_weight = 3 * rng.randn(len(X))
-    sample_weight = (sample_weight - sample_weight.min() + 1).astype(int)
-    indices = np.repeat(np.arange(X.shape[0]), sample_weight)
-    sample_weight = sample_weight.astype(float)
-    X_tiled, y_tiled = X[indices], y[indices]
-
-    cv = GroupKFold(n_splits=X.shape[0])
-    splits = cv.split(X_tiled, y_tiled, groups=indices)
-    kfold = RidgeCV(
-        alphas=alphas, cv=splits, scoring='neg_mean_squared_error',
-        fit_intercept=fit_intercept)
-    # ignore warning from GridSearchCV: FutureWarning: The default
-    # of the `iid` parameter will change from True to False in version 0.22
-    # and will be removed in 0.24
-    with ignore_warnings(category=FutureWarning):
-        kfold.fit(X_tiled, y_tiled)
-
-    ridge_reg = Ridge(alpha=kfold.alpha_, fit_intercept=fit_intercept)
-    splits = cv.split(X_tiled, y_tiled, groups=indices)
-    predictions = cross_val_predict(ridge_reg, X_tiled, y_tiled, cv=splits)
-    kfold_errors = (y_tiled - predictions)**2
-    kfold_errors = [
-        np.sum(kfold_errors[indices == i], axis=0) for
-        i in np.arange(X.shape[0])]
-    kfold_errors = np.asarray(kfold_errors)
-
-    X_gcv = X_constructor(X)
-    gcv_ridge = RidgeCV(
-        alphas=alphas, store_cv_values=True,
-        gcv_mode=gcv_mode, fit_intercept=fit_intercept)
-    gcv_ridge.fit(X_gcv, y, sample_weight=sample_weight)
-    if len(y_shape) == 2:
-        gcv_errors = gcv_ridge.cv_values_[:, :, alphas.index(kfold.alpha_)]
-    else:
-        gcv_errors = gcv_ridge.cv_values_[:, alphas.index(kfold.alpha_)]
-
-    assert kfold.alpha_ == pytest.approx(gcv_ridge.alpha_)
-    assert_allclose(gcv_errors, kfold_errors, rtol=1e-3)
-    assert_allclose(gcv_ridge.coef_, kfold.coef_, rtol=1e-3)
-    assert_allclose(gcv_ridge.intercept_, kfold.intercept_, rtol=1e-3)
-
-
 @pytest.mark.parametrize('mode', [True, 1, 5, 'bad', 'gcv'])
 def test_check_gcv_mode_error(mode):
     X, y = make_regression(n_samples=5, n_features=2)
@@ -1247,3 +1190,101 @@ def test_ridge_sag_with_X_fortran():
     X = X[::2, :]
     y = y[::2]
     Ridge(solver='sag').fit(X, y)
+
+
+@pytest.mark.parametrize("fit_intercept", [True, False])
+@pytest.mark.parametrize("use_sample_weight", [True, False])
+@pytest.mark.parametrize("multioutput", [True, False])
+def test_ridgecv_default_scorer_consistency(fit_intercept, use_sample_weight,
+                                            multioutput):
+    if multioutput:
+        X, y = make_regression(n_samples=10, n_features=5,
+                               n_targets=3, random_state=0)
+    else:
+        X, y = make_regression(n_samples=10, n_features=5, random_state=0)
+    if use_sample_weight:
+        rng = np.random.RandomState(0)
+        sample_weight = rng.rand(X.shape[0])
+    else:
+        sample_weight = None
+    alphas = [0.1, 1, 10]
+
+    clf1 = RidgeCV(
+        fit_intercept=fit_intercept, scoring=None,
+        store_cv_values=True, alphas=alphas
+    )
+    clf1.fit(X, y, sample_weight=sample_weight)
+
+    # check consistency between RidgeCV(scoring=None) and
+    # RidgeCV(scoring="neg_mean_squared_error")
+    clf2 = RidgeCV(
+        fit_intercept=fit_intercept, scoring="neg_mean_squared_error",
+        store_cv_values=True, alphas=alphas
+    )
+    clf2.fit(X, y, sample_weight=sample_weight)
+    assert clf1.alpha_ == pytest.approx(clf2.alpha_)
+    assert clf1.best_score_ == pytest.approx(clf2.best_score_)
+    assert_array_almost_equal(clf1.coef_, clf2.coef_)
+    assert_array_almost_equal(clf1.intercept_, clf2.intercept_)
+    if multioutput:
+        cv_results_1 = clf1.cv_values_[:, :, alphas.index(clf1.alpha_)]
+        cv_results_2 = clf2.cv_values_[:, :, alphas.index(clf2.alpha_)]
+    else:
+        cv_results_1 = clf1.cv_values_[:, alphas.index(clf1.alpha_)]
+        cv_results_2 = clf2.cv_values_[:, alphas.index(clf2.alpha_)]
+    assert_array_almost_equal(cv_results_1, (y - cv_results_2) ** 2)
+    assert (clf1.best_score_ ==
+            pytest.approx(-mean_squared_error(y, cv_results_2)))
+
+    # check consistency between RidgeCV and GridSearchCV;
+    # this holds for this specific scorer
+    clf2 = GridSearchCV(Ridge(fit_intercept=fit_intercept), {"alpha": alphas},
+                        scoring="neg_mean_squared_error", cv=LeaveOneOut())
+    clf2.fit(X, y, sample_weight=sample_weight)
+    assert clf1.alpha_ == pytest.approx(clf2.best_params_["alpha"])
+    assert clf1.best_score_ == pytest.approx(clf2.best_score_)
+    assert_array_almost_equal(clf1.coef_, clf2.best_estimator_.coef_)
+    assert_array_almost_equal(clf1.intercept_, clf2.best_estimator_.intercept_)
+
+    # check consistency between RidgeCV and cross_val_predict;
+    # this holds for an arbitrary scorer
+    ridge = Ridge(alpha=clf1.alpha_, fit_intercept=fit_intercept)
+    loo_pred = cross_val_predict(ridge, X, y, cv=LeaveOneOut(),
+                                 fit_params={"sample_weight": sample_weight})
+    assert_array_almost_equal(loo_pred, cv_results_2)
+
+
+@pytest.mark.parametrize("fit_intercept", [True, False])
+@pytest.mark.parametrize("use_sample_weight", [True, False])
+@pytest.mark.parametrize("multioutput", [True, False])
+def test_ridgecv_custom_scorer_consistency(fit_intercept, use_sample_weight,
+                                           multioutput):
+    if multioutput:
+        X, y = make_regression(n_samples=10, n_features=5,
+                               n_targets=3, random_state=0)
+    else:
+        X, y = make_regression(n_samples=10, n_features=5, random_state=0)
+    if use_sample_weight:
+        rng = np.random.RandomState(0)
+        sample_weight = rng.rand(X.shape[0])
+    else:
+        sample_weight = None
+
+    alphas = [0.1, 1, 10]
+    clf = RidgeCV(
+        fit_intercept=fit_intercept, scoring="r2",
+        store_cv_values=True, alphas=alphas
+    )
+    clf.fit(X, y, sample_weight=sample_weight)
+    if multioutput:
+        cv_results = clf.cv_values_[:, :, alphas.index(clf.alpha_)]
+    else:
+        cv_results = clf.cv_values_[:, alphas.index(clf.alpha_)]
+    assert clf.best_score_ == pytest.approx(r2_score(y, cv_results))
+
+    # check consistency between RidgeCV and cross_val_predict;
+    # this holds for an arbitrary scorer
+    ridge = Ridge(alpha=clf.alpha_, fit_intercept=fit_intercept)
+    loo_pred = cross_val_predict(ridge, X, y, cv=LeaveOneOut(),
+                                 fit_params={"sample_weight": sample_weight})
+    assert_array_almost_equal(loo_pred, cv_results)
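For reviewers, a minimal usage sketch of the behavior this patch introduces (illustrative only, not part of the diff; it assumes the patch is applied, and the dataset and alpha grid are arbitrary). It mirrors the assertion in `test_ridgecv_custom_scorer_consistency`: with a user-provided scorer, `cv_values_` holds the leave-one-out predictions in the original space, and the new `best_score_` attribute is the scorer evaluated on the predictions of the selected alpha.

```python
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score

X, y = make_regression(n_samples=20, n_features=5, random_state=0)
alphas = [0.1, 1.0, 10.0]
reg = RidgeCV(alphas=alphas, scoring="r2", store_cv_values=True)
reg.fit(X, y)

# With a user-provided scoring, cv_values_ stores the leave-one-out
# predictions (converted back to the original space), so best_score_
# equals the scorer applied to the predictions of the selected alpha.
loo_pred = reg.cv_values_[:, alphas.index(reg.alpha_)]
assert np.isclose(reg.best_score_, r2_score(y, loo_pred))
print("alpha_:", reg.alpha_, "best_score_:", reg.best_score_)
```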