diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
index 04894a4d7a7e7..b68f1593e317e 100644
--- a/sklearn/metrics/_classification.py
+++ b/sklearn/metrics/_classification.py
@@ -3277,10 +3277,10 @@ def d2_log_loss_score(y_true, y_pred, *, sample_weight=None, labels=None):
     :math:`D^2` score function, fraction of log loss explained.
 
     Best possible score is 1.0 and it can be negative (because the model can be
-    arbitrarily worse). A model that always uses the empirical mean of `y_true` as
-    constant prediction, disregarding the input features, gets a D^2 score of 0.0.
+    arbitrarily worse). A model that always predicts the per-class proportions
+    of `y_true`, disregarding the input features, gets a D^2 score of 0.0.
 
-    Read more in the :ref:`User Guide <d2_score>`.
+    Read more in the :ref:`User Guide <d2_score_classification>`.
 
     .. versionadded:: 1.5
 
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index 40b762bfa7308..b87e76ba2fb42 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -3048,7 +3048,8 @@ def test_d2_log_loss_score():
 
 
 def test_d2_log_loss_score_raises():
-    """Test that d2_log_loss raises error on invalid input."""
+    """Test that d2_log_loss_score raises the appropriate errors on
+    invalid inputs."""
     y_true = [0, 1, 2]
     y_pred = [[0.2, 0.8], [0.5, 0.5], [0.4, 0.6]]
    err = "contain different number of classes"
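
Not part of the patch itself, but a quick sanity check of the reworded docstring claim: a baseline that always predicts the per-class proportions of `y_true` scores exactly 0.0, a better model scores above that, and a worse model goes negative. The snippet below is an illustrative sketch using only the public `d2_log_loss_score` API; the toy `y_true` and `y_pred_*` values are made up for this example.

import numpy as np
from sklearn.metrics import d2_log_loss_score

# Imbalanced toy labels: class 0 appears three times, class 1 once.
y_true = [0, 0, 0, 1]

# "Null" model: ignore the features and always predict the observed
# per-class proportions of y_true, i.e. P(0) = 0.75 and P(1) = 0.25.
proportions = np.bincount(y_true) / len(y_true)        # array([0.75, 0.25])
y_pred_null = np.tile(proportions, (len(y_true), 1))   # one identical row per sample

print(d2_log_loss_score(y_true, y_pred_null))    # 0.0 -- the baseline explains nothing

# A model that concentrates probability on the true class scores above 0 ...
y_pred_better = [[0.9, 0.1], [0.9, 0.1], [0.8, 0.2], [0.3, 0.7]]
print(d2_log_loss_score(y_true, y_pred_better))  # ~0.65, i.e. > 0.0

# ... while a model worse than that baseline can be arbitrarily negative.
y_pred_worse = [[0.1, 0.9], [0.1, 0.9], [0.1, 0.9], [0.9, 0.1]]
print(d2_log_loss_score(y_true, y_pred_worse))   # ~-3.1, i.e. < 0.0

This mirrors how the metric is defined: D^2 = 1 - log_loss(y_true, y_pred) / log_loss(y_true, y_null), where y_null is the per-class proportion prediction, so scoring y_null itself makes the ratio exactly 1 and the score exactly 0.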