From fc0f0e75f94379b074f64447731825a7d3c928df Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 25 Feb 2023 12:45:23 +0100 Subject: [PATCH 01/20] WIP to be continued --- sklearn/linear_model/_glm/glm.py | 29 ++++++----- sklearn/linear_model/_linear_loss.py | 43 +++++++++++------ sklearn/linear_model/_logistic.py | 53 ++++++++++----------- sklearn/linear_model/tests/test_logistic.py | 52 +++++++++++++++++--- 4 files changed, 115 insertions(+), 62 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 3dc0bbdc66bff..4cac889a4da51 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -207,10 +207,10 @@ def fit(self, X, y, sample_weight=None): loss_dtype = min(max(y.dtype, X.dtype), np.float64) y = check_array(y, dtype=loss_dtype, order="C", ensure_2d=False) - # TODO: We could support samples_weight=None as the losses support it. - # Note that _check_sample_weight calls check_array(order="C") required by - # losses. - sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype) + if sample_weight is not None: + # Note that _check_sample_weight calls check_array(order="C") required by + # losses. + sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype) n_samples, n_features = X.shape self._base_loss = self._get_loss() @@ -228,17 +228,20 @@ def fit(self, X, y, sample_weight=None): # TODO: if alpha=0 check that X is not rank deficient - # IMPORTANT NOTE: Rescaling of sample_weight: + # NOTE: Rescaling of sample_weight: # We want to minimize - # obj = 1/(2*sum(sample_weight)) * sum(sample_weight * deviance) + # obj = 1/(2 * sum(sample_weight)) * sum(sample_weight * deviance) # + 1/2 * alpha * L2, # with # deviance = 2 * loss. # The objective is invariant to multiplying sample_weight by a constant. We - # choose this constant such that sum(sample_weight) = 1. Thus, we end up with + # could choose this constant such that sum(sample_weight) = 1 in order to end + # up with # obj = sum(sample_weight * loss) + 1/2 * alpha * L2. - # Note that LinearModelLoss.loss() computes sum(sample_weight * loss). - sample_weight = sample_weight / sample_weight.sum() + # But LinearModelLoss.loss() already computes + # average(loss, weights=sample_weight) + # Thus, without rescaling, we have + # obj = LinearModelLoss.loss(...) if self.warm_start and hasattr(self, "coef_"): if self.fit_intercept: @@ -415,10 +418,10 @@ def score(self, X, y, sample_weight=None): f" {base_loss.__name__}." ) - # Note that constant_to_optimal_zero is already multiplied by sample_weight. - constant = np.mean(base_loss.constant_to_optimal_zero(y_true=y)) - if sample_weight is not None: - constant *= sample_weight.shape[0] / np.sum(sample_weight) + constant = np.average( + base_loss.constant_to_optimal_zero(y_true=y, sample_weight=None), + weights=sample_weight, + ) # Missing factor of 2 in deviance cancels out. deviance = base_loss( diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index 92a203abc87ab..4255706e284f1 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -12,18 +12,19 @@ class LinearModelLoss: Note that raw_prediction is also known as linear predictor. 
- The loss is the sum of per sample losses and includes a term for L2 + The loss is the average of per sample losses and includes a term for L2 regularization:: - loss = sum_i s_i loss(y_i, X_i @ coef + intercept) + loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept) + 1/2 * l2_reg_strength * ||coef||_2^2 - with sample weights s_i=1 if sample_weight=None. + with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i. Gradient and hessian, for simplicity without intercept, are:: - gradient = X.T @ loss.gradient + l2_reg_strength * coef - hessian = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity + gradient = 1 / s_sum * X.T @ loss.gradient + l2_reg_strength * coef + hessian = 1 / s_sum * X.T @ diag(loss.hessian) @ X + + l2_reg_strength * identity Conventions: if fit_intercept: @@ -182,7 +183,7 @@ def loss( n_threads=1, raw_prediction=None, ): - """Compute the loss as sum over point-wise losses. + """Compute the loss as weighted average over point-wise losses. Parameters ---------- @@ -209,7 +210,7 @@ def loss( Returns ------- loss : float - Sum of losses per sample plus penalty. + Weighted average of losses per sample, plus penalty. """ if raw_prediction is None: weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) @@ -219,10 +220,10 @@ def loss( loss = self.base_loss.loss( y_true=y, raw_prediction=raw_prediction, - sample_weight=sample_weight, + sample_weight=None, n_threads=n_threads, ) - loss = loss.sum() + loss = np.average(loss, weights=sample_weight) return loss + self.l2_penalty(weights, l2_reg_strength) @@ -263,12 +264,12 @@ def loss_gradient( Returns ------- loss : float - Sum of losses per sample plus penalty. + Weighted average of losses per sample, plus penalty. gradient : ndarray of shape coef.shape The gradient of the loss. """ - n_features, n_classes = X.shape[1], self.base_loss.n_classes + (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes n_dof = n_features + int(self.fit_intercept) if raw_prediction is None: @@ -282,9 +283,12 @@ def loss_gradient( sample_weight=sample_weight, n_threads=n_threads, ) - loss = loss.sum() + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + loss = loss.sum() / sw_sum loss += self.l2_penalty(weights, l2_reg_strength) + grad_pointwise /= sw_sum + if not self.base_loss.is_multiclass: grad = np.empty_like(coef, dtype=weights.dtype) grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights @@ -340,7 +344,7 @@ def gradient( gradient : ndarray of shape coef.shape The gradient of the loss. """ - n_features, n_classes = X.shape[1], self.base_loss.n_classes + (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes n_dof = n_features + int(self.fit_intercept) if raw_prediction is None: @@ -354,6 +358,8 @@ def gradient( sample_weight=sample_weight, n_threads=n_threads, ) + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + grad_pointwise /= sw_sum if not self.base_loss.is_multiclass: grad = np.empty_like(coef, dtype=weights.dtype) @@ -439,6 +445,9 @@ def gradient_hessian( sample_weight=sample_weight, n_threads=n_threads, ) + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + grad_pointwise /= sw_sum + hess_pointwise /= sw_sum # For non-canonical link functions and far away from the optimum, the pointwise # hessian can be negative. 
We take care that 75% of the hessian entries are @@ -543,6 +552,7 @@ def gradient_hessian_product( (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes n_dof = n_features + int(self.fit_intercept) weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) if not self.base_loss.is_multiclass: grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian( @@ -551,6 +561,8 @@ def gradient_hessian_product( sample_weight=sample_weight, n_threads=n_threads, ) + grad_pointwise /= sw_sum + hess_pointwise /= sw_sum grad = np.empty_like(coef, dtype=weights.dtype) grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights if self.fit_intercept: @@ -603,6 +615,7 @@ def hessp(s): sample_weight=sample_weight, n_threads=n_threads, ) + grad_pointwise /= sw_sum grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights if self.fit_intercept: @@ -644,9 +657,9 @@ def hessp(s): # hess_prod = empty_like(grad), but we ravel grad below and this # function is run after that. hess_prod = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") - hess_prod[:, :n_features] = tmp.T @ X + l2_reg_strength * s + hess_prod[:, :n_features] = (tmp.T @ X) / sw_sum + l2_reg_strength * s if self.fit_intercept: - hess_prod[:, -1] = tmp.sum(axis=0) + hess_prod[:, -1] = tmp.sum(axis=0) / sw_sum if coef.ndim == 1: return hess_prod.ravel(order="F") else: diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 6bdc4b7368ef0..11c08c115ba2d 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -292,33 +292,27 @@ def _logistic_regression_path( # np.unique(y) gives labels in sorted order. pos_class = classes[1] - # If sample weights exist, convert them to array (support for lists) - # and check length - # Otherwise set them to 1 for all examples - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True) - - if solver == "newton-cholesky": - # IMPORTANT NOTE: Rescaling of sample_weight: - # Same as in _GeneralizedLinearRegressor.fit(). - # We want to minimize - # obj = 1/(2*sum(sample_weight)) * sum(sample_weight * deviance) - # + 1/2 * alpha * L2, - # with - # deviance = 2 * log_loss. - # The objective is invariant to multiplying sample_weight by a constant. We - # choose this constant such that sum(sample_weight) = 1. Thus, we end up with - # obj = sum(sample_weight * loss) + 1/2 * alpha * L2. - # Note that LinearModelLoss.loss() computes sum(sample_weight * loss). - # - # This rescaling has to be done before multiplying by class_weights. - sw_sum = sample_weight.sum() # needed to rescale penalty, nasty matter! - sample_weight = sample_weight / sw_sum + if sample_weight is not None or class_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True) + # IMPORTANT NOTE: + # All solvers relying on LinearModelLoss need to scale the penalty with n_samples + # or the sum of sample weights as the here implemented logistic regression + # objective is (unfortunately) + # C * sum(pointwise_loss) + penalty + # instead of (as LinearModelLoss does) + # mean(pointwise_loss) + 1/C * penalty + if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + # This needs to be calculated before sample_weight is multiplied by + # class_weight. 
+ sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) # If class_weights is a dict (provided by the user), the weights # are assigned to the original labels. If it is "balanced", then # the class_weights are assigned after masking the labels with a OvR. le = LabelEncoder() - if isinstance(class_weight, dict) or multi_class == "multinomial": + if isinstance(class_weight, dict) or ( + multi_class == "multinomial" and class_weight is not None + ): class_weight_ = compute_class_weight(class_weight, classes=classes, y=y) sample_weight *= class_weight_[le.fit_transform(y)] @@ -445,7 +439,7 @@ def _logistic_regression_path( n_iter = np.zeros(len(Cs), dtype=np.int32) for i, C in enumerate(Cs): if solver == "lbfgs": - l2_reg_strength = 1.0 / C + l2_reg_strength = 1.0 / (C * sw_sum) iprint = [-1, 50, 1, 100, 101][ np.searchsorted(np.array([0, 1, 2, 3]), verbose) ] @@ -455,7 +449,12 @@ def _logistic_regression_path( method="L-BFGS-B", jac=True, args=(X, target, sample_weight, l2_reg_strength, n_threads), - options={"iprint": iprint, "gtol": tol, "maxiter": max_iter}, + options={ + "iprint": iprint, + "gtol": tol, + "maxiter": max_iter, + "ftol": 64 * np.finfo(float).eps, + }, ) n_iter_i = _check_optimize_result( solver, @@ -465,15 +464,13 @@ def _logistic_regression_path( ) w0, loss = opt_res.x, opt_res.fun elif solver == "newton-cg": - l2_reg_strength = 1.0 / C + l2_reg_strength = 1.0 / (C * sw_sum) args = (X, target, sample_weight, l2_reg_strength, n_threads) w0, n_iter_i = _newton_cg( hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol ) elif solver == "newton-cholesky": - # The division by sw_sum is a consequence of the rescaling of - # sample_weight, see comment above. - l2_reg_strength = 1.0 / C / sw_sum + l2_reg_strength = 1.0 / (C * sw_sum) sol = NewtonCholeskySolver( coef=w0, linear_loss=loss, diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 85f5c2d52b745..9dc4ce691fb61 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -702,14 +702,17 @@ def test_logistic_regression_solvers_multiclass(): } for solver_1, solver_2 in itertools.combinations(regressors, r=2): - assert_array_almost_equal( - regressors[solver_1].coef_, regressors[solver_2].coef_, decimal=4 + assert_allclose( + regressors[solver_1].coef_, + regressors[solver_2].coef_, + rtol=5e-3 if solver_2 == "saga" else 1e-3, + err_msg=f"{solver_1} vs {solver_2}", ) @pytest.mark.parametrize("weight", [{0: 0.1, 1: 0.2}, {0: 0.1, 1: 0.2, 2: 0.5}]) @pytest.mark.parametrize("class_weight", ["weight", "balanced"]) -def test_logistic_regressioncv_class_weights(weight, class_weight): +def test_logistic_regressioncv_class_weights(weight, class_weight, global_random_seed): """Test class_weight for LogisticRegressionCV.""" n_classes = len(weight) if class_weight == "weight": @@ -722,23 +725,60 @@ def test_logistic_regressioncv_class_weights(weight, class_weight): n_informative=3, n_redundant=0, n_classes=n_classes, - random_state=0, + random_state=global_random_seed, ) params = dict( Cs=1, fit_intercept=False, multi_class="ovr", class_weight=class_weight, + tol=1e-8, ) clf_lbfgs = LogisticRegressionCV(solver="lbfgs", **params) clf_lbfgs.fit(X, y) + from sklearn.linear_model._linear_loss import LinearModelLoss + from sklearn._loss.loss import HalfMultinomialLoss, HalfBinomialLoss + + if n_classes > 2: + loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=n_classes), + fit_intercept=False, + ) + 
else: + loss = LinearModelLoss( + base_loss=HalfBinomialLoss(), + fit_intercept=False, + ) + l_lbfgs = loss.loss( + coef=clf_lbfgs.coef_.squeeze(), + X=X, + y=LabelEncoder().fit_transform(y).astype(float), + sample_weight=None, + l2_reg_strength=1 / 20, + ) + print(f"loss lbfgs = {l_lbfgs} C_={clf_lbfgs.C_}") + for solver in set(SOLVERS) - set(["lbfgs"]): clf = LogisticRegressionCV(solver=solver, **params) if solver in ("sag", "saga"): - clf.set_params(tol=1e-5, max_iter=10000, random_state=0) + clf.set_params( + tol=1e-18, max_iter=10000, random_state=global_random_seed + 1 + ) clf.fit(X, y) - assert_allclose(clf.coef_, clf_lbfgs.coef_, rtol=1e-3) + + l_solver = loss.loss( + coef=clf.coef_.squeeze(), + X=X, + y=LabelEncoder().fit_transform(y).astype(float), + sample_weight=None, + l2_reg_strength=1 / 20, + ) + print(f"loss {solver} = {l_solver} C_={clf.C_}") + + assert_allclose( + clf.coef_, clf_lbfgs.coef_, rtol=1e-3, err_msg=f"{solver} vs lbfgs" + ) def test_logistic_regression_sample_weights(): From e1c21282663748db2f3d5be2a766224fad59761c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 15 Sep 2023 20:24:35 +0200 Subject: [PATCH 02/20] ENH improve line search of newton_cg for tiny loss improvements --- sklearn/utils/optimize.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index 68a1ae1dddb98..97d7ddf1169b1 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -16,6 +16,7 @@ import warnings import numpy as np +import scipy from ..exceptions import ConvergenceWarning from .fixes import line_search_wolfe1, line_search_wolfe2 @@ -39,6 +40,32 @@ def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwarg """ ret = line_search_wolfe1(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs) + if ret[0] is None: + # Have a look at the line_search method of our NewtonSolver class. We borrow + # the logic from there + # Deal with relative loss differences around machine precision. + args = kwargs.get("args", tuple()) + fval = f(xk + pk, *args) + eps = 16 * np.finfo(np.asarray(old_fval).dtype).eps + tiny_loss = np.abs(old_fval * eps) + loss_improvement = fval - old_fval + check = np.abs(loss_improvement) <= tiny_loss + if check: + # 2.1 Check sum of absolute gradients as alternative condition. + sum_abs_grad_old = scipy.linalg.norm(gfk, ord=1) + grad = fprime(xk + pk, *args) + sum_abs_grad = scipy.linalg.norm(grad, ord=1) + check = sum_abs_grad < sum_abs_grad_old + if check: + ret = ( + 1.0, # step size + ret[1] + 1, # number of function evaluations + ret[2] + 1, # number of gradient evaluations + fval, + old_fval, + grad, + ) + if ret[0] is None: # line search failed: try different one. 
ret = line_search_wolfe2( From ead068f0c6867fc9f0550147f52c727e8a7a35ae Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 15 Sep 2023 20:25:55 +0200 Subject: [PATCH 03/20] ENH sample weight rescaling after class weights --- sklearn/linear_model/_logistic.py | 24 ++++++----- sklearn/linear_model/tests/test_logistic.py | 45 ++++----------------- 2 files changed, 20 insertions(+), 49 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index cc2cd25a72156..c0badd03c078e 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -306,17 +306,6 @@ def _logistic_regression_path( if sample_weight is not None or class_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True) - # IMPORTANT NOTE: - # All solvers relying on LinearModelLoss need to scale the penalty with n_samples - # or the sum of sample weights as the here implemented logistic regression - # objective is (unfortunately) - # C * sum(pointwise_loss) + penalty - # instead of (as LinearModelLoss does) - # mean(pointwise_loss) + 1/C * penalty - if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: - # This needs to be calculated before sample_weight is multiplied by - # class_weight. - sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) # If class_weights is a dict (provided by the user), the weights # are assigned to the original labels. If it is "balanced", then @@ -369,6 +358,19 @@ def _logistic_regression_path( (classes.size, n_features + int(fit_intercept)), order="F", dtype=X.dtype ) + # IMPORTANT NOTE: + # All solvers relying on LinearModelLoss need to scale the penalty with n_samples + # or the sum of sample weights because the implemented logistic regression + # objective here is (unfortunately) + # C * sum(pointwise_loss) + penalty + # instead of (as LinearModelLoss does) + # mean(pointwise_loss) + 1/C * penalty + if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + # This needs to be calculated after sample_weight is multiplied by + # class_weight. It is even tested that passing class_weight is equivalent to + # passing sample_weights according to class_weight. 
+ sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + if coef is not None: # it must work both giving the bias term and not if multi_class == "ovr": diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index dc7bba2518b20..8cdab6f041307 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -754,28 +754,6 @@ def test_logistic_regressioncv_class_weights(weight, class_weight, global_random clf_lbfgs = LogisticRegressionCV(solver="lbfgs", **params) clf_lbfgs.fit(X, y) - from sklearn.linear_model._linear_loss import LinearModelLoss - from sklearn._loss.loss import HalfMultinomialLoss, HalfBinomialLoss - - if n_classes > 2: - loss = LinearModelLoss( - base_loss=HalfMultinomialLoss(n_classes=n_classes), - fit_intercept=False, - ) - else: - loss = LinearModelLoss( - base_loss=HalfBinomialLoss(), - fit_intercept=False, - ) - l_lbfgs = loss.loss( - coef=clf_lbfgs.coef_.squeeze(), - X=X, - y=LabelEncoder().fit_transform(y).astype(float), - sample_weight=None, - l2_reg_strength=1 / 20, - ) - print(f"loss lbfgs = {l_lbfgs} C_={clf_lbfgs.C_}") - for solver in set(SOLVERS) - set(["lbfgs"]): clf = LogisticRegressionCV(solver=solver, **params) if solver in ("sag", "saga"): @@ -784,15 +762,6 @@ def test_logistic_regressioncv_class_weights(weight, class_weight, global_random ) clf.fit(X, y) - l_solver = loss.loss( - coef=clf.coef_.squeeze(), - X=X, - y=LabelEncoder().fit_transform(y).astype(float), - sample_weight=None, - l2_reg_strength=1 / 20, - ) - print(f"loss {solver} = {l_solver} C_={clf.C_}") - assert_allclose( clf.coef_, clf_lbfgs.coef_, rtol=1e-3, err_msg=f"{solver} vs lbfgs" ) @@ -820,7 +789,7 @@ def test_logistic_regression_sample_weights(): # Test that sample weights work the same with the lbfgs, # newton-cg, newton-cholesky and 'sag' solvers - clf_sw_lbfgs = LR(**kw) + clf_sw_lbfgs = LR(**kw, tol=1e-5) clf_sw_lbfgs.fit(X, y, sample_weight=sample_weight) for solver in set(SOLVERS) - set(("lbfgs", "saga")): clf_sw = LR(solver=solver, tol=1e-10 if solver == "sag" else 1e-5, **kw) @@ -946,9 +915,9 @@ def test_logistic_regression_multinomial(): # 'lbfgs' is used as a referenced solver = "lbfgs" - ref_i = LogisticRegression(solver=solver, multi_class="multinomial") + ref_i = LogisticRegression(solver=solver, multi_class="multinomial", tol=1e-6) ref_w = LogisticRegression( - solver=solver, multi_class="multinomial", fit_intercept=False + solver=solver, multi_class="multinomial", fit_intercept=False, tol=1e-6 ) ref_i.fit(X, y) ref_w.fit(X, y) @@ -976,9 +945,9 @@ def test_logistic_regression_multinomial(): assert clf_w.coef_.shape == (n_classes, n_features) # Compare solutions between lbfgs and the other solvers - assert_allclose(ref_i.coef_, clf_i.coef_, rtol=1e-2) + assert_allclose(ref_i.coef_, clf_i.coef_, rtol=1e-3) assert_allclose(ref_w.coef_, clf_w.coef_, rtol=1e-2) - assert_allclose(ref_i.intercept_, clf_i.intercept_, rtol=1e-2) + assert_allclose(ref_i.intercept_, clf_i.intercept_, rtol=1e-3) # Test that the path give almost the same results. 
However since in this # case we take the average of the coefs after fitting across all the @@ -988,8 +957,8 @@ def test_logistic_regression_multinomial(): solver=solver, max_iter=2000, tol=1e-6, multi_class="multinomial", Cs=[1.0] ) clf_path.fit(X, y) - assert_allclose(clf_path.coef_, ref_i.coef_, rtol=2e-2) - assert_allclose(clf_path.intercept_, ref_i.intercept_, rtol=2e-2) + assert_allclose(clf_path.coef_, ref_i.coef_, rtol=1e-2) + assert_allclose(clf_path.intercept_, ref_i.intercept_, rtol=1e-2) def test_liblinear_decision_function_zero(): From fdc0fa36b6924b7f721a0cc94892b666f01da634 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 18 Sep 2023 20:05:56 +0200 Subject: [PATCH 04/20] ENH fix curvature condition in CG --- sklearn/utils/optimize.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index 97d7ddf1169b1..b6a319bc49953 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -108,6 +108,8 @@ def _cg(fhess_p, fgrad, maxiter, tol): psupi = -ri i = 0 dri0 = np.dot(ri, ri) + # We also track of |p_i|^2. + psupi_norm2 = dri0 while i <= maxiter: if np.sum(np.abs(ri)) <= tol: @@ -116,7 +118,8 @@ def _cg(fhess_p, fgrad, maxiter, tol): Ap = fhess_p(psupi) # check curvature curv = np.dot(psupi, Ap) - if 0 <= curv <= 3 * np.finfo(np.float64).eps: + if 0 <= curv <= 16 * np.finfo(np.float64).eps * psupi_norm2: + # See https://arxiv.org/abs/1803.02924, Algo 1 Capped Conjugate Gradient. break elif curv < 0: if i > 0: @@ -131,6 +134,8 @@ def _cg(fhess_p, fgrad, maxiter, tol): dri1 = np.dot(ri, ri) betai = dri1 / dri0 psupi = -ri + betai * psupi + # We use |p_i|^2 = |r_i|^2 + beta_i^2 |p_{i-1}|^2 + psupi_norm2 = dri1 + betai**2 * psupi_norm2 i = i + 1 dri0 = dri1 # update np.dot(ri,ri) for next time. 
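For illustration, a minimal standalone sketch of the inner conjugate gradient loop with the relative curvature guard introduced in the patch above. The helper name and the toy quadratic are made up; the real implementation is `_cg` in sklearn/utils/optimize.py, which additionally handles negative curvature by returning the best iterate found so far::

    import numpy as np

    def cg_newton_step(hess_prod, grad, maxiter=100, tol=1e-10):
        """Approximately solve H @ x = -grad by conjugate gradient.

        The old absolute threshold ``0 <= curv <= 3 * eps`` depends on the
        scale of the problem; ``curv <= 16 * eps * ||p_i||^2`` does not,
        because curv = p_i^T H p_i grows quadratically with ||p_i||.
        """
        x = np.zeros_like(grad)
        r = grad.copy()      # residual of H @ x + grad = 0
        p = -r               # search direction
        rr = r @ r
        p_norm2 = rr         # ||p_0||^2 == ||r_0||^2
        eps = np.finfo(np.float64).eps
        for _ in range(maxiter):
            if np.sum(np.abs(r)) <= tol:
                break
            Hp = hess_prod(p)
            curv = p @ Hp
            if curv <= 16 * eps * p_norm2:
                # numerically zero (or negative) curvature: stop, keep x
                break
            alpha = rr / curv
            x += alpha * p
            r += alpha * Hp
            rr_new = r @ r
            beta = rr_new / rr
            p = -r + beta * p
            # ||p_i||^2 = ||r_i||^2 + beta_i^2 * ||p_{i-1}||^2 because r_i is
            # orthogonal to the previous search direction
            p_norm2 = rr_new + beta**2 * p_norm2
            rr = rr_new
        return x

    # toy usage: Newton step for f(x) = 0.5 * x^T A x + b^T x
    A = np.array([[3.0, 1.0], [1.0, 2.0]])
    b = np.array([1.0, -1.0])
    print(cg_newton_step(lambda v: A @ v, b), np.linalg.solve(A, -b))

Tracking ``p_norm2`` through the recursion avoids recomputing the norm of the search direction at every iteration, which is the same trick the patch adds to ``_cg``.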
From ef268133b75d77cbb0c4e98e20cf594c9ba5885e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 18 Sep 2023 23:57:50 +0200 Subject: [PATCH 05/20] ENH add verbose to _newton_cg --- sklearn/linear_model/_logistic.py | 11 +++++++-- sklearn/utils/optimize.py | 41 ++++++++++++++++++++++++++----- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index c0badd03c078e..0fe6ccd241558 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -15,9 +15,9 @@ from numbers import Integral, Real import numpy as np -from joblib import effective_n_jobs from scipy import optimize +from joblib import effective_n_jobs from sklearn.metrics import get_scorer_names from .._loss.loss import HalfBinomialLoss, HalfMultinomialLoss @@ -481,7 +481,14 @@ def _logistic_regression_path( l2_reg_strength = 1.0 / (C * sw_sum) args = (X, target, sample_weight, l2_reg_strength, n_threads) w0, n_iter_i = _newton_cg( - hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol + grad_hess=hess, + func=func, + grad=grad, + x0=w0, + args=args, + maxiter=max_iter, + tol=tol, + verbose=verbose, ) elif solver == "newton-cholesky": l2_reg_strength = 1.0 / (C * sw_sum) diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index b6a319bc49953..a222978512112 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -26,7 +26,9 @@ class _LineSearchError(RuntimeError): pass -def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs): +def _line_search_wolfe12( + f, fprime, xk, pk, gfk, old_fval, old_old_fval, verbose=0, **kwargs +): """ Same as line_search_wolfe1, but fall back to line_search_wolfe2 if suitable step length is not found, and raise an exception if a @@ -57,6 +59,9 @@ def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwarg sum_abs_grad = scipy.linalg.norm(grad, ord=1) check = sum_abs_grad < sum_abs_grad_old if check: + if verbose >= 2: + print(" newton_cg line search detected tiny loss improvement.") + print(f" {loss_improvement=} {sum_abs_grad=}") ret = ( 1.0, # step size ret[1] + 1, # number of function evaluations @@ -78,7 +83,7 @@ def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwarg return ret -def _cg(fhess_p, fgrad, maxiter, tol): +def _cg(fhess_p, fgrad, maxiter, tol, verbose=0): """ Solve iteratively the linear system 'fhess_p . xsupi = fgrad' with a conjugate gradient descent. @@ -113,6 +118,10 @@ def _cg(fhess_p, fgrad, maxiter, tol): while i <= maxiter: if np.sum(np.abs(ri)) <= tol: + if verbose >= 2: + print( + f" inner solver iteration {i} stopped with {np.sum(np.abs(ri))=}" + ) break Ap = fhess_p(psupi) @@ -120,6 +129,8 @@ def _cg(fhess_p, fgrad, maxiter, tol): curv = np.dot(psupi, Ap) if 0 <= curv <= 16 * np.finfo(np.float64).eps * psupi_norm2: # See https://arxiv.org/abs/1803.02924, Algo 1 Capped Conjugate Gradient. + if verbose >= 2: + print(f" inner solver iteration {i} stopped with {curv=}") break elif curv < 0: if i > 0: @@ -138,7 +149,11 @@ def _cg(fhess_p, fgrad, maxiter, tol): psupi_norm2 = dri1 + betai**2 * psupi_norm2 i = i + 1 dri0 = dri1 # update np.dot(ri,ri) for next time. 
- + if i > maxiter and verbose >= 2: + print( + f" newton_cg iterative solver stopped with maxiter={i - 1} and " + f"{np.sum(np.abs(ri))=}" + ) return xsupi @@ -153,6 +168,7 @@ def _newton_cg( maxinner=200, line_search=True, warn=True, + verbose=0, ): """ Minimization of scalar function of one or more variables using the @@ -206,6 +222,8 @@ def _newton_cg( if line_search: old_fval = func(x0, *args) old_old_fval = None + else: + old_fval = 0 # Outer loop: our Newton iteration while k < maxiter: @@ -214,7 +232,10 @@ def _newton_cg( fgrad, fhess_p = grad_hess(xk, *args) absgrad = np.abs(fgrad) - if np.max(absgrad) <= tol: + max_absgrad = np.max(absgrad) + if verbose > 0: + print(f"newton_cg iter = {k} loss = {old_fval} max|grad| = {max_absgrad}") + if max_absgrad <= tol: break maggrad = np.sum(absgrad) @@ -223,14 +244,22 @@ def _newton_cg( # Inner loop: solve the Newton update by conjugate gradient, to # avoid inverting the Hessian - xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond) + xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond, verbose=verbose) alphak = 1.0 if line_search: try: alphak, fc, gc, old_fval, old_old_fval, gfkp1 = _line_search_wolfe12( - func, grad, xk, xsupi, fgrad, old_fval, old_old_fval, args=args + func, + grad, + xk, + xsupi, + fgrad, + old_fval, + old_old_fval, + verbose=verbose, + args=args, ) except _LineSearchError: warnings.warn("Line Search failed") From b1aae34a113d0213dafdc8f6bce8d55323ba8279 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 14:17:38 +0200 Subject: [PATCH 06/20] DOC add whatsnew --- doc/whats_new/v1.4.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 589f2b3e1dd5e..464e7c7cf8cab 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -196,6 +196,21 @@ Changelog :class:`scipy.sparse.sparray` subclasses. :pr:`27301` by :user:`Lohit SundaramahaLingam `. +:mod:`sklearn.linear_model` +................................... + +- |Efficiency| :class:`linear_model.LogisticRegression` and + :class:`linear_model.LogisticRegressionCV` now have much better convergence for + solvers `"lbfgs"` and `"newton-cg"`. Both solvers can now reach much higher precision + for the coefficients depending on the specified `tol`. Additionally, lbfgs can + make better use of `tol`, i.e., stop sooner or reach higher precision, and newton-cg + is now faster than before. + This change also means that with this new version of scikit-learn, the resulting + coefficients `coef_` and `intercept_` of your models will change for these two + solvers (when fit on the same data again). The amount of change depends on the + specified `tol`, for small values you will get more precise results. + :pr:`26721` by :user:`Christian Lorentzen `. + :mod:`sklearn.metrics` ...................... From fc96f0d000f8521ef419daae896fb049aba31287 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 14:26:52 +0200 Subject: [PATCH 07/20] DOC add changed models entry --- doc/whats_new/v1.4.rst | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 464e7c7cf8cab..9b50a5877dbbd 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -19,6 +19,23 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. 
+- |Efficiency| :class:`linear_model.LogisticRegression` and + :class:`linear_model.LogisticRegressionCV` now have much better convergence for + solvers `"lbfgs"` and `"newton-cg"`. Both solvers can now reach much higher precision + for the coefficients depending on the specified `tol`. Additionally, lbfgs can + make better use of `tol`, i.e., stop sooner or reach higher precision, and newton-cg + is now faster than before. + :pr:`26721` by :user:`Christian Lorentzen `. + + .. note:: + + The lbfgs is the default solver, so this change might effect many models. + + This change also means that with this new version of scikit-learn, the resulting + coefficients `coef_` and `intercept_` of your models will change for these two + solvers (when fit on the same data again). The amount of change depends on the + specified `tol`, for small values you will get more precise results. + Changes impacting all modules ----------------------------- @@ -205,12 +222,15 @@ Changelog for the coefficients depending on the specified `tol`. Additionally, lbfgs can make better use of `tol`, i.e., stop sooner or reach higher precision, and newton-cg is now faster than before. - This change also means that with this new version of scikit-learn, the resulting - coefficients `coef_` and `intercept_` of your models will change for these two - solvers (when fit on the same data again). The amount of change depends on the - specified `tol`, for small values you will get more precise results. :pr:`26721` by :user:`Christian Lorentzen `. + .. note:: + + This change also means that with this new version of scikit-learn, the resulting + coefficients `coef_` and `intercept_` of your models will change for these two + solvers (when fit on the same data again). The amount of change depends on the + specified `tol`, for small values you will get more precise results. + :mod:`sklearn.metrics` ...................... 
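The changelog entries above rest on the fact that rescaling the objective does not move its minimizer: the historical formulation ``C * sum(pointwise_loss) + 0.5 * ||coef||^2`` and the new one ``mean(pointwise_loss) + 1/(C * sw_sum) * 0.5 * ||coef||^2`` differ only by the constant factor ``C * sw_sum``. A small numerical check of this equivalence (the random data and the hand-written binomial loss below are made up for illustration; they are not the scikit-learn implementation)::

    import numpy as np
    from scipy.optimize import minimize

    rng = np.random.default_rng(0)
    X = rng.standard_normal((50, 3))
    y = (X @ np.array([1.0, -2.0, 0.5]) > 0).astype(float)
    C, sw_sum = 0.5, X.shape[0]  # unit sample weights, so sw_sum == n_samples

    def objective(w, loss_scale, l2_reg_strength):
        # binomial (log) loss and its gradient, scaled by `loss_scale`
        z = X @ w
        p = 1.0 / (1.0 + np.exp(-z))
        loss = np.sum(y * np.logaddexp(0.0, -z) + (1.0 - y) * np.logaddexp(0.0, z))
        grad = X.T @ (p - y)
        return (
            loss_scale * loss + 0.5 * l2_reg_strength * (w @ w),
            loss_scale * grad + l2_reg_strength * w,
        )

    opts = {"gtol": 1e-10, "ftol": 1e-14}
    # historical scaling: C * sum(loss) + 0.5 * ||w||^2
    w_old = minimize(objective, np.zeros(3), args=(C, 1.0), jac=True,
                     method="L-BFGS-B", options=opts).x
    # new scaling: mean(loss) + 1/(C * sw_sum) * 0.5 * ||w||^2
    w_new = minimize(objective, np.zeros(3), args=(1.0 / sw_sum, 1.0 / (C * sw_sum)),
                     jac=True, method="L-BFGS-B", options=opts).x
    print(np.max(np.abs(w_old - w_new)))  # ~0: identical coefficients

The practical benefit of the new scaling is that the gradient magnitude no longer grows with the number of samples, so a fixed ``tol`` has a comparable meaning across data sizes.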
From 6e559291dd7ab0e4ebdfb9f77d56c63379a8b942 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 14:44:12 +0200 Subject: [PATCH 08/20] CLN isort _logistic.py --- sklearn/linear_model/_logistic.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index e4fe81fd12eb6..9c7dd476e061c 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -26,30 +26,18 @@ from ..model_selection import check_cv from ..preprocessing import LabelBinarizer, LabelEncoder from ..svm._base import _fit_liblinear -from ..utils import ( - Bunch, - check_array, - check_consistent_length, - check_random_state, - compute_class_weight, -) +from ..utils import (Bunch, check_array, check_consistent_length, + check_random_state, compute_class_weight) from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import row_norms, softmax -from ..utils.metadata_routing import ( - MetadataRouter, - MethodMapping, - _raise_for_params, - _routing_enabled, - process_routing, -) +from ..utils.metadata_routing import (MetadataRouter, MethodMapping, + _raise_for_params, _routing_enabled, + process_routing) from ..utils.multiclass import check_classification_targets from ..utils.optimize import _check_optimize_result, _newton_cg from ..utils.parallel import Parallel, delayed -from ..utils.validation import ( - _check_method_params, - _check_sample_weight, - check_is_fitted, -) +from ..utils.validation import (_check_method_params, _check_sample_weight, + check_is_fitted) from ._base import BaseEstimator, LinearClassifierMixin, SparseCoefMixin from ._glm.glm import NewtonCholeskySolver from ._linear_loss import LinearModelLoss From 175ee5fa814b6a4f3fe21a81be382d2df17d6b2b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 15:06:57 +0200 Subject: [PATCH 09/20] CLN black _logistic.py --- sklearn/linear_model/_logistic.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 9c7dd476e061c..e4fe81fd12eb6 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -26,18 +26,30 @@ from ..model_selection import check_cv from ..preprocessing import LabelBinarizer, LabelEncoder from ..svm._base import _fit_liblinear -from ..utils import (Bunch, check_array, check_consistent_length, - check_random_state, compute_class_weight) +from ..utils import ( + Bunch, + check_array, + check_consistent_length, + check_random_state, + compute_class_weight, +) from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import row_norms, softmax -from ..utils.metadata_routing import (MetadataRouter, MethodMapping, - _raise_for_params, _routing_enabled, - process_routing) +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) from ..utils.multiclass import check_classification_targets from ..utils.optimize import _check_optimize_result, _newton_cg from ..utils.parallel import Parallel, delayed -from ..utils.validation import (_check_method_params, _check_sample_weight, - check_is_fitted) +from ..utils.validation import ( + _check_method_params, + _check_sample_weight, + check_is_fitted, +) from ._base import BaseEstimator, LinearClassifierMixin, SparseCoefMixin from ._glm.glm import NewtonCholeskySolver from 
._linear_loss import LinearModelLoss From 52d63d53cc2ec4350aac67b05028d2e386148ac7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 16:17:27 +0200 Subject: [PATCH 10/20] CI ruff --config pyproject.toml --- build_tools/linting.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/linting.sh b/build_tools/linting.sh index 28d16a8bbed32..cef174ea763c4 100755 --- a/build_tools/linting.sh +++ b/build_tools/linting.sh @@ -23,7 +23,7 @@ else fi echo -e "### Running ruff ###\n" -ruff check --show-source . +ruff check --show-source --config pyproject.toml . status=$? if [[ $status -eq 0 ]] then From 1a499f719ea7f709ab64b5af9a49d58f683f1b20 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 16:22:06 +0200 Subject: [PATCH 11/20] Revert "CI ruff --config pyproject.toml" This reverts commit 52d63d53cc2ec4350aac67b05028d2e386148ac7. --- build_tools/linting.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/linting.sh b/build_tools/linting.sh index cef174ea763c4..28d16a8bbed32 100755 --- a/build_tools/linting.sh +++ b/build_tools/linting.sh @@ -23,7 +23,7 @@ else fi echo -e "### Running ruff ###\n" -ruff check --show-source --config pyproject.toml . +ruff check --show-source . status=$? if [[ $status -eq 0 ]] then From ff424391713dfd648d197e1ac3255b261b01d152 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 21:58:36 +0200 Subject: [PATCH 12/20] TST fix doctest failures --- sklearn/feature_selection/_from_model.py | 2 +- sklearn/multioutput.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index dadca96d5df5f..fd20a30289fd5 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -211,7 +211,7 @@ class SelectFromModel( >>> y = [0, 1, 0, 1] >>> selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y) >>> selector.estimator_.coef_ - array([[-0.3252302 , 0.83462377, 0.49750423]]) + array([[-0.3252..., 0.8345..., 0.4976...]]) >>> selector.threshold_ 0.55245... >>> selector.get_support() diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 595156e10722a..5cf1eae96fd3b 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -888,7 +888,7 @@ class labels for each estimator in the chain. >>> chain.predict_proba(X_test) array([[0.8387..., 0.9431..., 0.4576...], [0.8878..., 0.3684..., 0.2640...], - [0.0321..., 0.9935..., 0.0625...]]) + [0.0321..., 0.9935..., 0.0626...]]) """ @_fit_context( From 97d7468da69abdacfd03157038d4737cae51b27d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 22:19:38 +0200 Subject: [PATCH 13/20] TST fix doctest failures 2nd try --- sklearn/feature_selection/_from_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index fd20a30289fd5..c44b2bcae7218 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -213,7 +213,7 @@ class SelectFromModel( >>> selector.estimator_.coef_ array([[-0.3252..., 0.8345..., 0.4976...]]) >>> selector.threshold_ - 0.55245... + 0.55249... 
>>> selector.get_support() array([False, True, False]) >>> selector.transform(X) From 433c2caa71b374a842a56328bb09c9ef9981af60 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 22:32:36 +0200 Subject: [PATCH 14/20] TST test_multinomial_loss in test_sag.py --- sklearn/linear_model/tests/test_sag.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/tests/test_sag.py b/sklearn/linear_model/tests/test_sag.py index 2208deaf55d8d..96f8a79726833 100644 --- a/sklearn/linear_model/tests/test_sag.py +++ b/sklearn/linear_model/tests/test_sag.py @@ -935,8 +935,7 @@ def test_multinomial_loss(): rng = check_random_state(42) weights = rng.randn(n_features, n_classes) intercept = rng.randn(n_classes) - sample_weights = rng.randn(n_samples) - np.abs(sample_weights, sample_weights) + sample_weights = np.abs(rng.randn(n_samples)) # compute loss and gradient like in multinomial SAG dataset, _ = make_dataset(X, y, sample_weights, random_state=42) @@ -953,6 +952,9 @@ def test_multinomial_loss(): weights_intercept, X, y, l2_reg_strength=0.0, sample_weight=sample_weights ) grad_2 = grad_2[:, :-1].T + # convert to same convention, i.e. LinearModelLoss uses average(loss, weight=sw) + loss_2 *= np.sum(sample_weights) + grad_2 *= np.sum(sample_weights) # comparison assert_array_almost_equal(grad_1, grad_2) @@ -987,6 +989,9 @@ def test_multinomial_loss_ground_truth(): weights_intercept, X, y, l2_reg_strength=0.0, sample_weight=sample_weights ) grad_2 = grad_2[:, :-1].T + # convert to same convention, i.e. LinearModelLoss uses average(loss, weight=sw) + loss_2 *= np.sum(sample_weights) + grad_2 *= np.sum(sample_weights) assert_almost_equal(loss_1, loss_2) assert_array_almost_equal(grad_1, grad_2) From fce33e6ba7adef2be27a5b92d681ff2b980db8fa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 3 Oct 2023 18:31:12 +0200 Subject: [PATCH 15/20] apply pre-commit on _logistic.py --- sklearn/linear_model/_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index e4fe81fd12eb6..08aa45a959765 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -15,9 +15,9 @@ from numbers import Integral, Real import numpy as np +from joblib import effective_n_jobs from scipy import optimize -from joblib import effective_n_jobs from sklearn.metrics import get_scorer_names from .._loss.loss import HalfBinomialLoss, HalfMultinomialLoss From 3a4b7b47613dc1e0c1c693d175e1081f357fee5c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 3 Oct 2023 19:39:07 +0200 Subject: [PATCH 16/20] ENH increase maxls in lbfgs like in GLMs This is needed for more reliable convergence. Tests like test_logistic_regressioncv_class_weights then don't raise a convergence error. 
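For reference, the L-BFGS-B options this series ends up passing to ``scipy.optimize.minimize`` can be exercised in isolation as follows. The quadratic objective is only a stand-in for ``LinearModelLoss.loss_gradient``; the names and values outside the ``options`` dict are made up::

    import numpy as np
    from scipy import optimize

    def func(w, l2_reg_strength):
        # stand-in returning (loss, gradient), like loss_gradient does
        loss = 0.5 * (1.0 + l2_reg_strength) * (w @ w)
        grad = (1.0 + l2_reg_strength) * w
        return loss, grad

    w0, tol, max_iter = np.ones(4), 1e-4, 100
    opt_res = optimize.minimize(
        func,
        w0,
        method="L-BFGS-B",
        jac=True,
        args=(1e-2,),
        options={
            "maxiter": max_iter,
            "maxls": 50,  # more line search steps than the default of 20
            "iprint": -1,  # silent
            "gtol": tol,   # stop on max|projected gradient| <= gtol
            "ftol": 64 * np.finfo(float).eps,  # effectively disable ftol
        },
    )
    print(opt_res.x, opt_res.nit, opt_res.status)

Raising ``maxls`` allows more line search trials per iteration before L-BFGS-B gives up, which the commit message above reports is needed so that tests such as test_logistic_regressioncv_class_weights converge without warnings once ``ftol`` is effectively disabled and convergence is driven by ``gtol`` alone.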
--- sklearn/linear_model/_logistic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 08aa45a959765..32022e031566c 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -464,9 +464,10 @@ def _logistic_regression_path( jac=True, args=(X, target, sample_weight, l2_reg_strength, n_threads), options={ + "maxiter": max_iter, + "maxls": 50, # default is 20 "iprint": iprint, "gtol": tol, - "maxiter": max_iter, "ftol": 64 * np.finfo(float).eps, }, ) From 573fea1ee76306e06030a34f0a19036bec8f0536 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 3 Oct 2023 20:55:33 +0200 Subject: [PATCH 17/20] TST increase tol of LogisticRegressionCV in test_balance_property --- sklearn/linear_model/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_common.py b/sklearn/linear_model/tests/test_common.py index 201bac9927112..ff9d7aad146f3 100644 --- a/sklearn/linear_model/tests/test_common.py +++ b/sklearn/linear_model/tests/test_common.py @@ -59,7 +59,7 @@ ), marks=pytest.mark.xfail(reason="Missing importance sampling scheme"), ), - LogisticRegressionCV(), + LogisticRegressionCV(tol=1e-6), MultiTaskElasticNet(), MultiTaskElasticNetCV(), MultiTaskLasso(), From f32c90f9d279740e95852dd68f58068b1785a09e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 3 Oct 2023 22:43:55 +0200 Subject: [PATCH 18/20] Revert "ENH add verbose to _newton_cg" This reverts commit ef268133b75d77cbb0c4e98e20cf594c9ba5885e. --- sklearn/linear_model/_logistic.py | 9 +------ sklearn/utils/optimize.py | 41 +++++-------------------------- 2 files changed, 7 insertions(+), 43 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index f06651ec9b8d5..e6ac6ff087945 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -482,14 +482,7 @@ def _logistic_regression_path( l2_reg_strength = 1.0 / (C * sw_sum) args = (X, target, sample_weight, l2_reg_strength, n_threads) w0, n_iter_i = _newton_cg( - grad_hess=hess, - func=func, - grad=grad, - x0=w0, - args=args, - maxiter=max_iter, - tol=tol, - verbose=verbose, + hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol ) elif solver == "newton-cholesky": l2_reg_strength = 1.0 / (C * sw_sum) diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index 8987195480ec3..a9eb7afcff8c0 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -26,9 +26,7 @@ class _LineSearchError(RuntimeError): pass -def _line_search_wolfe12( - f, fprime, xk, pk, gfk, old_fval, old_old_fval, verbose=0, **kwargs -): +def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs): """ Same as line_search_wolfe1, but fall back to line_search_wolfe2 if suitable step length is not found, and raise an exception if a @@ -59,9 +57,6 @@ def _line_search_wolfe12( sum_abs_grad = scipy.linalg.norm(grad, ord=1) check = sum_abs_grad < sum_abs_grad_old if check: - if verbose >= 2: - print(" newton_cg line search detected tiny loss improvement.") - print(f" {loss_improvement=} {sum_abs_grad=}") ret = ( 1.0, # step size ret[1] + 1, # number of function evaluations @@ -83,7 +78,7 @@ def _line_search_wolfe12( return ret -def _cg(fhess_p, fgrad, maxiter, tol, verbose=0): +def _cg(fhess_p, fgrad, maxiter, tol): """ Solve iteratively the linear system 'fhess_p . 
xsupi = fgrad' with a conjugate gradient descent. @@ -118,10 +113,6 @@ def _cg(fhess_p, fgrad, maxiter, tol, verbose=0): while i <= maxiter: if np.sum(np.abs(ri)) <= tol: - if verbose >= 2: - print( - f" inner solver iteration {i} stopped with {np.sum(np.abs(ri))=}" - ) break Ap = fhess_p(psupi) @@ -129,8 +120,6 @@ def _cg(fhess_p, fgrad, maxiter, tol, verbose=0): curv = np.dot(psupi, Ap) if 0 <= curv <= 16 * np.finfo(np.float64).eps * psupi_norm2: # See https://arxiv.org/abs/1803.02924, Algo 1 Capped Conjugate Gradient. - if verbose >= 2: - print(f" inner solver iteration {i} stopped with {curv=}") break elif curv < 0: if i > 0: @@ -149,11 +138,7 @@ def _cg(fhess_p, fgrad, maxiter, tol, verbose=0): psupi_norm2 = dri1 + betai**2 * psupi_norm2 i = i + 1 dri0 = dri1 # update np.dot(ri,ri) for next time. - if i > maxiter and verbose >= 2: - print( - f" newton_cg iterative solver stopped with maxiter={i - 1} and " - f"{np.sum(np.abs(ri))=}" - ) + return xsupi @@ -168,7 +153,6 @@ def _newton_cg( maxinner=200, line_search=True, warn=True, - verbose=0, ): """ Minimization of scalar function of one or more variables using the @@ -222,8 +206,6 @@ def _newton_cg( if line_search: old_fval = func(x0, *args) old_old_fval = None - else: - old_fval = 0 # Outer loop: our Newton iteration while k < maxiter: @@ -232,10 +214,7 @@ def _newton_cg( fgrad, fhess_p = grad_hess(xk, *args) absgrad = np.abs(fgrad) - max_absgrad = np.max(absgrad) - if verbose > 0: - print(f"newton_cg iter = {k} loss = {old_fval} max|grad| = {max_absgrad}") - if max_absgrad <= tol: + if np.max(absgrad) <= tol: break maggrad = np.sum(absgrad) @@ -244,22 +223,14 @@ def _newton_cg( # Inner loop: solve the Newton update by conjugate gradient, to # avoid inverting the Hessian - xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond, verbose=verbose) + xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond) alphak = 1.0 if line_search: try: alphak, fc, gc, old_fval, old_old_fval, gfkp1 = _line_search_wolfe12( - func, - grad, - xk, - xsupi, - fgrad, - old_fval, - old_old_fval, - verbose=verbose, - args=args, + func, grad, xk, xsupi, fgrad, old_fval, old_old_fval, args=args ) except _LineSearchError: warnings.warn("Line Search failed") From 511b142d5eb5aed599542822c2cbe44ce5340375 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 6 Oct 2023 08:28:50 +0200 Subject: [PATCH 19/20] MNT add TODO note for old line search branch --- sklearn/utils/optimize.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index a9eb7afcff8c0..024b0bcaf95ee 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -68,6 +68,9 @@ def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwarg if ret[0] is None: # line search failed: try different one. + # TODO: It seems that the new check for the sum of absolute gradients above + # catches all cases that, earlier, ended up here. In fact, our tests never + # trigger this "if branch" here and we can consider to remove it. 
ret = line_search_wolfe2( f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs ) From d5dea8621d58a9226adc498fa7a1b772a323446f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 6 Oct 2023 19:09:30 +0200 Subject: [PATCH 20/20] DOC add 2nd whatsnew entry --- doc/whats_new/v1.4.rst | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 900e2eff7da64..a56017fa4a624 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -23,8 +23,7 @@ random sampling procedures. :class:`linear_model.LogisticRegressionCV` now have much better convergence for solvers `"lbfgs"` and `"newton-cg"`. Both solvers can now reach much higher precision for the coefficients depending on the specified `tol`. Additionally, lbfgs can - make better use of `tol`, i.e., stop sooner or reach higher precision, and newton-cg - is now faster than before. + make better use of `tol`, i.e., stop sooner or reach higher precision. :pr:`26721` by :user:`Christian Lorentzen `. .. note:: @@ -271,8 +270,9 @@ Changelog :class:`linear_model.LogisticRegressionCV` now have much better convergence for solvers `"lbfgs"` and `"newton-cg"`. Both solvers can now reach much higher precision for the coefficients depending on the specified `tol`. Additionally, lbfgs can - make better use of `tol`, i.e., stop sooner or reach higher precision, and newton-cg - is now faster than before. + make better use of `tol`, i.e., stop sooner or reach higher precision. This is + accomplished by better scaling of the objective function, i.e., using average per + sample losses instead of sum of per sample losses. :pr:`26721` by :user:`Christian Lorentzen `. .. note:: @@ -282,6 +282,13 @@ Changelog solvers (when fit on the same data again). The amount of change depends on the specified `tol`, for small values you will get more precise results. +- |Efficiency| :class:`linear_model.LogisticRegression` and + :class:`linear_model.LogisticRegressionCV` with solver `"newton-cg"` can now be + considerably faster for some data and parameter settings. This is accomplished by a + better line search convergence check for negligible loss improvements that takes into + account gradient information. + :pr:`26721` by :user:`Christian Lorentzen `. + - |Efficiency| Solver `"newton-cg"` in :class:`linear_model.LogisticRegression` and :class:`linear_model.LogisticRegressionCV` uses a little less memory. The effect is proportional to the number of coefficients (`n_features * n_classes`).
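The improved line search check referred to in the changelog above (added to ``_line_search_wolfe12`` in PATCH 02) can be summarised by the following sketch. The function name is made up, and ``f``/``fprime`` stand for the loss and gradient callables::

    import numpy as np
    from scipy import linalg

    def accept_step_on_tiny_loss_improvement(f, fprime, xk, pk, gfk, old_fval):
        """Fallback used when the Wolfe line search fails.

        If the full step `pk` changes the loss only at the level of rounding
        noise, accept it anyway provided it does not increase the l1 norm of
        the gradient; otherwise signal failure (step size None).
        """
        fval = f(xk + pk)
        eps = 16 * np.finfo(np.asarray(old_fval).dtype).eps
        if np.abs(fval - old_fval) <= np.abs(old_fval) * eps:
            grad = fprime(xk + pk)
            if linalg.norm(grad, ord=1) < linalg.norm(gfk, ord=1):
                return 1.0, fval, grad   # accept the unit Newton step
        return None, old_fval, gfk       # caller tries line_search_wolfe2 next

    # toy usage: a quadratic plus a constant offset, evaluated so close to its
    # minimum that the loss can no longer improve in float64
    f = lambda x: 1.0 + 0.5 * float(x @ x)
    fprime = lambda x: x
    xk = np.array([1e-9])
    pk = -xk
    print(accept_step_on_tiny_loss_improvement(f, fprime, xk, pk, fprime(xk), f(xk)))

Checking the l1 norm of the gradient is what lets the solver make progress when relative loss differences are already at machine precision, which is exactly the regime where the Wolfe conditions cannot be satisfied.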