Merged

Changes from all commits (24 commits):
- fc0f0e7 WIP to be continued (lorentzenchr, Feb 25, 2023)
- ab75fae Merge branch 'main' into linear_loss_normalize (lorentzenchr, Sep 15, 2023)
- e1c2128 ENH improve line search of newton_cg for tiny loss improvements (lorentzenchr, Sep 15, 2023)
- ead068f ENH sample weight rescaling after class weights (lorentzenchr, Sep 15, 2023)
- fdc0fa3 ENH fix curvature condition in CG (lorentzenchr, Sep 18, 2023)
- ef26813 ENH add verbose to _newton_cg (lorentzenchr, Sep 18, 2023)
- a6f0989 Merge branch 'main' into linear_loss_normalize (lorentzenchr, Sep 18, 2023)
- b1aae34 DOC add whatsnew (lorentzenchr, Oct 1, 2023)
- fc96f0d DOC add changed models entry (lorentzenchr, Oct 1, 2023)
- 514f605 Merge branch 'main' into linear_loss_normalize (lorentzenchr, Oct 1, 2023)
- 6e55929 CLN isort _logistic.py (lorentzenchr, Oct 1, 2023)
- 175ee5f CLN black _logistic.py (lorentzenchr, Oct 1, 2023)
- 52d63d5 CI ruff --config pyproject.toml (lorentzenchr, Oct 1, 2023)
- 1a499f7 Revert "CI ruff --config pyproject.toml" (lorentzenchr, Oct 1, 2023)
- ff42439 TST fix doctest failures (lorentzenchr, Oct 1, 2023)
- 97d7468 TST fix doctest failures 2nd try (lorentzenchr, Oct 1, 2023)
- 433c2ca TST test_multinomial_loss in test_sag.py (lorentzenchr, Oct 1, 2023)
- fce33e6 apply pre-commit on _logistic.py (glemaitre, Oct 3, 2023)
- 3a4b7b4 ENH increase maxls in lbfgs like in GLMs (lorentzenchr, Oct 3, 2023)
- 573fea1 TST increase tol of LogisticRegressionCV in test_balance_property (lorentzenchr, Oct 3, 2023)
- 4962c9f Merge branch 'main' into linear_loss_normalize (lorentzenchr, Oct 3, 2023)
- f32c90f Revert "ENH add verbose to _newton_cg" (lorentzenchr, Oct 3, 2023)
- 511b142 MNT add TODO note for old line search branch (lorentzenchr, Oct 6, 2023)
- d5dea86 DOC add 2nd whatsnew entry (lorentzenchr, Oct 6, 2023)
41 changes: 40 additions & 1 deletion doc/whats_new/v1.4.rst
@@ -19,6 +19,22 @@ parameters, may produce different models from the previous version. This often
occurs due to changes in the modelling logic (bug fixes or enhancements), or in
random sampling procedures.

- |Efficiency| :class:`linear_model.LogisticRegression` and
:class:`linear_model.LogisticRegressionCV` now have much better convergence for
solvers `"lbfgs"` and `"newton-cg"`. Both solvers can now reach much higher precision
for the coefficients depending on the specified `tol`. Additionally, lbfgs can
make better use of `tol`, i.e., stop sooner or reach higher precision.
:pr:`26721` by :user:`Christian Lorentzen <lorentzenchr>`.

.. note::

lbfgs is the default solver, so this change might affect many models.

This change also means that with this new version of scikit-learn, the resulting
coefficients `coef_` and `intercept_` of your models will change for these two
solvers (when fit on the same data again). The amount of change depends on the
specified `tol`; smaller values give more precise results.

Changes impacting all modules
-----------------------------

@@ -250,7 +266,30 @@ Changelog
:mod:`sklearn.linear_model`
...........................

- |Enhancement| Solver `"newton-cg"` in :class:`linear_model.LogisticRegression` and
- |Efficiency| :class:`linear_model.LogisticRegression` and
:class:`linear_model.LogisticRegressionCV` now have much better convergence for
solvers `"lbfgs"` and `"newton-cg"`. Both solvers can now reach much higher precision
for the coefficients depending on the specified `tol`. Additionally, lbfgs can
make better use of `tol`, i.e., stop sooner or reach higher precision. This is
accomplished by better scaling of the objective function, i.e., by using the
average of per-sample losses instead of the sum of per-sample losses.
:pr:`26721` by :user:`Christian Lorentzen <lorentzenchr>`.

.. note::

This change also means that with this new version of scikit-learn, the resulting
coefficients `coef_` and `intercept_` of your models will change for these two
solvers (when fit on the same data again). The amount of change depends on the
specified `tol`; smaller values give more precise results.

- |Efficiency| :class:`linear_model.LogisticRegression` and
:class:`linear_model.LogisticRegressionCV` with solver `"newton-cg"` can now be
considerably faster for some data and parameter settings. This is accomplished by a
better line search convergence check for negligible loss improvements that takes into
account gradient information.
:pr:`26721` by :user:`Christian Lorentzen <lorentzenchr>`.

- |Efficiency| Solver `"newton-cg"` in :class:`linear_model.LogisticRegression` and
:class:`linear_model.LogisticRegressionCV` uses a little less memory. The effect is
proportional to the number of coefficients (`n_features * n_classes`).
:pr:`27417` by :user:`Christian Lorentzen <lorentzenchr>`.
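The equivalence behind this entry can be illustrated with a short, self-contained sketch (illustrative only, not scikit-learn internals; the objective helpers and dataset below are made up): rescaling the objective from a sum of per-sample losses to their mean, with the L2 penalty divided by `C * n_samples`, does not move the minimizer, but it keeps gradient magnitudes independent of the dataset size, which is what lets lbfgs make better use of its gradient-based stopping criterion.

```python
import numpy as np
from scipy.optimize import minimize
from scipy.special import expit
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
t = 2.0 * y - 1.0                  # labels mapped to {-1, +1}
n, C = X.shape[0], 1.0

def obj_sum(w):
    # legacy scaling: C * sum of per-sample log-losses + 1/2 * ||w||^2
    z = X @ w
    loss = np.sum(np.logaddexp(0.0, -t * z))
    grad = X.T @ (-t * expit(-t * z))
    return C * loss + 0.5 * w @ w, C * grad + w

def obj_mean(w):
    # new scaling: mean of per-sample log-losses + 1/(2 * C * n) * ||w||^2,
    # i.e. the legacy objective divided by the positive constant C * n
    f, g = obj_sum(w)
    return f / (C * n), g / (C * n)

w0 = np.zeros(X.shape[1])
opts = {"gtol": 1e-10, "ftol": 1e-14, "maxiter": 1000}
w_sum = minimize(obj_sum, w0, jac=True, method="L-BFGS-B", options=opts).x
w_mean = minimize(obj_mean, w0, jac=True, method="L-BFGS-B", options=opts).x
print(np.abs(w_sum - w_mean).max())  # tiny: both scalings share the same minimizer
```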
4 changes: 2 additions & 2 deletions sklearn/feature_selection/_from_model.py
@@ -211,9 +211,9 @@ class SelectFromModel(
>>> y = [0, 1, 0, 1]
>>> selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
>>> selector.estimator_.coef_
array([[-0.3252302 , 0.83462377, 0.49750423]])
array([[-0.3252..., 0.8345..., 0.4976...]])
>>> selector.threshold_
0.55245...
0.55249...
>>> selector.get_support()
array([False, True, False])
>>> selector.transform(X)
29 changes: 16 additions & 13 deletions sklearn/linear_model/_glm/glm.py
@@ -207,10 +207,10 @@ def fit(self, X, y, sample_weight=None):
loss_dtype = min(max(y.dtype, X.dtype), np.float64)
y = check_array(y, dtype=loss_dtype, order="C", ensure_2d=False)

# TODO: We could support samples_weight=None as the losses support it.
# Note that _check_sample_weight calls check_array(order="C") required by
# losses.
sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype)
if sample_weight is not None:
# Note that _check_sample_weight calls check_array(order="C") required by
# losses.
sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype)

n_samples, n_features = X.shape
self._base_loss = self._get_loss()
@@ -228,17 +228,20 @@

# TODO: if alpha=0 check that X is not rank deficient

# IMPORTANT NOTE: Rescaling of sample_weight:
# NOTE: Rescaling of sample_weight:
# We want to minimize
# obj = 1/(2*sum(sample_weight)) * sum(sample_weight * deviance)
# obj = 1/(2 * sum(sample_weight)) * sum(sample_weight * deviance)
# + 1/2 * alpha * L2,
# with
# deviance = 2 * loss.
# The objective is invariant to multiplying sample_weight by a constant. We
# choose this constant such that sum(sample_weight) = 1. Thus, we end up with
# could choose this constant such that sum(sample_weight) = 1 in order to end
# up with
# obj = sum(sample_weight * loss) + 1/2 * alpha * L2.
# Note that LinearModelLoss.loss() computes sum(sample_weight * loss).
sample_weight = sample_weight / sample_weight.sum()
# But LinearModelLoss.loss() already computes
# average(loss, weights=sample_weight)
# Thus, without rescaling, we have
# obj = LinearModelLoss.loss(...)

if self.warm_start and hasattr(self, "coef_"):
if self.fit_intercept:
@@ -415,10 +418,10 @@ def score(self, X, y, sample_weight=None):
f" {base_loss.__name__}."
)

# Note that constant_to_optimal_zero is already multiplied by sample_weight.
constant = np.mean(base_loss.constant_to_optimal_zero(y_true=y))
if sample_weight is not None:
constant *= sample_weight.shape[0] / np.sum(sample_weight)
constant = np.average(
base_loss.constant_to_optimal_zero(y_true=y, sample_weight=None),
weights=sample_weight,
)

# Missing factor of 2 in deviance cancels out.
deviance = base_loss(
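The `score` change above replaces a two-step computation (plain mean of the sample-weight-multiplied per-sample constants, then a correction factor `n / sum(sample_weight)`) with a single `np.average` call. A tiny sketch with made-up numbers shows why the two agree:

```python
import numpy as np

rng = np.random.default_rng(0)
v = rng.normal(size=8)                 # stand-in for constant_to_optimal_zero(y_true=y)
sw = rng.uniform(0.5, 2.0, size=8)     # sample weights

old = np.mean(sw * v) * sw.shape[0] / np.sum(sw)   # previous two-step computation
new = np.average(v, weights=sw)                    # new single weighted average
print(np.isclose(old, new))  # True: both equal sum(sw * v) / sum(sw)
```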
43 changes: 28 additions & 15 deletions sklearn/linear_model/_linear_loss.py
@@ -12,18 +12,19 @@ class LinearModelLoss:

Note that raw_prediction is also known as linear predictor.

The loss is the sum of per sample losses and includes a term for L2
The loss is the average of per sample losses and includes a term for L2
regularization::

loss = sum_i s_i loss(y_i, X_i @ coef + intercept)
loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept)
+ 1/2 * l2_reg_strength * ||coef||_2^2

with sample weights s_i=1 if sample_weight=None.
with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i.

Gradient and hessian, for simplicity without intercept, are::

gradient = X.T @ loss.gradient + l2_reg_strength * coef
hessian = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity
gradient = 1 / s_sum * X.T @ loss.gradient + l2_reg_strength * coef
hessian = 1 / s_sum * X.T @ diag(loss.hessian) @ X
+ l2_reg_strength * identity

Conventions:
if fit_intercept:
@@ -182,7 +183,7 @@ def loss(
n_threads=1,
raw_prediction=None,
):
"""Compute the loss as sum over point-wise losses.
"""Compute the loss as weighted average over point-wise losses.

Parameters
----------
@@ -209,7 +210,7 @@
Returns
-------
loss : float
Sum of losses per sample plus penalty.
Weighted average of losses per sample, plus penalty.
"""
if raw_prediction is None:
weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X)
@@ -219,10 +220,10 @@
loss = self.base_loss.loss(
y_true=y,
raw_prediction=raw_prediction,
sample_weight=sample_weight,
sample_weight=None,
n_threads=n_threads,
)
loss = loss.sum()
loss = np.average(loss, weights=sample_weight)

return loss + self.l2_penalty(weights, l2_reg_strength)

@@ -263,12 +264,12 @@ def loss_gradient(
Returns
-------
loss : float
Sum of losses per sample plus penalty.
Weighted average of losses per sample, plus penalty.

gradient : ndarray of shape coef.shape
The gradient of the loss.
"""
n_features, n_classes = X.shape[1], self.base_loss.n_classes
(n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes
n_dof = n_features + int(self.fit_intercept)

if raw_prediction is None:
@@ -282,9 +283,12 @@
sample_weight=sample_weight,
n_threads=n_threads,
)
loss = loss.sum()
sw_sum = n_samples if sample_weight is None else np.sum(sample_weight)
loss = loss.sum() / sw_sum
loss += self.l2_penalty(weights, l2_reg_strength)

grad_pointwise /= sw_sum

if not self.base_loss.is_multiclass:
grad = np.empty_like(coef, dtype=weights.dtype)
grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
@@ -340,7 +344,7 @@ def gradient(
gradient : ndarray of shape coef.shape
The gradient of the loss.
"""
n_features, n_classes = X.shape[1], self.base_loss.n_classes
(n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes
n_dof = n_features + int(self.fit_intercept)

if raw_prediction is None:
@@ -354,6 +358,8 @@
sample_weight=sample_weight,
n_threads=n_threads,
)
sw_sum = n_samples if sample_weight is None else np.sum(sample_weight)
grad_pointwise /= sw_sum

if not self.base_loss.is_multiclass:
grad = np.empty_like(coef, dtype=weights.dtype)
@@ -439,6 +445,9 @@ def gradient_hessian(
sample_weight=sample_weight,
n_threads=n_threads,
)
sw_sum = n_samples if sample_weight is None else np.sum(sample_weight)
grad_pointwise /= sw_sum
hess_pointwise /= sw_sum

# For non-canonical link functions and far away from the optimum, the pointwise
# hessian can be negative. We take care that 75% of the hessian entries are
@@ -543,6 +552,7 @@ def gradient_hessian_product(
(n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes
n_dof = n_features + int(self.fit_intercept)
weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X)
sw_sum = n_samples if sample_weight is None else np.sum(sample_weight)

if not self.base_loss.is_multiclass:
grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian(
@@ -551,6 +561,8 @@
sample_weight=sample_weight,
n_threads=n_threads,
)
grad_pointwise /= sw_sum
hess_pointwise /= sw_sum
grad = np.empty_like(coef, dtype=weights.dtype)
grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
if self.fit_intercept:
@@ -603,6 +615,7 @@ def hessp(s):
sample_weight=sample_weight,
n_threads=n_threads,
)
grad_pointwise /= sw_sum
grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
if self.fit_intercept:
@@ -644,9 +657,9 @@ def hessp(s):
# hess_prod = empty_like(grad), but we ravel grad below and this
# function is run after that.
hess_prod = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
hess_prod[:, :n_features] = tmp.T @ X + l2_reg_strength * s
hess_prod[:, :n_features] = (tmp.T @ X) / sw_sum + l2_reg_strength * s
if self.fit_intercept:
hess_prod[:, -1] = tmp.sum(axis=0)
hess_prod[:, -1] = tmp.sum(axis=0) / sw_sum
if coef.ndim == 1:
return hess_prod.ravel(order="F")
else:
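For reference, here is a minimal sketch of the new weighted-average convention used by `LinearModelLoss` (binary log-loss, no intercept; the function name and signature are illustrative, not the private API): per-sample losses are averaged with `sample_weight`, and the pointwise gradient is divided by `sw_sum` before the matrix product, mirroring the `grad_pointwise /= sw_sum` lines above.

```python
import numpy as np
from scipy.special import expit

def loss_and_grad(coef, X, y, sample_weight=None, l2_reg_strength=0.0):
    """Binary log-loss under the weighted-average convention (illustrative sketch)."""
    sw = np.ones(X.shape[0]) if sample_weight is None else np.asarray(sample_weight)
    sw_sum = sw.sum()
    raw_prediction = X @ coef                            # linear predictor, no intercept
    t = 2.0 * y - 1.0                                    # labels mapped to {-1, +1}
    pointwise = np.logaddexp(0.0, -t * raw_prediction)   # per-sample log-loss
    loss = np.average(pointwise, weights=sw) + 0.5 * l2_reg_strength * coef @ coef
    # pointwise gradient divided by sw_sum before the matrix product
    grad_pointwise = sw * (expit(raw_prediction) - y) / sw_sum
    grad = X.T @ grad_pointwise + l2_reg_strength * coef
    return loss, grad
```

A finite-difference check of `grad` against `loss` is an easy way to validate that the `1 / sw_sum` scaling is applied consistently.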
56 changes: 28 additions & 28 deletions sklearn/linear_model/_logistic.py
@@ -304,33 +304,16 @@ def _logistic_regression_path(
# np.unique(y) gives labels in sorted order.
pos_class = classes[1]

# If sample weights exist, convert them to array (support for lists)
# and check length
# Otherwise set them to 1 for all examples
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True)

if solver == "newton-cholesky":
# IMPORTANT NOTE: Rescaling of sample_weight:
# Same as in _GeneralizedLinearRegressor.fit().
# We want to minimize
# obj = 1/(2*sum(sample_weight)) * sum(sample_weight * deviance)
# + 1/2 * alpha * L2,
# with
# deviance = 2 * log_loss.
# The objective is invariant to multiplying sample_weight by a constant. We
# choose this constant such that sum(sample_weight) = 1. Thus, we end up with
# obj = sum(sample_weight * loss) + 1/2 * alpha * L2.
# Note that LinearModelLoss.loss() computes sum(sample_weight * loss).
#
# This rescaling has to be done before multiplying by class_weights.
sw_sum = sample_weight.sum() # needed to rescale penalty, nasty matter!
sample_weight = sample_weight / sw_sum
if sample_weight is not None or class_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True)

# If class_weights is a dict (provided by the user), the weights
# are assigned to the original labels. If it is "balanced", then
# the class_weights are assigned after masking the labels with a OvR.
le = LabelEncoder()
if isinstance(class_weight, dict) or multi_class == "multinomial":
if isinstance(class_weight, dict) or (
multi_class == "multinomial" and class_weight is not None
):
class_weight_ = compute_class_weight(class_weight, classes=classes, y=y)
sample_weight *= class_weight_[le.fit_transform(y)]

@@ -375,6 +358,19 @@
(classes.size, n_features + int(fit_intercept)), order="F", dtype=X.dtype
)

# IMPORTANT NOTE:
# All solvers relying on LinearModelLoss need to scale the penalty with n_samples
# or the sum of sample weights because the implemented logistic regression
# objective here is (unfortunately)
# C * sum(pointwise_loss) + penalty
# instead of (as LinearModelLoss does)
# mean(pointwise_loss) + 1/C * penalty
if solver in ["lbfgs", "newton-cg", "newton-cholesky"]:
# This needs to be calculated after sample_weight is multiplied by
# class_weight. It is even tested that passing class_weight is equivalent to
# passing sample_weight constructed according to class_weight.
sw_sum = n_samples if sample_weight is None else np.sum(sample_weight)

if coef is not None:
# it must work both giving the bias term and not
if multi_class == "ovr":
@@ -457,7 +453,7 @@
n_iter = np.zeros(len(Cs), dtype=np.int32)
for i, C in enumerate(Cs):
if solver == "lbfgs":
l2_reg_strength = 1.0 / C
l2_reg_strength = 1.0 / (C * sw_sum)
iprint = [-1, 50, 1, 100, 101][
np.searchsorted(np.array([0, 1, 2, 3]), verbose)
]
@@ -467,7 +463,13 @@
method="L-BFGS-B",
jac=True,
args=(X, target, sample_weight, l2_reg_strength, n_threads),
options={"iprint": iprint, "gtol": tol, "maxiter": max_iter},
options={
"maxiter": max_iter,
"maxls": 50, # default is 20
"iprint": iprint,
"gtol": tol,
"ftol": 64 * np.finfo(float).eps,
},
)
n_iter_i = _check_optimize_result(
solver,
@@ -477,15 +479,13 @@
)
w0, loss = opt_res.x, opt_res.fun
elif solver == "newton-cg":
l2_reg_strength = 1.0 / C
l2_reg_strength = 1.0 / (C * sw_sum)
args = (X, target, sample_weight, l2_reg_strength, n_threads)
w0, n_iter_i = _newton_cg(
hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol
)
elif solver == "newton-cholesky":
# The division by sw_sum is a consequence of the rescaling of
# sample_weight, see comment above.
l2_reg_strength = 1.0 / C / sw_sum
l2_reg_strength = 1.0 / (C * sw_sum)
sol = NewtonCholeskySolver(
coef=w0,
linear_loss=loss,
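The choice `l2_reg_strength = 1.0 / (C * sw_sum)` follows directly from the IMPORTANT NOTE above: dividing the legacy objective `C * sum(sample_weight * pointwise_loss) + 1/2 * ||coef||^2` by the positive constant `C * sw_sum` gives exactly the form that `LinearModelLoss` minimizes, without changing the minimizer. A quick numeric check with made-up numbers (a sketch, not scikit-learn code):

```python
import numpy as np

rng = np.random.default_rng(0)
pointwise_loss = rng.uniform(size=20)            # made-up per-sample losses
sample_weight = rng.uniform(0.5, 2.0, size=20)   # made-up sample weights
w = rng.normal(size=4)
C = 0.5
penalty = 0.5 * w @ w
sw_sum = sample_weight.sum()

legacy = C * np.sum(sample_weight * pointwise_loss) + penalty
linear_model_loss_form = (
    np.average(pointwise_loss, weights=sample_weight)
    + (1.0 / (C * sw_sum)) * penalty
)
print(np.isclose(legacy / (C * sw_sum), linear_model_loss_form))  # True
```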
2 changes: 1 addition & 1 deletion sklearn/linear_model/tests/test_common.py
@@ -59,7 +59,7 @@
),
marks=pytest.mark.xfail(reason="Missing importance sampling scheme"),
),
LogisticRegressionCV(),
LogisticRegressionCV(tol=1e-6),
MultiTaskElasticNet(),
MultiTaskElasticNetCV(),
MultiTaskLasso(),