From fc0f0e75f94379b074f64447731825a7d3c928df Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 25 Feb 2023 12:45:23 +0100 Subject: [PATCH 01/20] WIP to be continued --- sklearn/linear_model/_glm/glm.py | 29 ++++++----- sklearn/linear_model/_linear_loss.py | 43 +++++++++++------ sklearn/linear_model/_logistic.py | 53 ++++++++++----------- sklearn/linear_model/tests/test_logistic.py | 52 +++++++++++++++++--- 4 files changed, 115 insertions(+), 62 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 3dc0bbdc66bff..4cac889a4da51 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -207,10 +207,10 @@ def fit(self, X, y, sample_weight=None): loss_dtype = min(max(y.dtype, X.dtype), np.float64) y = check_array(y, dtype=loss_dtype, order="C", ensure_2d=False) - # TODO: We could support samples_weight=None as the losses support it. - # Note that _check_sample_weight calls check_array(order="C") required by - # losses. - sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype) + if sample_weight is not None: + # Note that _check_sample_weight calls check_array(order="C") required by + # losses. + sample_weight = _check_sample_weight(sample_weight, X, dtype=loss_dtype) n_samples, n_features = X.shape self._base_loss = self._get_loss() @@ -228,17 +228,20 @@ def fit(self, X, y, sample_weight=None): # TODO: if alpha=0 check that X is not rank deficient - # IMPORTANT NOTE: Rescaling of sample_weight: + # NOTE: Rescaling of sample_weight: # We want to minimize - # obj = 1/(2*sum(sample_weight)) * sum(sample_weight * deviance) + # obj = 1/(2 * sum(sample_weight)) * sum(sample_weight * deviance) # + 1/2 * alpha * L2, # with # deviance = 2 * loss. # The objective is invariant to multiplying sample_weight by a constant. We - # choose this constant such that sum(sample_weight) = 1. Thus, we end up with + # could choose this constant such that sum(sample_weight) = 1 in order to end + # up with # obj = sum(sample_weight * loss) + 1/2 * alpha * L2. - # Note that LinearModelLoss.loss() computes sum(sample_weight * loss). - sample_weight = sample_weight / sample_weight.sum() + # But LinearModelLoss.loss() already computes + # average(loss, weights=sample_weight) + # Thus, without rescaling, we have + # obj = LinearModelLoss.loss(...) if self.warm_start and hasattr(self, "coef_"): if self.fit_intercept: @@ -415,10 +418,10 @@ def score(self, X, y, sample_weight=None): f" {base_loss.__name__}." ) - # Note that constant_to_optimal_zero is already multiplied by sample_weight. - constant = np.mean(base_loss.constant_to_optimal_zero(y_true=y)) - if sample_weight is not None: - constant *= sample_weight.shape[0] / np.sum(sample_weight) + constant = np.average( + base_loss.constant_to_optimal_zero(y_true=y, sample_weight=None), + weights=sample_weight, + ) # Missing factor of 2 in deviance cancels out. deviance = base_loss( diff --git a/sklearn/linear_model/_linear_loss.py b/sklearn/linear_model/_linear_loss.py index 92a203abc87ab..4255706e284f1 100644 --- a/sklearn/linear_model/_linear_loss.py +++ b/sklearn/linear_model/_linear_loss.py @@ -12,18 +12,19 @@ class LinearModelLoss: Note that raw_prediction is also known as linear predictor. 
- The loss is the sum of per sample losses and includes a term for L2 + The loss is the average of per sample losses and includes a term for L2 regularization:: - loss = sum_i s_i loss(y_i, X_i @ coef + intercept) + loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept) + 1/2 * l2_reg_strength * ||coef||_2^2 - with sample weights s_i=1 if sample_weight=None. + with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i. Gradient and hessian, for simplicity without intercept, are:: - gradient = X.T @ loss.gradient + l2_reg_strength * coef - hessian = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity + gradient = 1 / s_sum * X.T @ loss.gradient + l2_reg_strength * coef + hessian = 1 / s_sum * X.T @ diag(loss.hessian) @ X + + l2_reg_strength * identity Conventions: if fit_intercept: @@ -182,7 +183,7 @@ def loss( n_threads=1, raw_prediction=None, ): - """Compute the loss as sum over point-wise losses. + """Compute the loss as weighted average over point-wise losses. Parameters ---------- @@ -209,7 +210,7 @@ def loss( Returns ------- loss : float - Sum of losses per sample plus penalty. + Weighted average of losses per sample, plus penalty. """ if raw_prediction is None: weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) @@ -219,10 +220,10 @@ def loss( loss = self.base_loss.loss( y_true=y, raw_prediction=raw_prediction, - sample_weight=sample_weight, + sample_weight=None, n_threads=n_threads, ) - loss = loss.sum() + loss = np.average(loss, weights=sample_weight) return loss + self.l2_penalty(weights, l2_reg_strength) @@ -263,12 +264,12 @@ def loss_gradient( Returns ------- loss : float - Sum of losses per sample plus penalty. + Weighted average of losses per sample, plus penalty. gradient : ndarray of shape coef.shape The gradient of the loss. """ - n_features, n_classes = X.shape[1], self.base_loss.n_classes + (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes n_dof = n_features + int(self.fit_intercept) if raw_prediction is None: @@ -282,9 +283,12 @@ def loss_gradient( sample_weight=sample_weight, n_threads=n_threads, ) - loss = loss.sum() + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + loss = loss.sum() / sw_sum loss += self.l2_penalty(weights, l2_reg_strength) + grad_pointwise /= sw_sum + if not self.base_loss.is_multiclass: grad = np.empty_like(coef, dtype=weights.dtype) grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights @@ -340,7 +344,7 @@ def gradient( gradient : ndarray of shape coef.shape The gradient of the loss. """ - n_features, n_classes = X.shape[1], self.base_loss.n_classes + (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes n_dof = n_features + int(self.fit_intercept) if raw_prediction is None: @@ -354,6 +358,8 @@ def gradient( sample_weight=sample_weight, n_threads=n_threads, ) + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + grad_pointwise /= sw_sum if not self.base_loss.is_multiclass: grad = np.empty_like(coef, dtype=weights.dtype) @@ -439,6 +445,9 @@ def gradient_hessian( sample_weight=sample_weight, n_threads=n_threads, ) + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + grad_pointwise /= sw_sum + hess_pointwise /= sw_sum # For non-canonical link functions and far away from the optimum, the pointwise # hessian can be negative. 
We take care that 75% of the hessian entries are @@ -543,6 +552,7 @@ def gradient_hessian_product( (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes n_dof = n_features + int(self.fit_intercept) weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X) + sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) if not self.base_loss.is_multiclass: grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian( @@ -551,6 +561,8 @@ def gradient_hessian_product( sample_weight=sample_weight, n_threads=n_threads, ) + grad_pointwise /= sw_sum + hess_pointwise /= sw_sum grad = np.empty_like(coef, dtype=weights.dtype) grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights if self.fit_intercept: @@ -603,6 +615,7 @@ def hessp(s): sample_weight=sample_weight, n_threads=n_threads, ) + grad_pointwise /= sw_sum grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights if self.fit_intercept: @@ -644,9 +657,9 @@ def hessp(s): # hess_prod = empty_like(grad), but we ravel grad below and this # function is run after that. hess_prod = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F") - hess_prod[:, :n_features] = tmp.T @ X + l2_reg_strength * s + hess_prod[:, :n_features] = (tmp.T @ X) / sw_sum + l2_reg_strength * s if self.fit_intercept: - hess_prod[:, -1] = tmp.sum(axis=0) + hess_prod[:, -1] = tmp.sum(axis=0) / sw_sum if coef.ndim == 1: return hess_prod.ravel(order="F") else: diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 6bdc4b7368ef0..11c08c115ba2d 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -292,33 +292,27 @@ def _logistic_regression_path( # np.unique(y) gives labels in sorted order. pos_class = classes[1] - # If sample weights exist, convert them to array (support for lists) - # and check length - # Otherwise set them to 1 for all examples - sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True) - - if solver == "newton-cholesky": - # IMPORTANT NOTE: Rescaling of sample_weight: - # Same as in _GeneralizedLinearRegressor.fit(). - # We want to minimize - # obj = 1/(2*sum(sample_weight)) * sum(sample_weight * deviance) - # + 1/2 * alpha * L2, - # with - # deviance = 2 * log_loss. - # The objective is invariant to multiplying sample_weight by a constant. We - # choose this constant such that sum(sample_weight) = 1. Thus, we end up with - # obj = sum(sample_weight * loss) + 1/2 * alpha * L2. - # Note that LinearModelLoss.loss() computes sum(sample_weight * loss). - # - # This rescaling has to be done before multiplying by class_weights. - sw_sum = sample_weight.sum() # needed to rescale penalty, nasty matter! - sample_weight = sample_weight / sw_sum + if sample_weight is not None or class_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True) + # IMPORTANT NOTE: + # All solvers relying on LinearModelLoss need to scale the penalty with n_samples + # or the sum of sample weights as the here implemented logistic regression + # objective is (unfortunately) + # C * sum(pointwise_loss) + penalty + # instead of (as LinearModelLoss does) + # mean(pointwise_loss) + 1/C * penalty + if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + # This needs to be calculated before sample_weight is multiplied by + # class_weight. 
+ sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) # If class_weights is a dict (provided by the user), the weights # are assigned to the original labels. If it is "balanced", then # the class_weights are assigned after masking the labels with a OvR. le = LabelEncoder() - if isinstance(class_weight, dict) or multi_class == "multinomial": + if isinstance(class_weight, dict) or ( + multi_class == "multinomial" and class_weight is not None + ): class_weight_ = compute_class_weight(class_weight, classes=classes, y=y) sample_weight *= class_weight_[le.fit_transform(y)] @@ -445,7 +439,7 @@ def _logistic_regression_path( n_iter = np.zeros(len(Cs), dtype=np.int32) for i, C in enumerate(Cs): if solver == "lbfgs": - l2_reg_strength = 1.0 / C + l2_reg_strength = 1.0 / (C * sw_sum) iprint = [-1, 50, 1, 100, 101][ np.searchsorted(np.array([0, 1, 2, 3]), verbose) ] @@ -455,7 +449,12 @@ def _logistic_regression_path( method="L-BFGS-B", jac=True, args=(X, target, sample_weight, l2_reg_strength, n_threads), - options={"iprint": iprint, "gtol": tol, "maxiter": max_iter}, + options={ + "iprint": iprint, + "gtol": tol, + "maxiter": max_iter, + "ftol": 64 * np.finfo(float).eps, + }, ) n_iter_i = _check_optimize_result( solver, @@ -465,15 +464,13 @@ def _logistic_regression_path( ) w0, loss = opt_res.x, opt_res.fun elif solver == "newton-cg": - l2_reg_strength = 1.0 / C + l2_reg_strength = 1.0 / (C * sw_sum) args = (X, target, sample_weight, l2_reg_strength, n_threads) w0, n_iter_i = _newton_cg( hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol ) elif solver == "newton-cholesky": - # The division by sw_sum is a consequence of the rescaling of - # sample_weight, see comment above. - l2_reg_strength = 1.0 / C / sw_sum + l2_reg_strength = 1.0 / (C * sw_sum) sol = NewtonCholeskySolver( coef=w0, linear_loss=loss, diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index 85f5c2d52b745..9dc4ce691fb61 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -702,14 +702,17 @@ def test_logistic_regression_solvers_multiclass(): } for solver_1, solver_2 in itertools.combinations(regressors, r=2): - assert_array_almost_equal( - regressors[solver_1].coef_, regressors[solver_2].coef_, decimal=4 + assert_allclose( + regressors[solver_1].coef_, + regressors[solver_2].coef_, + rtol=5e-3 if solver_2 == "saga" else 1e-3, + err_msg=f"{solver_1} vs {solver_2}", ) @pytest.mark.parametrize("weight", [{0: 0.1, 1: 0.2}, {0: 0.1, 1: 0.2, 2: 0.5}]) @pytest.mark.parametrize("class_weight", ["weight", "balanced"]) -def test_logistic_regressioncv_class_weights(weight, class_weight): +def test_logistic_regressioncv_class_weights(weight, class_weight, global_random_seed): """Test class_weight for LogisticRegressionCV.""" n_classes = len(weight) if class_weight == "weight": @@ -722,23 +725,60 @@ def test_logistic_regressioncv_class_weights(weight, class_weight): n_informative=3, n_redundant=0, n_classes=n_classes, - random_state=0, + random_state=global_random_seed, ) params = dict( Cs=1, fit_intercept=False, multi_class="ovr", class_weight=class_weight, + tol=1e-8, ) clf_lbfgs = LogisticRegressionCV(solver="lbfgs", **params) clf_lbfgs.fit(X, y) + from sklearn.linear_model._linear_loss import LinearModelLoss + from sklearn._loss.loss import HalfMultinomialLoss, HalfBinomialLoss + + if n_classes > 2: + loss = LinearModelLoss( + base_loss=HalfMultinomialLoss(n_classes=n_classes), + fit_intercept=False, + ) + 
else: + loss = LinearModelLoss( + base_loss=HalfBinomialLoss(), + fit_intercept=False, + ) + l_lbfgs = loss.loss( + coef=clf_lbfgs.coef_.squeeze(), + X=X, + y=LabelEncoder().fit_transform(y).astype(float), + sample_weight=None, + l2_reg_strength=1 / 20, + ) + print(f"loss lbfgs = {l_lbfgs} C_={clf_lbfgs.C_}") + for solver in set(SOLVERS) - set(["lbfgs"]): clf = LogisticRegressionCV(solver=solver, **params) if solver in ("sag", "saga"): - clf.set_params(tol=1e-5, max_iter=10000, random_state=0) + clf.set_params( + tol=1e-18, max_iter=10000, random_state=global_random_seed + 1 + ) clf.fit(X, y) - assert_allclose(clf.coef_, clf_lbfgs.coef_, rtol=1e-3) + + l_solver = loss.loss( + coef=clf.coef_.squeeze(), + X=X, + y=LabelEncoder().fit_transform(y).astype(float), + sample_weight=None, + l2_reg_strength=1 / 20, + ) + print(f"loss {solver} = {l_solver} C_={clf.C_}") + + assert_allclose( + clf.coef_, clf_lbfgs.coef_, rtol=1e-3, err_msg=f"{solver} vs lbfgs" + ) def test_logistic_regression_sample_weights(): From e1c21282663748db2f3d5be2a766224fad59761c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 15 Sep 2023 20:24:35 +0200 Subject: [PATCH 02/20] ENH improve line search of newton_cg for tiny loss improvements --- sklearn/utils/optimize.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index 68a1ae1dddb98..97d7ddf1169b1 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -16,6 +16,7 @@ import warnings import numpy as np +import scipy from ..exceptions import ConvergenceWarning from .fixes import line_search_wolfe1, line_search_wolfe2 @@ -39,6 +40,32 @@ def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwarg """ ret = line_search_wolfe1(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs) + if ret[0] is None: + # Have a look at the line_search method of our NewtonSolver class. We borrow + # the logic from there + # Deal with relative loss differences around machine precision. + args = kwargs.get("args", tuple()) + fval = f(xk + pk, *args) + eps = 16 * np.finfo(np.asarray(old_fval).dtype).eps + tiny_loss = np.abs(old_fval * eps) + loss_improvement = fval - old_fval + check = np.abs(loss_improvement) <= tiny_loss + if check: + # 2.1 Check sum of absolute gradients as alternative condition. + sum_abs_grad_old = scipy.linalg.norm(gfk, ord=1) + grad = fprime(xk + pk, *args) + sum_abs_grad = scipy.linalg.norm(grad, ord=1) + check = sum_abs_grad < sum_abs_grad_old + if check: + ret = ( + 1.0, # step size + ret[1] + 1, # number of function evaluations + ret[2] + 1, # number of gradient evaluations + fval, + old_fval, + grad, + ) + if ret[0] is None: # line search failed: try different one. 
ret = line_search_wolfe2( From ead068f0c6867fc9f0550147f52c727e8a7a35ae Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 15 Sep 2023 20:25:55 +0200 Subject: [PATCH 03/20] ENH sample weight rescaling after class weights --- sklearn/linear_model/_logistic.py | 24 ++++++----- sklearn/linear_model/tests/test_logistic.py | 45 ++++----------------- 2 files changed, 20 insertions(+), 49 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index cc2cd25a72156..c0badd03c078e 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -306,17 +306,6 @@ def _logistic_regression_path( if sample_weight is not None or class_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True) - # IMPORTANT NOTE: - # All solvers relying on LinearModelLoss need to scale the penalty with n_samples - # or the sum of sample weights as the here implemented logistic regression - # objective is (unfortunately) - # C * sum(pointwise_loss) + penalty - # instead of (as LinearModelLoss does) - # mean(pointwise_loss) + 1/C * penalty - if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: - # This needs to be calculated before sample_weight is multiplied by - # class_weight. - sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) # If class_weights is a dict (provided by the user), the weights # are assigned to the original labels. If it is "balanced", then @@ -369,6 +358,19 @@ def _logistic_regression_path( (classes.size, n_features + int(fit_intercept)), order="F", dtype=X.dtype ) + # IMPORTANT NOTE: + # All solvers relying on LinearModelLoss need to scale the penalty with n_samples + # or the sum of sample weights because the implemented logistic regression + # objective here is (unfortunately) + # C * sum(pointwise_loss) + penalty + # instead of (as LinearModelLoss does) + # mean(pointwise_loss) + 1/C * penalty + if solver in ["lbfgs", "newton-cg", "newton-cholesky"]: + # This needs to be calculated after sample_weight is multiplied by + # class_weight. It is even tested that passing class_weight is equivalent to + # passing sample_weights according to class_weight. 
+ sw_sum = n_samples if sample_weight is None else np.sum(sample_weight) + if coef is not None: # it must work both giving the bias term and not if multi_class == "ovr": diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index dc7bba2518b20..8cdab6f041307 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -754,28 +754,6 @@ def test_logistic_regressioncv_class_weights(weight, class_weight, global_random clf_lbfgs = LogisticRegressionCV(solver="lbfgs", **params) clf_lbfgs.fit(X, y) - from sklearn.linear_model._linear_loss import LinearModelLoss - from sklearn._loss.loss import HalfMultinomialLoss, HalfBinomialLoss - - if n_classes > 2: - loss = LinearModelLoss( - base_loss=HalfMultinomialLoss(n_classes=n_classes), - fit_intercept=False, - ) - else: - loss = LinearModelLoss( - base_loss=HalfBinomialLoss(), - fit_intercept=False, - ) - l_lbfgs = loss.loss( - coef=clf_lbfgs.coef_.squeeze(), - X=X, - y=LabelEncoder().fit_transform(y).astype(float), - sample_weight=None, - l2_reg_strength=1 / 20, - ) - print(f"loss lbfgs = {l_lbfgs} C_={clf_lbfgs.C_}") - for solver in set(SOLVERS) - set(["lbfgs"]): clf = LogisticRegressionCV(solver=solver, **params) if solver in ("sag", "saga"): @@ -784,15 +762,6 @@ def test_logistic_regressioncv_class_weights(weight, class_weight, global_random ) clf.fit(X, y) - l_solver = loss.loss( - coef=clf.coef_.squeeze(), - X=X, - y=LabelEncoder().fit_transform(y).astype(float), - sample_weight=None, - l2_reg_strength=1 / 20, - ) - print(f"loss {solver} = {l_solver} C_={clf.C_}") - assert_allclose( clf.coef_, clf_lbfgs.coef_, rtol=1e-3, err_msg=f"{solver} vs lbfgs" ) @@ -820,7 +789,7 @@ def test_logistic_regression_sample_weights(): # Test that sample weights work the same with the lbfgs, # newton-cg, newton-cholesky and 'sag' solvers - clf_sw_lbfgs = LR(**kw) + clf_sw_lbfgs = LR(**kw, tol=1e-5) clf_sw_lbfgs.fit(X, y, sample_weight=sample_weight) for solver in set(SOLVERS) - set(("lbfgs", "saga")): clf_sw = LR(solver=solver, tol=1e-10 if solver == "sag" else 1e-5, **kw) @@ -946,9 +915,9 @@ def test_logistic_regression_multinomial(): # 'lbfgs' is used as a referenced solver = "lbfgs" - ref_i = LogisticRegression(solver=solver, multi_class="multinomial") + ref_i = LogisticRegression(solver=solver, multi_class="multinomial", tol=1e-6) ref_w = LogisticRegression( - solver=solver, multi_class="multinomial", fit_intercept=False + solver=solver, multi_class="multinomial", fit_intercept=False, tol=1e-6 ) ref_i.fit(X, y) ref_w.fit(X, y) @@ -976,9 +945,9 @@ def test_logistic_regression_multinomial(): assert clf_w.coef_.shape == (n_classes, n_features) # Compare solutions between lbfgs and the other solvers - assert_allclose(ref_i.coef_, clf_i.coef_, rtol=1e-2) + assert_allclose(ref_i.coef_, clf_i.coef_, rtol=1e-3) assert_allclose(ref_w.coef_, clf_w.coef_, rtol=1e-2) - assert_allclose(ref_i.intercept_, clf_i.intercept_, rtol=1e-2) + assert_allclose(ref_i.intercept_, clf_i.intercept_, rtol=1e-3) # Test that the path give almost the same results. 
However since in this # case we take the average of the coefs after fitting across all the @@ -988,8 +957,8 @@ def test_logistic_regression_multinomial(): solver=solver, max_iter=2000, tol=1e-6, multi_class="multinomial", Cs=[1.0] ) clf_path.fit(X, y) - assert_allclose(clf_path.coef_, ref_i.coef_, rtol=2e-2) - assert_allclose(clf_path.intercept_, ref_i.intercept_, rtol=2e-2) + assert_allclose(clf_path.coef_, ref_i.coef_, rtol=1e-2) + assert_allclose(clf_path.intercept_, ref_i.intercept_, rtol=1e-2) def test_liblinear_decision_function_zero(): From fdc0fa36b6924b7f721a0cc94892b666f01da634 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 18 Sep 2023 20:05:56 +0200 Subject: [PATCH 04/20] ENH fix curvature condition in CG --- sklearn/utils/optimize.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index 97d7ddf1169b1..b6a319bc49953 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -108,6 +108,8 @@ def _cg(fhess_p, fgrad, maxiter, tol): psupi = -ri i = 0 dri0 = np.dot(ri, ri) + # We also track of |p_i|^2. + psupi_norm2 = dri0 while i <= maxiter: if np.sum(np.abs(ri)) <= tol: @@ -116,7 +118,8 @@ def _cg(fhess_p, fgrad, maxiter, tol): Ap = fhess_p(psupi) # check curvature curv = np.dot(psupi, Ap) - if 0 <= curv <= 3 * np.finfo(np.float64).eps: + if 0 <= curv <= 16 * np.finfo(np.float64).eps * psupi_norm2: + # See https://arxiv.org/abs/1803.02924, Algo 1 Capped Conjugate Gradient. break elif curv < 0: if i > 0: @@ -131,6 +134,8 @@ def _cg(fhess_p, fgrad, maxiter, tol): dri1 = np.dot(ri, ri) betai = dri1 / dri0 psupi = -ri + betai * psupi + # We use |p_i|^2 = |r_i|^2 + beta_i^2 |p_{i-1}|^2 + psupi_norm2 = dri1 + betai**2 * psupi_norm2 i = i + 1 dri0 = dri1 # update np.dot(ri,ri) for next time. 
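For illustration, a minimal standalone sketch of the inner conjugate gradient loop with the relative curvature guard introduced in the patch above. The helper name and the toy quadratic are made up; the real implementation is `_cg` in sklearn/utils/optimize.py, which additionally handles negative curvature by returning the best iterate found so far::

    import numpy as np

    def cg_newton_step(hess_prod, grad, maxiter=100, tol=1e-10):
        """Approximately solve H @ x = -grad by conjugate gradient.

        The old absolute threshold ``0 <= curv <= 3 * eps`` depends on the
        scale of the problem; ``curv <= 16 * eps * ||p_i||^2`` does not,
        because curv = p_i^T H p_i grows quadratically with ||p_i||.
        """
        x = np.zeros_like(grad)
        r = grad.copy()      # residual of H @ x + grad = 0
        p = -r               # search direction
        rr = r @ r
        p_norm2 = rr         # ||p_0||^2 == ||r_0||^2
        eps = np.finfo(np.float64).eps
        for _ in range(maxiter):
            if np.sum(np.abs(r)) <= tol:
                break
            Hp = hess_prod(p)
            curv = p @ Hp
            if curv <= 16 * eps * p_norm2:
                # numerically zero (or negative) curvature: stop, keep x
                break
            alpha = rr / curv
            x += alpha * p
            r += alpha * Hp
            rr_new = r @ r
            beta = rr_new / rr
            p = -r + beta * p
            # ||p_i||^2 = ||r_i||^2 + beta_i^2 * ||p_{i-1}||^2 because r_i is
            # orthogonal to the previous search direction
            p_norm2 = rr_new + beta**2 * p_norm2
            rr = rr_new
        return x

    # toy usage: Newton step for f(x) = 0.5 * x^T A x + b^T x
    A = np.array([[3.0, 1.0], [1.0, 2.0]])
    b = np.array([1.0, -1.0])
    print(cg_newton_step(lambda v: A @ v, b), np.linalg.solve(A, -b))

Tracking ``p_norm2`` through the recursion avoids recomputing the norm of the search direction at every iteration, which is the same trick the patch adds to ``_cg``.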
From ef268133b75d77cbb0c4e98e20cf594c9ba5885e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 18 Sep 2023 23:57:50 +0200 Subject: [PATCH 05/20] ENH add verbose to _newton_cg --- sklearn/linear_model/_logistic.py | 11 +++++++-- sklearn/utils/optimize.py | 41 ++++++++++++++++++++++++++----- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index c0badd03c078e..0fe6ccd241558 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -15,9 +15,9 @@ from numbers import Integral, Real import numpy as np -from joblib import effective_n_jobs from scipy import optimize +from joblib import effective_n_jobs from sklearn.metrics import get_scorer_names from .._loss.loss import HalfBinomialLoss, HalfMultinomialLoss @@ -481,7 +481,14 @@ def _logistic_regression_path( l2_reg_strength = 1.0 / (C * sw_sum) args = (X, target, sample_weight, l2_reg_strength, n_threads) w0, n_iter_i = _newton_cg( - hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol + grad_hess=hess, + func=func, + grad=grad, + x0=w0, + args=args, + maxiter=max_iter, + tol=tol, + verbose=verbose, ) elif solver == "newton-cholesky": l2_reg_strength = 1.0 / (C * sw_sum) diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index b6a319bc49953..a222978512112 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -26,7 +26,9 @@ class _LineSearchError(RuntimeError): pass -def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs): +def _line_search_wolfe12( + f, fprime, xk, pk, gfk, old_fval, old_old_fval, verbose=0, **kwargs +): """ Same as line_search_wolfe1, but fall back to line_search_wolfe2 if suitable step length is not found, and raise an exception if a @@ -57,6 +59,9 @@ def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwarg sum_abs_grad = scipy.linalg.norm(grad, ord=1) check = sum_abs_grad < sum_abs_grad_old if check: + if verbose >= 2: + print(" newton_cg line search detected tiny loss improvement.") + print(f" {loss_improvement=} {sum_abs_grad=}") ret = ( 1.0, # step size ret[1] + 1, # number of function evaluations @@ -78,7 +83,7 @@ def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwarg return ret -def _cg(fhess_p, fgrad, maxiter, tol): +def _cg(fhess_p, fgrad, maxiter, tol, verbose=0): """ Solve iteratively the linear system 'fhess_p . xsupi = fgrad' with a conjugate gradient descent. @@ -113,6 +118,10 @@ def _cg(fhess_p, fgrad, maxiter, tol): while i <= maxiter: if np.sum(np.abs(ri)) <= tol: + if verbose >= 2: + print( + f" inner solver iteration {i} stopped with {np.sum(np.abs(ri))=}" + ) break Ap = fhess_p(psupi) @@ -120,6 +129,8 @@ def _cg(fhess_p, fgrad, maxiter, tol): curv = np.dot(psupi, Ap) if 0 <= curv <= 16 * np.finfo(np.float64).eps * psupi_norm2: # See https://arxiv.org/abs/1803.02924, Algo 1 Capped Conjugate Gradient. + if verbose >= 2: + print(f" inner solver iteration {i} stopped with {curv=}") break elif curv < 0: if i > 0: @@ -138,7 +149,11 @@ def _cg(fhess_p, fgrad, maxiter, tol): psupi_norm2 = dri1 + betai**2 * psupi_norm2 i = i + 1 dri0 = dri1 # update np.dot(ri,ri) for next time. 
- + if i > maxiter and verbose >= 2: + print( + f" newton_cg iterative solver stopped with maxiter={i - 1} and " + f"{np.sum(np.abs(ri))=}" + ) return xsupi @@ -153,6 +168,7 @@ def _newton_cg( maxinner=200, line_search=True, warn=True, + verbose=0, ): """ Minimization of scalar function of one or more variables using the @@ -206,6 +222,8 @@ def _newton_cg( if line_search: old_fval = func(x0, *args) old_old_fval = None + else: + old_fval = 0 # Outer loop: our Newton iteration while k < maxiter: @@ -214,7 +232,10 @@ def _newton_cg( fgrad, fhess_p = grad_hess(xk, *args) absgrad = np.abs(fgrad) - if np.max(absgrad) <= tol: + max_absgrad = np.max(absgrad) + if verbose > 0: + print(f"newton_cg iter = {k} loss = {old_fval} max|grad| = {max_absgrad}") + if max_absgrad <= tol: break maggrad = np.sum(absgrad) @@ -223,14 +244,22 @@ def _newton_cg( # Inner loop: solve the Newton update by conjugate gradient, to # avoid inverting the Hessian - xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond) + xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond, verbose=verbose) alphak = 1.0 if line_search: try: alphak, fc, gc, old_fval, old_old_fval, gfkp1 = _line_search_wolfe12( - func, grad, xk, xsupi, fgrad, old_fval, old_old_fval, args=args + func, + grad, + xk, + xsupi, + fgrad, + old_fval, + old_old_fval, + verbose=verbose, + args=args, ) except _LineSearchError: warnings.warn("Line Search failed") From b1aae34a113d0213dafdc8f6bce8d55323ba8279 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 14:17:38 +0200 Subject: [PATCH 06/20] DOC add whatsnew --- doc/whats_new/v1.4.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 589f2b3e1dd5e..464e7c7cf8cab 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -196,6 +196,21 @@ Changelog :class:`scipy.sparse.sparray` subclasses. :pr:`27301` by :user:`Lohit SundaramahaLingam `. +:mod:`sklearn.linear_model` +................................... + +- |Efficiency| :class:`linear_model.LogisticRegression` and + :class:`linear_model.LogisticRegressionCV` now have much better convergence for + solvers `"lbfgs"` and `"newton-cg"`. Both solvers can now reach much higher precision + for the coefficients depending on the specified `tol`. Additionally, lbfgs can + make better use of `tol`, i.e., stop sooner or reach higher precision, and newton-cg + is now faster than before. + This change also means that with this new version of scikit-learn, the resulting + coefficients `coef_` and `intercept_` of your models will change for these two + solvers (when fit on the same data again). The amount of change depends on the + specified `tol`, for small values you will get more precise results. + :pr:`26721` by :user:`Christian Lorentzen `. + :mod:`sklearn.metrics` ...................... From fc96f0d000f8521ef419daae896fb049aba31287 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 14:26:52 +0200 Subject: [PATCH 07/20] DOC add changed models entry --- doc/whats_new/v1.4.rst | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 464e7c7cf8cab..9b50a5877dbbd 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -19,6 +19,23 @@ parameters, may produce different models from the previous version. This often occurs due to changes in the modelling logic (bug fixes or enhancements), or in random sampling procedures. 
+- |Efficiency| :class:`linear_model.LogisticRegression` and + :class:`linear_model.LogisticRegressionCV` now have much better convergence for + solvers `"lbfgs"` and `"newton-cg"`. Both solvers can now reach much higher precision + for the coefficients depending on the specified `tol`. Additionally, lbfgs can + make better use of `tol`, i.e., stop sooner or reach higher precision, and newton-cg + is now faster than before. + :pr:`26721` by :user:`Christian Lorentzen `. + + .. note:: + + The lbfgs is the default solver, so this change might effect many models. + + This change also means that with this new version of scikit-learn, the resulting + coefficients `coef_` and `intercept_` of your models will change for these two + solvers (when fit on the same data again). The amount of change depends on the + specified `tol`, for small values you will get more precise results. + Changes impacting all modules ----------------------------- @@ -205,12 +222,15 @@ Changelog for the coefficients depending on the specified `tol`. Additionally, lbfgs can make better use of `tol`, i.e., stop sooner or reach higher precision, and newton-cg is now faster than before. - This change also means that with this new version of scikit-learn, the resulting - coefficients `coef_` and `intercept_` of your models will change for these two - solvers (when fit on the same data again). The amount of change depends on the - specified `tol`, for small values you will get more precise results. :pr:`26721` by :user:`Christian Lorentzen `. + .. note:: + + This change also means that with this new version of scikit-learn, the resulting + coefficients `coef_` and `intercept_` of your models will change for these two + solvers (when fit on the same data again). The amount of change depends on the + specified `tol`, for small values you will get more precise results. + :mod:`sklearn.metrics` ...................... 
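The changelog entries above rest on the fact that rescaling the objective does not move its minimizer: the historical formulation ``C * sum(pointwise_loss) + 0.5 * ||coef||^2`` and the new one ``mean(pointwise_loss) + 1/(C * sw_sum) * 0.5 * ||coef||^2`` differ only by the constant factor ``C * sw_sum``. A small numerical check of this equivalence (the random data and the hand-written binomial loss below are made up for illustration; they are not the scikit-learn implementation)::

    import numpy as np
    from scipy.optimize import minimize

    rng = np.random.default_rng(0)
    X = rng.standard_normal((50, 3))
    y = (X @ np.array([1.0, -2.0, 0.5]) > 0).astype(float)
    C, sw_sum = 0.5, X.shape[0]  # unit sample weights, so sw_sum == n_samples

    def objective(w, loss_scale, l2_reg_strength):
        # binomial (log) loss and its gradient, scaled by `loss_scale`
        z = X @ w
        p = 1.0 / (1.0 + np.exp(-z))
        loss = np.sum(y * np.logaddexp(0.0, -z) + (1.0 - y) * np.logaddexp(0.0, z))
        grad = X.T @ (p - y)
        return (
            loss_scale * loss + 0.5 * l2_reg_strength * (w @ w),
            loss_scale * grad + l2_reg_strength * w,
        )

    opts = {"gtol": 1e-10, "ftol": 1e-14}
    # historical scaling: C * sum(loss) + 0.5 * ||w||^2
    w_old = minimize(objective, np.zeros(3), args=(C, 1.0), jac=True,
                     method="L-BFGS-B", options=opts).x
    # new scaling: mean(loss) + 1/(C * sw_sum) * 0.5 * ||w||^2
    w_new = minimize(objective, np.zeros(3), args=(1.0 / sw_sum, 1.0 / (C * sw_sum)),
                     jac=True, method="L-BFGS-B", options=opts).x
    print(np.max(np.abs(w_old - w_new)))  # ~0: identical coefficients

The practical benefit of the new scaling is that the gradient magnitude no longer grows with the number of samples, so a fixed ``tol`` has a comparable meaning across data sizes.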
From 6e559291dd7ab0e4ebdfb9f77d56c63379a8b942 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 14:44:12 +0200 Subject: [PATCH 08/20] CLN isort _logistic.py --- sklearn/linear_model/_logistic.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index e4fe81fd12eb6..9c7dd476e061c 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -26,30 +26,18 @@ from ..model_selection import check_cv from ..preprocessing import LabelBinarizer, LabelEncoder from ..svm._base import _fit_liblinear -from ..utils import ( - Bunch, - check_array, - check_consistent_length, - check_random_state, - compute_class_weight, -) +from ..utils import (Bunch, check_array, check_consistent_length, + check_random_state, compute_class_weight) from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import row_norms, softmax -from ..utils.metadata_routing import ( - MetadataRouter, - MethodMapping, - _raise_for_params, - _routing_enabled, - process_routing, -) +from ..utils.metadata_routing import (MetadataRouter, MethodMapping, + _raise_for_params, _routing_enabled, + process_routing) from ..utils.multiclass import check_classification_targets from ..utils.optimize import _check_optimize_result, _newton_cg from ..utils.parallel import Parallel, delayed -from ..utils.validation import ( - _check_method_params, - _check_sample_weight, - check_is_fitted, -) +from ..utils.validation import (_check_method_params, _check_sample_weight, + check_is_fitted) from ._base import BaseEstimator, LinearClassifierMixin, SparseCoefMixin from ._glm.glm import NewtonCholeskySolver from ._linear_loss import LinearModelLoss From 175ee5fa814b6a4f3fe21a81be382d2df17d6b2b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 15:06:57 +0200 Subject: [PATCH 09/20] CLN black _logistic.py --- sklearn/linear_model/_logistic.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 9c7dd476e061c..e4fe81fd12eb6 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -26,18 +26,30 @@ from ..model_selection import check_cv from ..preprocessing import LabelBinarizer, LabelEncoder from ..svm._base import _fit_liblinear -from ..utils import (Bunch, check_array, check_consistent_length, - check_random_state, compute_class_weight) +from ..utils import ( + Bunch, + check_array, + check_consistent_length, + check_random_state, + compute_class_weight, +) from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import row_norms, softmax -from ..utils.metadata_routing import (MetadataRouter, MethodMapping, - _raise_for_params, _routing_enabled, - process_routing) +from ..utils.metadata_routing import ( + MetadataRouter, + MethodMapping, + _raise_for_params, + _routing_enabled, + process_routing, +) from ..utils.multiclass import check_classification_targets from ..utils.optimize import _check_optimize_result, _newton_cg from ..utils.parallel import Parallel, delayed -from ..utils.validation import (_check_method_params, _check_sample_weight, - check_is_fitted) +from ..utils.validation import ( + _check_method_params, + _check_sample_weight, + check_is_fitted, +) from ._base import BaseEstimator, LinearClassifierMixin, SparseCoefMixin from ._glm.glm import NewtonCholeskySolver from 
._linear_loss import LinearModelLoss From 52d63d53cc2ec4350aac67b05028d2e386148ac7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 16:17:27 +0200 Subject: [PATCH 10/20] CI ruff --config pyproject.toml --- build_tools/linting.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/linting.sh b/build_tools/linting.sh index 28d16a8bbed32..cef174ea763c4 100755 --- a/build_tools/linting.sh +++ b/build_tools/linting.sh @@ -23,7 +23,7 @@ else fi echo -e "### Running ruff ###\n" -ruff check --show-source . +ruff check --show-source --config pyproject.toml . status=$? if [[ $status -eq 0 ]] then From 1a499f719ea7f709ab64b5af9a49d58f683f1b20 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 16:22:06 +0200 Subject: [PATCH 11/20] Revert "CI ruff --config pyproject.toml" This reverts commit 52d63d53cc2ec4350aac67b05028d2e386148ac7. --- build_tools/linting.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/linting.sh b/build_tools/linting.sh index cef174ea763c4..28d16a8bbed32 100755 --- a/build_tools/linting.sh +++ b/build_tools/linting.sh @@ -23,7 +23,7 @@ else fi echo -e "### Running ruff ###\n" -ruff check --show-source --config pyproject.toml . +ruff check --show-source . status=$? if [[ $status -eq 0 ]] then From ff424391713dfd648d197e1ac3255b261b01d152 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 21:58:36 +0200 Subject: [PATCH 12/20] TST fix doctest failures --- sklearn/feature_selection/_from_model.py | 2 +- sklearn/multioutput.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index dadca96d5df5f..fd20a30289fd5 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -211,7 +211,7 @@ class SelectFromModel( >>> y = [0, 1, 0, 1] >>> selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y) >>> selector.estimator_.coef_ - array([[-0.3252302 , 0.83462377, 0.49750423]]) + array([[-0.3252..., 0.8345..., 0.4976...]]) >>> selector.threshold_ 0.55245... >>> selector.get_support() diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 595156e10722a..5cf1eae96fd3b 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -888,7 +888,7 @@ class labels for each estimator in the chain. >>> chain.predict_proba(X_test) array([[0.8387..., 0.9431..., 0.4576...], [0.8878..., 0.3684..., 0.2640...], - [0.0321..., 0.9935..., 0.0625...]]) + [0.0321..., 0.9935..., 0.0626...]]) """ @_fit_context( From 97d7468da69abdacfd03157038d4737cae51b27d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 22:19:38 +0200 Subject: [PATCH 13/20] TST fix doctest failures 2nd try --- sklearn/feature_selection/_from_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index fd20a30289fd5..c44b2bcae7218 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -213,7 +213,7 @@ class SelectFromModel( >>> selector.estimator_.coef_ array([[-0.3252..., 0.8345..., 0.4976...]]) >>> selector.threshold_ - 0.55245... + 0.55249... 
>>> selector.get_support() array([False, True, False]) >>> selector.transform(X) From 433c2caa71b374a842a56328bb09c9ef9981af60 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Oct 2023 22:32:36 +0200 Subject: [PATCH 14/20] TST test_multinomial_loss in test_sag.py --- sklearn/linear_model/tests/test_sag.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/tests/test_sag.py b/sklearn/linear_model/tests/test_sag.py index 2208deaf55d8d..96f8a79726833 100644 --- a/sklearn/linear_model/tests/test_sag.py +++ b/sklearn/linear_model/tests/test_sag.py @@ -935,8 +935,7 @@ def test_multinomial_loss(): rng = check_random_state(42) weights = rng.randn(n_features, n_classes) intercept = rng.randn(n_classes) - sample_weights = rng.randn(n_samples) - np.abs(sample_weights, sample_weights) + sample_weights = np.abs(rng.randn(n_samples)) # compute loss and gradient like in multinomial SAG dataset, _ = make_dataset(X, y, sample_weights, random_state=42) @@ -953,6 +952,9 @@ def test_multinomial_loss(): weights_intercept, X, y, l2_reg_strength=0.0, sample_weight=sample_weights ) grad_2 = grad_2[:, :-1].T + # convert to same convention, i.e. LinearModelLoss uses average(loss, weight=sw) + loss_2 *= np.sum(sample_weights) + grad_2 *= np.sum(sample_weights) # comparison assert_array_almost_equal(grad_1, grad_2) @@ -987,6 +989,9 @@ def test_multinomial_loss_ground_truth(): weights_intercept, X, y, l2_reg_strength=0.0, sample_weight=sample_weights ) grad_2 = grad_2[:, :-1].T + # convert to same convention, i.e. LinearModelLoss uses average(loss, weight=sw) + loss_2 *= np.sum(sample_weights) + grad_2 *= np.sum(sample_weights) assert_almost_equal(loss_1, loss_2) assert_array_almost_equal(grad_1, grad_2) From fce33e6ba7adef2be27a5b92d681ff2b980db8fa Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 3 Oct 2023 18:31:12 +0200 Subject: [PATCH 15/20] apply pre-commit on _logistic.py --- sklearn/linear_model/_logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index e4fe81fd12eb6..08aa45a959765 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -15,9 +15,9 @@ from numbers import Integral, Real import numpy as np +from joblib import effective_n_jobs from scipy import optimize -from joblib import effective_n_jobs from sklearn.metrics import get_scorer_names from .._loss.loss import HalfBinomialLoss, HalfMultinomialLoss From 3a4b7b47613dc1e0c1c693d175e1081f357fee5c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 3 Oct 2023 19:39:07 +0200 Subject: [PATCH 16/20] ENH increase maxls in lbfgs like in GLMs This is needed for more reliable convergence. Tests like test_logistic_regressioncv_class_weights then don't raise a convergence error. 
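For reference, the L-BFGS-B options this series ends up passing to ``scipy.optimize.minimize`` can be exercised in isolation as follows. The quadratic objective is only a stand-in for ``LinearModelLoss.loss_gradient``; the names and values outside the ``options`` dict are made up::

    import numpy as np
    from scipy import optimize

    def func(w, l2_reg_strength):
        # stand-in returning (loss, gradient), like loss_gradient does
        loss = 0.5 * (1.0 + l2_reg_strength) * (w @ w)
        grad = (1.0 + l2_reg_strength) * w
        return loss, grad

    w0, tol, max_iter = np.ones(4), 1e-4, 100
    opt_res = optimize.minimize(
        func,
        w0,
        method="L-BFGS-B",
        jac=True,
        args=(1e-2,),
        options={
            "maxiter": max_iter,
            "maxls": 50,  # more line search steps than the default of 20
            "iprint": -1,  # silent
            "gtol": tol,   # stop on max|projected gradient| <= gtol
            "ftol": 64 * np.finfo(float).eps,  # effectively disable ftol
        },
    )
    print(opt_res.x, opt_res.nit, opt_res.status)

Raising ``maxls`` allows more line search trials per iteration before L-BFGS-B gives up, which the commit message above reports is needed so that tests such as test_logistic_regressioncv_class_weights converge without warnings once ``ftol`` is effectively disabled and convergence is driven by ``gtol`` alone.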
--- sklearn/linear_model/_logistic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 08aa45a959765..32022e031566c 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -464,9 +464,10 @@ def _logistic_regression_path( jac=True, args=(X, target, sample_weight, l2_reg_strength, n_threads), options={ + "maxiter": max_iter, + "maxls": 50, # default is 20 "iprint": iprint, "gtol": tol, - "maxiter": max_iter, "ftol": 64 * np.finfo(float).eps, }, ) From 573fea1ee76306e06030a34f0a19036bec8f0536 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 3 Oct 2023 20:55:33 +0200 Subject: [PATCH 17/20] TST increase tol of LogisticRegressionCV in test_balance_property --- sklearn/linear_model/tests/test_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_common.py b/sklearn/linear_model/tests/test_common.py index 201bac9927112..ff9d7aad146f3 100644 --- a/sklearn/linear_model/tests/test_common.py +++ b/sklearn/linear_model/tests/test_common.py @@ -59,7 +59,7 @@ ), marks=pytest.mark.xfail(reason="Missing importance sampling scheme"), ), - LogisticRegressionCV(), + LogisticRegressionCV(tol=1e-6), MultiTaskElasticNet(), MultiTaskElasticNetCV(), MultiTaskLasso(), From f32c90f9d279740e95852dd68f58068b1785a09e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 3 Oct 2023 22:43:55 +0200 Subject: [PATCH 18/20] Revert "ENH add verbose to _newton_cg" This reverts commit ef268133b75d77cbb0c4e98e20cf594c9ba5885e. --- sklearn/linear_model/_logistic.py | 9 +------ sklearn/utils/optimize.py | 41 +++++-------------------------- 2 files changed, 7 insertions(+), 43 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index f06651ec9b8d5..e6ac6ff087945 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -482,14 +482,7 @@ def _logistic_regression_path( l2_reg_strength = 1.0 / (C * sw_sum) args = (X, target, sample_weight, l2_reg_strength, n_threads) w0, n_iter_i = _newton_cg( - grad_hess=hess, - func=func, - grad=grad, - x0=w0, - args=args, - maxiter=max_iter, - tol=tol, - verbose=verbose, + hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol ) elif solver == "newton-cholesky": l2_reg_strength = 1.0 / (C * sw_sum) diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index 8987195480ec3..a9eb7afcff8c0 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -26,9 +26,7 @@ class _LineSearchError(RuntimeError): pass -def _line_search_wolfe12( - f, fprime, xk, pk, gfk, old_fval, old_old_fval, verbose=0, **kwargs -): +def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs): """ Same as line_search_wolfe1, but fall back to line_search_wolfe2 if suitable step length is not found, and raise an exception if a @@ -59,9 +57,6 @@ def _line_search_wolfe12( sum_abs_grad = scipy.linalg.norm(grad, ord=1) check = sum_abs_grad < sum_abs_grad_old if check: - if verbose >= 2: - print(" newton_cg line search detected tiny loss improvement.") - print(f" {loss_improvement=} {sum_abs_grad=}") ret = ( 1.0, # step size ret[1] + 1, # number of function evaluations @@ -83,7 +78,7 @@ def _line_search_wolfe12( return ret -def _cg(fhess_p, fgrad, maxiter, tol, verbose=0): +def _cg(fhess_p, fgrad, maxiter, tol): """ Solve iteratively the linear system 'fhess_p . 
xsupi = fgrad' with a conjugate gradient descent. @@ -118,10 +113,6 @@ def _cg(fhess_p, fgrad, maxiter, tol, verbose=0): while i <= maxiter: if np.sum(np.abs(ri)) <= tol: - if verbose >= 2: - print( - f" inner solver iteration {i} stopped with {np.sum(np.abs(ri))=}" - ) break Ap = fhess_p(psupi) @@ -129,8 +120,6 @@ def _cg(fhess_p, fgrad, maxiter, tol, verbose=0): curv = np.dot(psupi, Ap) if 0 <= curv <= 16 * np.finfo(np.float64).eps * psupi_norm2: # See https://arxiv.org/abs/1803.02924, Algo 1 Capped Conjugate Gradient. - if verbose >= 2: - print(f" inner solver iteration {i} stopped with {curv=}") break elif curv < 0: if i > 0: @@ -149,11 +138,7 @@ def _cg(fhess_p, fgrad, maxiter, tol, verbose=0): psupi_norm2 = dri1 + betai**2 * psupi_norm2 i = i + 1 dri0 = dri1 # update np.dot(ri,ri) for next time. - if i > maxiter and verbose >= 2: - print( - f" newton_cg iterative solver stopped with maxiter={i - 1} and " - f"{np.sum(np.abs(ri))=}" - ) + return xsupi @@ -168,7 +153,6 @@ def _newton_cg( maxinner=200, line_search=True, warn=True, - verbose=0, ): """ Minimization of scalar function of one or more variables using the @@ -222,8 +206,6 @@ def _newton_cg( if line_search: old_fval = func(x0, *args) old_old_fval = None - else: - old_fval = 0 # Outer loop: our Newton iteration while k < maxiter: @@ -232,10 +214,7 @@ def _newton_cg( fgrad, fhess_p = grad_hess(xk, *args) absgrad = np.abs(fgrad) - max_absgrad = np.max(absgrad) - if verbose > 0: - print(f"newton_cg iter = {k} loss = {old_fval} max|grad| = {max_absgrad}") - if max_absgrad <= tol: + if np.max(absgrad) <= tol: break maggrad = np.sum(absgrad) @@ -244,22 +223,14 @@ def _newton_cg( # Inner loop: solve the Newton update by conjugate gradient, to # avoid inverting the Hessian - xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond, verbose=verbose) + xsupi = _cg(fhess_p, fgrad, maxiter=maxinner, tol=termcond) alphak = 1.0 if line_search: try: alphak, fc, gc, old_fval, old_old_fval, gfkp1 = _line_search_wolfe12( - func, - grad, - xk, - xsupi, - fgrad, - old_fval, - old_old_fval, - verbose=verbose, - args=args, + func, grad, xk, xsupi, fgrad, old_fval, old_old_fval, args=args ) except _LineSearchError: warnings.warn("Line Search failed") From 511b142d5eb5aed599542822c2cbe44ce5340375 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 6 Oct 2023 08:28:50 +0200 Subject: [PATCH 19/20] MNT add TODO note for old line search branch --- sklearn/utils/optimize.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/utils/optimize.py b/sklearn/utils/optimize.py index a9eb7afcff8c0..024b0bcaf95ee 100644 --- a/sklearn/utils/optimize.py +++ b/sklearn/utils/optimize.py @@ -68,6 +68,9 @@ def _line_search_wolfe12(f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwarg if ret[0] is None: # line search failed: try different one. + # TODO: It seems that the new check for the sum of absolute gradients above + # catches all cases that, earlier, ended up here. In fact, our tests never + # trigger this "if branch" here and we can consider to remove it. 
ret = line_search_wolfe2( f, fprime, xk, pk, gfk, old_fval, old_old_fval, **kwargs ) From d5dea8621d58a9226adc498fa7a1b772a323446f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 6 Oct 2023 19:09:30 +0200 Subject: [PATCH 20/20] DOC add 2nd whatsnew entry --- doc/whats_new/v1.4.rst | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 900e2eff7da64..a56017fa4a624 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -23,8 +23,7 @@ random sampling procedures. :class:`linear_model.LogisticRegressionCV` now have much better convergence for solvers `"lbfgs"` and `"newton-cg"`. Both solvers can now reach much higher precision for the coefficients depending on the specified `tol`. Additionally, lbfgs can - make better use of `tol`, i.e., stop sooner or reach higher precision, and newton-cg - is now faster than before. + make better use of `tol`, i.e., stop sooner or reach higher precision. :pr:`26721` by :user:`Christian Lorentzen `. .. note:: @@ -271,8 +270,9 @@ Changelog :class:`linear_model.LogisticRegressionCV` now have much better convergence for solvers `"lbfgs"` and `"newton-cg"`. Both solvers can now reach much higher precision for the coefficients depending on the specified `tol`. Additionally, lbfgs can - make better use of `tol`, i.e., stop sooner or reach higher precision, and newton-cg - is now faster than before. + make better use of `tol`, i.e., stop sooner or reach higher precision. This is + accomplished by better scaling of the objective function, i.e., using average per + sample losses instead of sum of per sample losses. :pr:`26721` by :user:`Christian Lorentzen `. .. note:: @@ -282,6 +282,13 @@ Changelog solvers (when fit on the same data again). The amount of change depends on the specified `tol`, for small values you will get more precise results. +- |Efficiency| :class:`linear_model.LogisticRegression` and + :class:`linear_model.LogisticRegressionCV` with solver `"newton-cg"` can now be + considerably faster for some data and parameter settings. This is accomplished by a + better line search convergence check for negligible loss improvements that takes into + account gradient information. + :pr:`26721` by :user:`Christian Lorentzen `. + - |Efficiency| Solver `"newton-cg"` in :class:`linear_model.LogisticRegression` and :class:`linear_model.LogisticRegressionCV` uses a little less memory. The effect is proportional to the number of coefficients (`n_features * n_classes`).
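The improved line search check referred to in the changelog above (added to ``_line_search_wolfe12`` in PATCH 02) can be summarised by the following sketch. The function name is made up, and ``f``/``fprime`` stand for the loss and gradient callables::

    import numpy as np
    from scipy import linalg

    def accept_step_on_tiny_loss_improvement(f, fprime, xk, pk, gfk, old_fval):
        """Fallback used when the Wolfe line search fails.

        If the full step `pk` changes the loss only at the level of rounding
        noise, accept it anyway provided it does not increase the l1 norm of
        the gradient; otherwise signal failure (step size None).
        """
        fval = f(xk + pk)
        eps = 16 * np.finfo(np.asarray(old_fval).dtype).eps
        if np.abs(fval - old_fval) <= np.abs(old_fval) * eps:
            grad = fprime(xk + pk)
            if linalg.norm(grad, ord=1) < linalg.norm(gfk, ord=1):
                return 1.0, fval, grad   # accept the unit Newton step
        return None, old_fval, gfk       # caller tries line_search_wolfe2 next

    # toy usage: a quadratic plus a constant offset, evaluated so close to its
    # minimum that the loss can no longer improve in float64
    f = lambda x: 1.0 + 0.5 * float(x @ x)
    fprime = lambda x: x
    xk = np.array([1e-9])
    pk = -xk
    print(accept_step_on_tiny_loss_improvement(f, fprime, xk, pk, fprime(xk), f(xk)))

Checking the l1 norm of the gradient is what lets the solver make progress when relative loss differences are already at machine precision, which is exactly the regime where the Wolfe conditions cannot be satisfied.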