From f8362d72f4faf096b9504052d2699ac880c5dce2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 2 Jan 2021 11:57:32 +0100 Subject: [PATCH 001/143] ENH add common link function submodule --- sklearn/_loss/link.py | 246 +++++++++++++++++++++++++++++++ sklearn/_loss/tests/test_link.py | 107 ++++++++++++++ 2 files changed, 353 insertions(+) create mode 100644 sklearn/_loss/link.py create mode 100644 sklearn/_loss/tests/test_link.py diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py new file mode 100644 index 0000000000000..f5567e6dd7b49 --- /dev/null +++ b/sklearn/_loss/link.py @@ -0,0 +1,246 @@ +""" +Module contains classes for invertible (and differentiable) link functions. +""" +# Author: Christian Lorentzen + +from abc import ABC, abstractmethod +from collections import namedtuple + +import numpy as np +from scipy.special import expit, logit +from scipy.stats import gmean +from ..utils.extmath import softmax + + +Interval = namedtuple( + "Interval", ("low", "high", "low_inclusive", "high_inclusive") +) + + +def is_in_interval_range(x, interval): + """Test whether values of x are in interval range from Interval. + + Parameters + ---------- + x : ndarray + Array whose elements are tested to be in interval range. + interval: Interval + An Interval range. + """ + if interval.low_inclusive: + low = np.greater_equal(x, interval.low) + else: + low = np.greater(x, interval.low) + + if not np.all(low): + return False + + if interval.high_inclusive: + high = np.less_equal(x, interval.high) + else: + high = np.less(x, interval.high) + + # Note: np.all returns numpy.bool_ + if np.all(high): + return True + else: + return False + + +def _inclusive_low_high(interval, dtype=float): + """Generate values low and high to be within the interval range.""" + eps = 10 * np.finfo(dtype).eps + if interval.low == -np.inf: + low = -1e10 + elif interval.low < 0: + low = interval.low * (1 - eps) + eps + else: + low = interval.low * (1 + eps) + eps + + if interval.high == np.inf: + high = 1e10 + elif interval.high < 0: + high = interval.high * (1 + eps) - eps + else: + high = interval.high * (1 - eps) - eps + + return low, high + + +class BaseLink(ABC): + """Abstract base class for differentiable, invertible link functions. + + Convention: + - link function g: raw_prediction = g(y_pred) + - inverse link h: y_pred = h(raw_prediction) + + For (generalized) linear models, `raw_prediction = X @ coef` is the so + called linear predictor, and `y_pred = h(raw_prediction)` is the predicted + conditional (on X) expected value of the target `y_true`. + + In case a link function needs parameters, the methods are not implemented + as staticmethods. + """ + + multiclass = False + + # Usually, raw_prediction may be any real number and y_pred is an open + # interval. + interval_raw_prediction = Interval(-np.inf, np.inf, False, False) + interval_y_pred = Interval(-np.inf, np.inf, False, False) + + @abstractmethod + def link(self, y_pred, out=None): + """Compute the link function g(y_pred). + + The link function maps (predicted) target values to raw predictions, + i.e. `g(y_pred) = raw_prediction`. + + Parameters + ---------- + y_pred : array + Predicted target values. + out : array + A location into which the result is stored. If provided, it must + have a shape that the inputs broadcast to. If not provided or None, + a freshly-allocated array is returned. + + Returns + ------- + out : array + Output array, element-wise link function. 
+ """ + + @abstractmethod + def inverse(self, raw_prediction, out=None): + """Compute the inverse link function h(raw_prediction). + + The inverse link function maps raw predictions to predicted target + values, i.e. `h(raw_prediction) = y_pred`. + + Parameters + ---------- + raw_prediction : array + Raw prediction values (in link space). + out : array + A location into which the result is stored. If provided, it must + have a shape that the inputs broadcast to. If not provided or None, + a freshly-allocated array is returned. + + Returns + ------- + out : array + Output array, element-wise inverse link function. + """ + + +class IdentityLink(BaseLink): + """The identity link function g(x)=x.""" + + def link(self, y_pred, out=None): + if out is not None: + np.copyto(out, y_pred) + return out + else: + return y_pred + + inverse = link + + +class LogLink(BaseLink): + """The log link function g(x)=log(x).""" + + interval_y_pred = Interval(0, np.inf, False, False) + + def link(self, y_pred, out=None): + return np.log(y_pred, out=out) + + def inverse(self, raw_prediction, out=None): + return np.exp(raw_prediction, out=out) + + +class LogitLink(BaseLink): + """The logit link function g(x)=logit(x).""" + + interval_y_pred = Interval(0, 1, False, False) + + def link(self, y_pred, out=None): + return logit(y_pred, out=out) + + def inverse(self, raw_prediction, out=None): + return expit(raw_prediction, out=out) + + +class MultinomialLogit(BaseLink): + """The symmetric multinomial logit function. + + Convention: + - y_pred.shape = raw_prediction.shape = (n_samples, n_classes) + + Notes: + - The inverse link h is the softmax function. + - The sum is over the second axis, i.e. axis=1 (n_classes). + + We have to choose additional contraints in order to make + + y_pred_k = exp(raw_pred_k) / sum(exp(raw_pred_k), k=0..n_classes-1) + + for n_classes classes identifiable and invertible. + We choose the symmetric side contraint where the geometric mean response + is set as reference category, see [2]: + + The symmetric multinomial logit link function for a single data point is + then defined as + + raw_prediction[k] = g(y_pred[k]) = log(y_pred[k]/gmean(y_pred)) + = log(y_pred[k]) - mean(log(y_pred)). + + Note that this is equivalent to the definition in [1] and implies mean + centered raw predictions: + + sum(raw_prediction[k], k=0..n_classes-1) = 0. + + For linear models with raw_prediction = X @ coef, this corresponds to + sum(coef[k], k=0..n_classes-1) = 0, i.e. the sum over classes for every + feature is zero. + + Reference + --------- + .. [1] Friedman, Jerome; Hastie, Trevor; Tibshirani, Robert. "Additive + logistic regression: a statistical view of boosting" Ann. Statist. + 28 (2000), no. 2, 337--407. doi:10.1214/aos/1016218223. + https://projecteuclid.org/euclid.aos/1016218223 + + .. [2] Zahid, Faisal Maqbool and Gerhard Tutz. "Ridge estimation for + multinomial logit models with symmetric side constraints." + Computational Statistics 28 (2013): 1017-1034. 
+ http://epub.ub.uni-muenchen.de/11001/1/tr067.pdf + """ + + multiclass = True + interval_y_pred = Interval(0, 1, False, False) + + def symmetrize_raw_prediction(self, raw_prediction): + return raw_prediction - np.mean(raw_prediction, axis=1)[:, np.newaxis] + + def link(self, y_pred, out=None): + # geometric mean as reference category + gm = gmean(y_pred, axis=1) + out = np.log(y_pred / gm[:, np.newaxis], out=out) + return out + + def inverse(self, raw_prediction, out=None): + if out is None: + return softmax(raw_prediction, copy=True) + else: + np.copyto(out, raw_prediction) + softmax(out, copy=False) + return out + + +_LINKS = { + "identity": IdentityLink, + "log": LogLink, + "logit": LogitLink, + "multinomial_logit": MultinomialLogit, +} diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py new file mode 100644 index 0000000000000..a8dbbff511373 --- /dev/null +++ b/sklearn/_loss/tests/test_link.py @@ -0,0 +1,107 @@ +import numpy as np +from numpy.testing import assert_allclose, assert_array_equal +import pytest + +from sklearn._loss.link import ( + _LINKS, + _inclusive_low_high, + MultinomialLogit, + Interval, + is_in_interval_range, +) + + +LINK_FUNCTIONS = list(_LINKS.values()) + + +@pytest.mark.parametrize( + "interval", + [ + Interval(0, 1, False, False), + Interval(0, 1, False, True), + Interval(0, 1, True, False), + Interval(0, 1, True, True), + Interval(-np.inf, np.inf, False, False), + Interval(-np.inf, np.inf, False, True), + Interval(-np.inf, np.inf, True, False), + Interval(-np.inf, np.inf, True, True), + ], +) +def test_is_in_range(interval): + # make sure low and high are always within the interval, used for linspace + low, high = _inclusive_low_high(interval) + + x = np.linspace(low, high, num=10) + assert is_in_interval_range(x, interval) + + # x contains lower bound + assert ( + is_in_interval_range(np.r_[x, interval.low], interval) + == interval.low_inclusive + ) + + # x contains upper bound + assert ( + is_in_interval_range(np.r_[x, interval.high], interval) + == interval.high_inclusive + ) + + # x contains upper and lower bound + assert is_in_interval_range( + np.r_[x, interval.low, interval.high], interval + ) == (interval.low_inclusive and interval.high_inclusive) + + +@pytest.mark.parametrize("link", LINK_FUNCTIONS) +def test_link_inverse_identity(link): + # Test that link of inverse gives idendity. + rng = np.random.RandomState(42) + link = link() + n_samples, n_classes = 100, None + if link.multiclass: + n_classes = 10 + raw_prediction = rng.normal( + loc=0, scale=10, size=(n_samples, n_classes) + ) + if isinstance(link, MultinomialLogit): + raw_prediction = link.symmetrize_raw_prediction(raw_prediction) + else: + # So far, the valid interval of raw_prediction is (-inf, inf) and + # we do not need to distinguish + raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples)) + + assert_allclose(link.link(link.inverse(raw_prediction)), raw_prediction) + y_pred = link.inverse(raw_prediction) + assert_allclose(link.inverse(link.link(y_pred)), y_pred) + + +@pytest.mark.parametrize("link", LINK_FUNCTIONS) +def test_link_out_argument(link): + # Test that out argument gets assigned the result. 
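+    # Note (descriptive, not part of the original test): for IdentityLink the
+    # result is copied into `out` via np.copyto; for the other links the
+    # underlying ufunc (or the in-place softmax for MultinomialLogit.inverse)
+    # writes into `out` directly.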
+ rng = np.random.RandomState(42) + link = link() + n_samples, n_classes = 100, None + if link.multiclass: + n_classes = 10 + raw_prediction = rng.normal( + loc=0, scale=10, size=(n_samples, n_classes) + ) + if isinstance(link, MultinomialLogit): + raw_prediction = link.symmetrize_raw_prediction(raw_prediction) + else: + # So far, the valid interval of raw_prediction is (-inf, inf) and + # we do not need to distinguish + raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples)) + + y_pred = link.inverse(raw_prediction, out=None) + out = np.empty_like(raw_prediction) + y_pred_2 = link.inverse(raw_prediction, out=out) + assert_allclose(y_pred, out) + assert_array_equal(out, y_pred_2) + assert np.shares_memory(out, y_pred_2) + + out = np.empty_like(y_pred) + raw_prediction_2 = link.link(y_pred, out=out) + assert_allclose(raw_prediction, out) + assert_array_equal(out, raw_prediction_2) + assert np.shares_memory(out, raw_prediction_2) From afdb67e251c43faef6494ae8eb7e9e55ed0b4d01 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 2 Jan 2021 12:58:58 +0100 Subject: [PATCH 002/143] ENH add common loss function submodule --- sklearn/_loss/_loss.pxd | 75 ++ sklearn/_loss/_loss.pyx | 1780 ++++++++++++++++++++++++++++++ sklearn/_loss/loss.py | 910 +++++++++++++++ sklearn/_loss/setup.py | 20 + sklearn/_loss/tests/test_loss.py | 814 ++++++++++++++ 5 files changed, 3599 insertions(+) create mode 100644 sklearn/_loss/_loss.pxd create mode 100644 sklearn/_loss/_loss.pyx create mode 100644 sklearn/_loss/loss.py create mode 100644 sklearn/_loss/setup.py create mode 100644 sklearn/_loss/tests/test_loss.py diff --git a/sklearn/_loss/_loss.pxd b/sklearn/_loss/_loss.pxd new file mode 100644 index 0000000000000..1528ab28741fd --- /dev/null +++ b/sklearn/_loss/_loss.pxd @@ -0,0 +1,75 @@ +# cython: language_level=3 + +import numpy as np +cimport numpy as np + +np.import_array() + + +# Fused types for y_true, y_pred, raw_prediction +ctypedef fused Y_DTYPE_C: + np.npy_float64 + np.npy_float32 + + +# Fused types for gradient and hessian +ctypedef fused G_DTYPE_C: + np.npy_float64 + np.npy_float32 + + +# Struct to return 2 doubles +ctypedef struct double2: + double val1 + double val2 + + +# C base class for loss functions +cdef class cLossFunction: + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + + +cdef class cHalfSquaredError(cLossFunction): + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + + +cdef class cAbsoluteError(cLossFunction): + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + + +cdef class cPinballLoss(cLossFunction): + cdef readonly double quantile # readonly makes it inherited by children + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + + +cdef class cHalfPoissonLoss(cLossFunction): + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double 
raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + + +cdef class cHalfGammaLoss(cLossFunction): + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + + +cdef class cHalfTweedieLoss(cLossFunction): + cdef readonly double power # readonly makes it inherited by children + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + + +cdef class cBinaryCrossEntropy(cLossFunction): + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil diff --git a/sklearn/_loss/_loss.pyx b/sklearn/_loss/_loss.pyx new file mode 100644 index 0000000000000..f94c4118119f9 --- /dev/null +++ b/sklearn/_loss/_loss.pyx @@ -0,0 +1,1780 @@ +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 + +# Design: +# See https://github.com/scikit-learn/scikit-learn/issues/15123 for reasons. +# a) Merge link functions into loss functions for speed and numerical +# stability, i.e. use raw_prediction instead of y_pred in signature. +# b) Pure C functions (nogil) calculate single points (single sample) +# c) Wrap C functions in a loop to get Python functions operating on ndarrays. +# - Write loops manually. +# Reason: There is still some performance overhead when using a wrapper +# function "wrap" that carries out the loop and gets as argument a function +# pointer to one of the C functions from b), e.g. +# wrap(closs_half_poisson, y_true, ...) +# - Pass n_threads as argument to prange and propagate option to all callers. +# d) Provide classes (Cython extension types) per loss in order to have +# semantical structured objects. +# - Member function for single points just call the C function from b). +# These are used e.g. in SGD `_plain_sgd`. +# - Member functions operating on ndarrays looping, see c), over calls to C +# functions from b). +# e) Provide convenience Python classes that inherit from these extension types +# elsewhere (see loss.py) +# - Example: loss.gradient calls extension_type._gradient but does some +# input checking like None -> np.empty(). +# +# Note: We require 1-dim ndarrays to be contiguous. 
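+#
+# Illustrative sketch (not code of this module) of the wrapper pattern from
+# point e): the Python-level class in loss.py only handles optional output
+# arrays and then delegates the loop to the extension type, roughly
+#
+#     def gradient(self, y_true, raw_prediction, sample_weight=None,
+#                  gradient=None, n_threads=1):
+#         if gradient is None:
+#             gradient = np.empty_like(raw_prediction)
+#         return self._gradient(y_true=y_true, raw_prediction=raw_prediction,
+#                               sample_weight=sample_weight,
+#                               gradient=gradient, n_threads=n_threads)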
+# TODO: Use const memoryviews with Cython 3.0 where appropriate (# IN) + +cimport cython +from cython.parallel import parallel, prange +import numpy as np +cimport numpy as np + +from libc.math cimport exp, fabs, log, log1p +from libc.stdlib cimport malloc, free + +np.import_array() + + +# ------------------------------------- +# Helper functions +# ------------------------------------- +# Numerically stable version of log(1 + exp(x)) for double precision +# See https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf +cdef inline double log1pexp(double x) nogil: + if x <= -37: + return exp(x) + elif x <= 18: + return log1p(exp(x)) + elif x <= 33.3: + return x + exp(-x) + else: + return x + + +cdef inline void sum_exp_minus_max( + const int i, Y_DTYPE_C[:, :] raw_prediction, # IN + Y_DTYPE_C *p # OUT +) nogil: + # Store p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1 + # p[-2] = max(raw_prediction_i_k, k = 0 to n_classes-1) + # p[-1] = sum(p[k], k = 0 to n_classes-1) = sum of exponentials + # len(p) must be n_classes + 2 + # Notes: + # - Using "by reference" arguments doesn't work well, therefore we use a + # longer p, see https://github.com/cython/cython/issues/1863 + # - i needs to be passed (and stays constant) because otherwise Cython does + # not generate optimal code, see + # https://github.com/scikit-learn/scikit-learn/issues/17299 + # - We do not calculate p[k] = p[k] / sum_exps to save one loop over k. + cdef: + int k + int n_classes = raw_prediction.shape[1] + double max_value = raw_prediction[i, 0] + double sum_exps = 0 + for k in range(1, n_classes): + # Compute max value of array for numerical stability + if max_value < raw_prediction[i, k]: + max_value = raw_prediction[i, k] + + for k in range(n_classes): + p[k] = exp(raw_prediction[i, k] - max_value) + sum_exps += p[k] + + p[n_classes] = max_value # same as p[-2] + p[n_classes + 1] = sum_exps # same as p[-1] + + +# ------------------------------------- +# Single point inline C functions +# ------------------------------------- +# Half Squared Error +cdef inline double closs_half_squared_error(double y_true, double raw_prediction) nogil: + return 0.5 * (raw_prediction - y_true) * (raw_prediction - y_true) + + +cdef inline double cgradient_half_squared_error( + double y_true, double raw_prediction +) nogil: + return raw_prediction - y_true + + +cdef inline double2 cgrad_hess_half_squared_error( + double y_true, double raw_prediction +) nogil: + cdef double2 gh + gh.val1 = raw_prediction - y_true # gradient + gh.val2 = 1. # hessian + return gh + + +# Absolute Error +cdef inline double closs_absolute_error(double y_true, double raw_prediction) nogil: + return fabs(raw_prediction - y_true) + + +cdef inline double cgradient_absolute_error(double y_true, double raw_prediction) nogil: + return 1. if raw_prediction > y_true else -1. + + +cdef inline double2 cgrad_hess_absolute_error( + double y_true, double raw_prediction +) nogil: + cdef double2 gh + # Note that exact hessian = 0 almost everywhere. Optimization routines like + # in HGBT, however, need a hessian > 0. Therefore, we assign 1. + gh.val1 = 1. if raw_prediction > y_true else -1. # gradient + gh.val2 = 1. # hessian + return gh + + +# Quantile Loss / Pinball Loss +cdef inline double closs_pinball_loss( + double y_true, double raw_prediction, double quantile +) nogil: + return (quantile * (y_true - raw_prediction) if y_true >= raw_prediction + else (1. 
- quantile) * (raw_prediction - y_true)) + + +cdef inline double cgradient_pinball_loss( + double y_true, double raw_prediction, double quantile +) nogil: + return -quantile if y_true >=raw_prediction else 1. - quantile + + +cdef inline double2 cgrad_hess_pinball_loss( + double y_true, double raw_prediction, double quantile +) nogil: + cdef double2 gh + # Note that exact hessian = 0 almost everywhere. Optimization routines like + # in HGBT, however, need a hessian > 0. Therefore, we assign 1. + gh.val1 = -quantile if y_true >=raw_prediction else 1. - quantile # gradient + gh.val2 = 1. # hessian + return gh + + +# Half Poisson Deviance with Log-Link, dropping constant terms +cdef inline double closs_half_poisson(double y_true, double raw_prediction) nogil: + return exp(raw_prediction) - y_true * raw_prediction + + +cdef inline double cgradient_half_poisson(double y_true, double raw_prediction) nogil: + # y_pred - y_true + return exp(raw_prediction) - y_true + + +cdef inline double2 closs_grad_half_poisson(double y_true, double raw_prediction) nogil: + cdef double2 lg + lg.val2 = exp(raw_prediction) + lg.val1 = lg.val2 - y_true * raw_prediction # loss + lg.val2 -= y_true # gradient + return lg + + +cdef inline double2 cgrad_hess_half_poisson(double y_true, double raw_prediction) nogil: + cdef double2 gh + gh.val2 = exp(raw_prediction) # hessian + gh.val1 = gh.val2 - y_true # gradient + return gh + + +# Half Gamma Deviance with Log-Link, dropping constant terms +cdef inline double closs_half_gamma(double y_true, double raw_prediction) nogil: + return raw_prediction + y_true * exp(-raw_prediction) + + +cdef inline double cgradient_half_gamma(double y_true, double raw_prediction) nogil: + return 1. - y_true * exp(-raw_prediction) + + +cdef inline double2 closs_grad_half_gamma(double y_true, double raw_prediction) nogil: + cdef double2 lg + lg.val2 = exp(-raw_prediction) + lg.val1 = raw_prediction + y_true * lg.val2 # loss + lg.val2 = 1. - y_true * lg.val2 # gradient + return lg + + +cdef inline double2 cgrad_hess_half_gamma(double y_true, double raw_prediction) nogil: + cdef double2 gh + gh.val2 = exp(-raw_prediction) + gh.val1 = 1. - y_true * gh.val2 # gradient + gh.val2 *= y_true # hessian + return gh + + +# Half Tweedie Deviance with Log-Link, dropping constant terms +# Note that by dropping constants this is no longer smooth in parameter power. +cdef inline double closs_half_tweedie( + double y_true, double raw_prediction, double power +) nogil: + if power == 0.: + return closs_half_squared_error(y_true, exp(raw_prediction)) + elif power == 1.: + return closs_half_poisson(y_true, raw_prediction) + elif power == 2.: + return closs_half_gamma(y_true, raw_prediction) + else: + return (exp((2. - power) * raw_prediction) / (2. - power) + - y_true * exp((1. - power) * raw_prediction) / (1. - power)) + + +cdef inline double cgradient_half_tweedie( + double y_true, double raw_prediction, double power +) nogil: + cdef double exp1 + if power == 0.: + exp1 = exp(raw_prediction) + return exp1 * (exp1 - y_true) + elif power == 1.: + return cgradient_half_poisson(y_true, raw_prediction) + elif power == 2.: + return cgradient_half_gamma(y_true, raw_prediction) + else: + return (exp((2. - power) * raw_prediction) + - y_true * exp((1. 
- power) * raw_prediction)) + + +cdef inline double2 closs_grad_half_tweedie( + double y_true, double raw_prediction, double power +) nogil: + cdef double2 lg + cdef double exp1, exp2 + if power == 0.: + exp1 = exp(raw_prediction) + lg.val1 = closs_half_squared_error(y_true, exp1) # loss + lg.val2 = exp1 * (exp1 - y_true) # gradient + elif power == 1.: + return closs_grad_half_poisson(y_true, raw_prediction) + elif power == 2.: + return closs_grad_half_gamma(y_true, raw_prediction) + else: + exp1 = exp((1. - power) * raw_prediction) + exp2 = exp((2. - power) * raw_prediction) + lg.val1 = exp2 / (2. - power) - y_true * exp1 / (1. - power) # loss + lg.val2 = exp2 - y_true * exp1 # gradient + return lg + + +cdef inline double2 cgrad_hess_half_tweedie( + double y_true, double raw_prediction, double power +) nogil: + cdef double2 gh + cdef double exp1, exp2 + if power == 0.: + exp1 = exp(raw_prediction) + gh.val1 = exp1 * (exp1 - y_true) # gradient + gh.val2 = exp1 * (2 * exp1 - y_true) # hessian + elif power == 1.: + return cgrad_hess_half_poisson(y_true, raw_prediction) + elif power == 2.: + return cgrad_hess_half_gamma(y_true, raw_prediction) + else: + exp1 = exp((1. - power) * raw_prediction) + exp2 = exp((2. - power) * raw_prediction) + gh.val1 = exp2 - y_true * exp1 # gradient + gh.val2 = (2. - power) * exp2 - (1. - power) * y_true * exp1 # hessian + return gh + + +# Binary cross entropy aka log-loss +cdef inline double closs_binary_crossentropy( + double y_true, double raw_prediction +) nogil: + # log1p(exp(raw_prediction)) - y_true * raw_prediction + return log1pexp(raw_prediction) - y_true * raw_prediction + + +cdef inline double cgradient_binary_crossentropy( + double y_true, double raw_prediction +) nogil: + # y_pred - y_true = expit(raw_prediction) - y_true + # Numerically more stable, see + # http://fa.bianp.net/blog/2019/evaluate_logistic/ + # if raw_prediction < 0: + # exp_tmp = exp(raw_prediction) + # return ((1 - y_true) * exp_tmp - y_true) / (1 + exp_tmp) + # else: + # exp_tmp = exp(-raw_prediction) + # return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) + # Note that optimal speed would be achieved, at the cost of precision, by + # return expit(raw_prediction) - y_true + # i.e. no if else, and an own inline implemention of expit instead of + # from scipy.special.cython_special cimport expit + # The case distinction raw_prediction < 0 in the stable implementation + # does not provide significant better precision. Therefore we go without + # it. 
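+    # The returned expression is the same quantity rewritten in terms of
+    # exp(-raw_prediction):
+    #     expit(raw) - y_true = 1 / (1 + exp(-raw)) - y_true
+    #                         = ((1 - y_true) - y_true * exp(-raw)) / (1 + exp(-raw))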
+ cdef double exp_tmp + exp_tmp = exp(-raw_prediction) + return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) + + +cdef inline double2 closs_grad_binary_crossentropy( + double y_true, double raw_prediction +) nogil: + cdef double2 lg + if raw_prediction <= 0: + lg.val2 = exp(raw_prediction) + if raw_prediction <= -37: + lg.val1 = lg.val2 - y_true * raw_prediction # loss + else: + lg.val1 = log1p(lg.val2) - y_true * raw_prediction # loss + lg.val2 = ((1 - y_true) * lg.val2 - y_true) / (1 + lg.val2) # gradient + else: + lg.val2 = exp(-raw_prediction) + if raw_prediction <= 18: + # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x)) + lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction # loss + else: + lg.val1 = lg.val2 + (1 - y_true) * raw_prediction # loss + lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2) # gradient + return lg + + +cdef inline double2 cgrad_hess_binary_crossentropy( + double y_true, double raw_prediction +) nogil: + # with y_pred = expit(raw) + # hessian = y_pred * (1 - y_pred) = exp(raw) / (1 + exp(raw))**2 + # = exp(-raw) / (1 + exp(-raw))**2 + cdef double2 gh + gh.val2 = exp(-raw_prediction) + gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2) # gradient + gh.val2 = gh.val2 / (1 + gh.val2)**2 # hessian + return gh + + +# --------------------------------------------------- +# Extension Types for Loss Functions of 1-dim targets +# --------------------------------------------------- +cdef class cLossFunction: + """Base class for convex loss functions.""" + + cdef double closs(self, double y_true, double raw_prediction) nogil: + """Compute the loss for a single sample. + + Parameters + ---------- + y_true : double + Observed, true target value. + raw_prediction : double + Raw prediction value (in link space). + + Returns + ------- + double + The loss evaluated at `y_true` and `raw_prediction`. + """ + pass + + cdef double cgradient(self, double y_true, double raw_prediction) nogil: + """Compute gradient of loss w.r.t. raw_prediction for a single sample. + + Parameters + ---------- + y_true : double + Observed, true target value. + raw_prediction : double + Raw prediction value (in link space). + + Returns + ------- + double + The derivative of the loss function w.r.t. `raw_prediction`. + """ + pass + + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + """Compute gradient and hessian. + + Gradient and hessian of loss w.r.t. raw_prediction for a single sample. + + This is usually diagonal in raw_prediction_i and raw_prediction_j. + Therefore, we return the diagonal element i=j. + + For a loss with a non-canonical link, this might implement the diagonal + of the Fisher matrix (=expected hessian) instead of the hessian. + + Parameters + ---------- + y_true : double + Observed, true target value. + raw_prediction : double + Raw prediction value (in link space). + + Returns + ------- + grad_hess_pair + Gradient and hessian of the loss function w.r.t. `raw_prediction`. + """ + pass + + # Note: With Cython 3.0, fused types can be used together with const: + # const Y_DTYPE_C double[::1] y_true + # See release notes 3.0.0 alpha1 + # https://cython.readthedocs.io/en/latest/src/changes.html#alpha-1-2020-04-12 + def _loss( + self, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] loss, # OUT + int n_threads=1 + ): + """Compute the pointwise loss value for each input. 
+ + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + loss : array of shape (n_samples,) + A location into which the result is stored. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + loss : array of shape (n_samples,) + Element-wise loss function. + """ + pass + + def _gradient( + self, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] gradient, # OUT + int n_threads=1 + ): + """Compute gradient of loss w.r.t raw_prediction for each input. + + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + gradient : array of shape (n_samples,) + A location into which the result is stored. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + gradient : array of shape (n_samples,) + Element-wise gradients. + """ + pass + + def _loss_gradient( + self, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] loss, # OUT + G_DTYPE_C[::1] gradient, # OUT + int n_threads=1 + ): + """Compute loss and gradient of loss w.r.t raw_prediction. + + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + loss : array of shape (n_samples,) or None + A location into which the element-wise loss is stored. + gradient : array of shape (n_samples,) + A location into which the gradient is stored. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + loss : array of shape (n_samples,) + Element-wise loss function. + + gradient : array of shape (n_samples,) + Element-wise gradients. + """ + self._loss(y_true, raw_prediction, sample_weight, loss, + n_threads) + self._gradient(y_true, raw_prediction, sample_weight, gradient, + n_threads) + return np.asarray(loss), np.asarray(gradient) + + def _gradient_hessian( + self, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] gradient, # OUT + G_DTYPE_C[::1] hessian, # OUT + int n_threads=1 + ): + """Compute gradient and hessian of loss w.r.t raw_prediction. + + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + gradient : array of shape (n_samples,) + A location into which the gradient is stored. + hessian : array of shape (n_samples,) + A location into which the hessian is stored. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + gradient : array of shape (n_samples,) + Element-wise gradients. + + hessian : array of shape (n_samples,) + Element-wise hessians. + """ + pass + + +cdef class cHalfSquaredError(cLossFunction): + """Half Squared Error with identity link. 
+ + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + """ + + cdef double closs(self, double y_true, double raw_prediction) nogil: + return closs_half_squared_error(y_true, raw_prediction) + + cdef double cgradient(self, double y_true, double raw_prediction) nogil: + return cgradient_half_squared_error(y_true, raw_prediction) + + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + return cgrad_hess_half_squared_error(y_true, raw_prediction) + + def _loss( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = closs_half_squared_error(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = ( + sample_weight[i] + * closs_half_squared_error(y_true[i], raw_prediction[i]) + ) + + return np.asarray(loss) + + def _gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = cgradient_half_squared_error(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = ( + sample_weight[i] + * cgradient_half_squared_error(y_true[i], raw_prediction[i]) + ) + + return np.asarray(gradient) + + + def _gradient_hessian( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + G_DTYPE_C[::1] hessian, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double2 dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_half_squared_error(y_true[i], raw_prediction[i]) + gradient[i] = dbl2.val1 + hessian[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_half_squared_error(y_true[i], raw_prediction[i]) + gradient[i] = sample_weight[i] * dbl2.val1 + hessian[i] = sample_weight[i] * dbl2.val2 + + return np.asarray(gradient), np.asarray(hessian) + + +cdef class cAbsoluteError(cLossFunction): + """Absolute Error with identity link. 
+ + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + """ + + cdef double closs(self, double y_true, double raw_prediction) nogil: + return closs_absolute_error(y_true, raw_prediction) + + cdef double cgradient(self, double y_true, double raw_prediction) nogil: + return cgradient_absolute_error(y_true, raw_prediction) + + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + return cgrad_hess_absolute_error(y_true, raw_prediction) + + def _loss( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = closs_absolute_error(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = (sample_weight[i] + * closs_absolute_error(y_true[i], raw_prediction[i])) + + return np.asarray(loss) + + def _gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = cgradient_absolute_error(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = ( + sample_weight[i] + * cgradient_absolute_error(y_true[i], raw_prediction[i]) + ) + + return np.asarray(gradient) + + def _gradient_hessian( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + G_DTYPE_C[::1] hessian, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double2 dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_absolute_error(y_true[i], raw_prediction[i]) + gradient[i] = dbl2.val1 + hessian[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_absolute_error(y_true[i], raw_prediction[i]) + gradient[i] = sample_weight[i] * dbl2.val1 + hessian[i] = sample_weight[i] * dbl2.val2 + + return np.asarray(gradient), np.asarray(hessian) + + +cdef class cPinballLoss(cLossFunction): + """Quantile Loss aka Pinball Loss with identity link. 
+ + Domain: + y_true and y_pred all real numbers + quantile in (0, 1) + + Link: + y_pred = raw_prediction + + Note: 2 * cPinballLoss(quantile=0.5) equals cAbsoluteError() + """ + + def __init__(self, quantile): + self.quantile = quantile + + cdef double closs(self, double y_true, double raw_prediction) nogil: + return closs_pinball_loss(y_true, raw_prediction, self.quantile) + + cdef double cgradient(self, double y_true, double raw_prediction) nogil: + return cgradient_pinball_loss(y_true, raw_prediction, self.quantile) + + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + return cgrad_hess_pinball_loss(y_true, raw_prediction, self.quantile) + + def _loss( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = closs_pinball_loss(y_true[i], raw_prediction[i], self.quantile) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = ( + sample_weight[i] + * closs_pinball_loss(y_true[i], raw_prediction[i], self.quantile) + ) + + return np.asarray(loss) + + def _gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = cgradient_pinball_loss( + y_true[i], raw_prediction[i], self.quantile + ) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = ( + sample_weight[i] + * cgradient_pinball_loss(y_true[i], raw_prediction[i], self.quantile) + ) + + return np.asarray(gradient) + + def _gradient_hessian( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + G_DTYPE_C[::1] hessian, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double2 dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_pinball_loss( + y_true[i], raw_prediction[i], self.quantile + ) + gradient[i] = dbl2.val1 + hessian[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_pinball_loss( + y_true[i], raw_prediction[i], self.quantile + ) + gradient[i] = sample_weight[i] * dbl2.val1 + hessian[i] = sample_weight[i] * dbl2.val2 + + return np.asarray(gradient), np.asarray(hessian) + + +cdef class cHalfPoissonLoss(cLossFunction): + """Half Poisson deviance loss with log-link. 
+ + Domain: + y_true in non-negative real numbers + y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Poisson deviance with log-link is + y_true * log(y_true/y_pred) + y_pred - y_true + = y_true * log(y_true) - y_true * raw_prediction + + exp(raw_prediction) - y_true + + Dropping constant terms, this gives: + exp(raw_prediction) - y_true * raw_prediction + """ + + cdef double closs(self, double y_true, double raw_prediction) nogil: + return closs_half_poisson(y_true, raw_prediction) + + cdef double cgradient(self, double y_true, double raw_prediction) nogil: + return cgradient_half_poisson(y_true, raw_prediction) + + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + return cgrad_hess_half_poisson(y_true, raw_prediction) + + def _loss( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = closs_half_poisson(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = ( + sample_weight[i] + * closs_half_poisson(y_true[i], raw_prediction[i]) + ) + + return np.asarray(loss) + + def _loss_gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + G_DTYPE_C[::1] gradient, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double2 dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = closs_grad_half_poisson(y_true[i], raw_prediction[i]) + loss[i] = dbl2.val1 + gradient[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = closs_grad_half_poisson(y_true[i], raw_prediction[i]) + loss[i] = sample_weight[i] * dbl2.val1 + gradient[i] = sample_weight[i] * dbl2.val2 + + return np.asarray(loss), np.asarray(gradient) + + def _gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = cgradient_half_poisson(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = ( + sample_weight[i] + * cgradient_half_poisson(y_true[i], raw_prediction[i]) + ) + + return np.asarray(gradient) + + def _gradient_hessian( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + G_DTYPE_C[::1] hessian, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double2 dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_half_poisson(y_true[i], raw_prediction[i]) + gradient[i] = dbl2.val1 + hessian[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_half_poisson(y_true[i], raw_prediction[i]) + gradient[i] = sample_weight[i] * dbl2.val1 + hessian[i] = sample_weight[i] 
* dbl2.val2 + + return np.asarray(gradient), np.asarray(hessian) + + +cdef class cHalfGammaLoss(cLossFunction): + """Half Gamma deviance loss with log-link. + + Domain: + y_true and y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Gamma deviance with log-link is + log(y_pred/y_true) + y_true/y_pred - 1 + = raw_prediction - log(y_true) + y_true * exp(-raw_prediction) - 1 + + Dropping constant terms, this gives: + raw_prediction + y_true * exp(-raw_prediction) + """ + + cdef double closs(self, double y_true, double raw_prediction) nogil: + return closs_half_gamma(y_true, raw_prediction) + + cdef double cgradient(self, double y_true, double raw_prediction) nogil: + return cgradient_half_gamma(y_true, raw_prediction) + + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + return cgrad_hess_half_gamma(y_true, raw_prediction) + + def _loss( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = closs_half_gamma(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = ( + sample_weight[i] + * closs_half_gamma(y_true[i], raw_prediction[i]) + ) + + return np.asarray(loss) + + def _loss_gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + G_DTYPE_C[::1] gradient, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double2 dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = closs_grad_half_gamma(y_true[i], raw_prediction[i]) + loss[i] = dbl2.val1 + gradient[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = closs_grad_half_gamma(y_true[i], raw_prediction[i]) + loss[i] = sample_weight[i] * dbl2.val1 + gradient[i] = sample_weight[i] * dbl2.val2 + + return np.asarray(loss), np.asarray(gradient) + + def _gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = cgradient_half_gamma(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = ( + sample_weight[i] + * cgradient_half_gamma(y_true[i], raw_prediction[i]) + ) + + return np.asarray(gradient) + + def _gradient_hessian( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + G_DTYPE_C[::1] hessian, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double2 dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_half_gamma(y_true[i], raw_prediction[i]) + gradient[i] = dbl2.val1 + hessian[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_half_gamma(y_true[i], 
raw_prediction[i]) + gradient[i] = sample_weight[i] * dbl2.val1 + hessian[i] = sample_weight[i] * dbl2.val2 + + return np.asarray(gradient), np.asarray(hessian) + + +cdef class cHalfTweedieLoss(cLossFunction): + """Half Tweedie deviance loss with log-link. + + Domain: + y_true in real numbers if p <= 0 + y_true in non-negative real numbers if 0 < p < 2 + y_true in positive real numbers if p >= 2 + y_pred and power in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Tweedie deviance with log-link and p=power is + max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * y_pred**(1-p) / (1-p) + + y_pred**(2-p) / (2-p) + = max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * exp((1-p) * raw_prediction) / (1-p) + + exp((2-p) * raw_prediction) / (2-p) + + Dropping constant terms, this gives: + exp((2-p) * raw_prediction) / (2-p) + - y_true * exp((1-p) * raw_prediction) / (1-p) + + Notes: + - Poisson with p=1 and and Gamma with p=2 have different terms dropped such + that cHalfTweedieLoss is not continuous in p=power at p=1 and p=2. + - While the Tweedie distribution only exists for p<=0 or p>=1, the range + 0 n_classes. In this case having the inner loop + # over n_classes is a good default. + # TODO: If every memoryview is contiguous and raw_preduction is + # f-contiguous, can we write a better algo (loops) to improve + # performance? + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + max_value = p[n_classes] # p[-2] + sum_exps = p[n_classes + 1] # p[-1] + loss[i] = log(sum_exps) + max_value + + for k in range(n_classes): + # label decode y_true + if y_true[i] == k: + loss[i] -= raw_prediction[i, k] + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + max_value = raw_prediction[i, 0] + max_value = p[n_classes] # p[-2] + sum_exps = p[n_classes + 1] # p[-1] + loss[i] = log(sum_exps) + max_value + + for k in range(n_classes): + # label decode y_true + if y_true[i] == k: + loss[i] -= raw_prediction[i, k] + + loss[i] *= sample_weight[i] + + free(p) + + return np.asarray(loss) + + def _loss_gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[:, :] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + G_DTYPE_C[:, :] gradient, + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + Y_DTYPE_C max_value, sum_exps + Y_DTYPE_C* p # temporary buffer + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. 
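+                # The buffer holds the n_classes shifted exponentials plus the
+                # per-sample max and their sum in the last two slots (see
+                # sum_exp_minus_max); each thread frees its own buffer after
+                # the prange loop.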
+ p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + max_value = p[n_classes] # p[-2] + sum_exps = p[n_classes + 1] # p[-1] + loss[i] = log(sum_exps) + max_value + + for k in range(n_classes): + # label decode y_true + if y_true [i] == k: + loss[i] -= raw_prediction[i, k] + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = p_k - (y_true == k) + gradient[i, k] = p[k] - (y_true[i] == k) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + max_value = p[n_classes] # p[-2] + sum_exps = p[n_classes + 1] # p[-1] + loss[i] = log(sum_exps) + max_value + + for k in range(n_classes): + # label decode y_true + if y_true [i] == k: + loss[i] -= raw_prediction[i, k] + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + gradient[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + + loss[i] *= sample_weight[i] + + free(p) + + return np.asarray(loss), np.asarray(gradient) + + def _gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[:, :] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[:, :] gradient, + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + Y_DTYPE_C sum_exps + Y_DTYPE_C* p # temporary buffer + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + sum_exps = p[n_classes + 1] # p[-1] + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = y_pred_k - (y_true == k) + gradient[i, k] = p[k] - (y_true[i] == k) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + sum_exps = p[n_classes + 1] # p[-1] + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + gradient[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + + free(p) + + return np.asarray(gradient) + + def _gradient_hessian( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[:, :] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[:, :] gradient, + G_DTYPE_C[:, :] hessian, + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + Y_DTYPE_C sum_exps + Y_DTYPE_C* p # temporary buffer + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + sum_exps = p[n_classes + 1] # p[-1] + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # hessian_k = p_k * (1 - p_k) + # gradient_k = p_k - (y_true == k) + gradient[i, k] = p[k] - (y_true[i] == k) + hessian[i, k] = p[k] * (1. 
- p[k]) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + sum_exps = p[n_classes + 1] # p[-1] + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + # hessian_k = p_k * (1 - p_k) * sw + gradient[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + hessian[i, k] = (p[k] * (1. - p[k])) * sample_weight[i] + + free(p) + + return np.asarray(gradient), np.asarray(hessian) + + + # This method simplifies the implementation of hessp in linear models, + # i.e. the matrix-vector product of the full hessian, not only of the + # diagonal (in the classes) approximation as implemented above. + def _gradient_proba( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[:, :] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[:, :] gradient, + G_DTYPE_C[:, :] proba, + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + Y_DTYPE_C sum_exps + Y_DTYPE_C* p # temporary buffer + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + sum_exps = p[n_classes + 1] # p[-1] + + for k in range(n_classes): + proba[i, k] = p[k] / sum_exps # y_pred_k = prob of class k + # gradient_k = y_pred_k - (y_true == k) + gradient[i, k] = proba[i, k] - (y_true[i] == k) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + sum_exps = p[n_classes + 1] # p[-1] + + for k in range(n_classes): + proba[i, k] = p[k] / sum_exps # y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + gradient[i, k] = (proba[i, k] - (y_true[i] == k)) * sample_weight[i] + + free(p) + + return np.asarray(gradient), np.asarray(proba) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py new file mode 100644 index 0000000000000..49d968b6bd2af --- /dev/null +++ b/sklearn/_loss/loss.py @@ -0,0 +1,910 @@ +""" +This module contains loss classes suitable for fitting. + +It is not part of the public API. +Specific losses are used for regression, binary classification or multiclass +classification. +""" +# Goals: +# - Provide a common private module for loss functions/classes. +# - Replace losses for: +# - LogisticRegression +# - PoissonRegressor, GammaRegressor, TweedieRegressor +# - HistGradientBoostingRegressor, HistGradientBoostingClassifier +# - GradientBoostingRegressor, GradientBoostingClassifier +# - SGDRegressor, SGDClassifier +# - Replace link module of GLMs. 
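+#
+# Illustrative sketch of the intended Python-level usage (the concrete loss
+# classes are defined later in this module; the subclass name below is
+# hypothetical):
+#
+#     loss = SomeConcreteLoss()                                 # a BaseLoss subclass
+#     mean_loss = loss(y_true, raw_prediction, sample_weight)   # weighted average
+#     grad = loss.gradient(y_true, raw_prediction)              # per-sample gradients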
+ +import numpy as np +from scipy.special import xlogy +from ._loss import ( + cLossFunction, + cHalfSquaredError, + cAbsoluteError, + cPinballLoss, + cHalfPoissonLoss, + cHalfGammaLoss, + cHalfTweedieLoss, + cBinaryCrossEntropy, + cCategoricalCrossEntropy, +) +from .link import ( + Interval, + is_in_interval_range, + BaseLink, + IdentityLink, + LogLink, + LogitLink, + MultinomialLogit, +) +from ..utils.stats import _weighted_percentile + + +# Note: The shape of raw_prediction for multiclass classifications are +# - GradientBoostingClassifier: (n_samples, n_classes) +# - HistGradientBoostingClassifier: (n_classes, n_samples) +class BaseLoss(BaseLink, cLossFunction): + """Base class for a loss function of 1-dimensional targets. + + Conventions: + + - y_true.shape = sample_weight.shape = (n_samples,) + - y_pred.shape = raw_prediction.shape = (n_samples,) + - If n_classes >= 3 (multiclass classification), then + y_pred.shape = raw_prediction.shape = (n_samples, n_classes) + Note that this corresponds to the return value of decision_function. + + y_true, y_pred, sample_weight and raw_prediction must either be all float64 + or all float32. + gradient and hessian must be either both float64 or both float32. + + Note that y_pred = link.inverse(raw_prediction). + + Specific loss classes can inherit specific link classes to satisfy + BaseLink's abstractmethods. + + Parameters + ---------- + sample_weight : {None, ndarray} + If sample_weight is None, the hessian might be constant. + n_classes : {None, int} + The number of classes for classification, else None. + + Attributes + ---------- + interval_y_true: Interval + Valid interval for y_true + interval_y_pred: Interval + Valid Interval for y_pred + differentiable: bool + Indicates whether or not loss function is differentiable in + raw_prediction everywhere. + need_update_leaves_values: bool + Indicates whether decision trees in gradient boosting need to uptade + leave values after having been fit to the (negative) gradients. + approx_hessian : bool + Indicates whether the hessian is approximated or exact. If, + approximated, it should be larger or equal to the exact one. + constant_hessian : bool + Indicates whether the hessian is one for this loss. + """ + + # Inherited methods from BaseLink: + # - link + # - inverse + # + # Inherited methods from cLossFunction: + # - _loss, _loss_gradient, _gradient, _gradient_hessian + + # For decision trees: + # This variable indicates whether the loss requires the leaves values to + # be updated once the tree has been trained. The trees are trained to + # predict a Newton-Raphson step (see grower._finalize_leaf()). But for + # some losses (e.g. least absolute deviation) we need to adjust the tree + # values to account for the "line search" of the gradient descent + # procedure. See the original paper Greedy Function Approximation: A + # Gradient Boosting Machine by Friedman + # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. + need_update_leaves_values = False + differentiable = True + + def __init__(self, n_classes=1): + self.approx_hessian = False + self.constant_hessian = False + self.n_classes = n_classes + self.interval_y_true = Interval(-np.inf, np.inf, False, False) + self.interval_y_pred = Interval(-np.inf, np.inf, False, False) + + def in_y_true_range(self, y): + """Return True if y is in the valid range of y_true. 
+ + Parameters + ---------- + y : ndarray + """ + return is_in_interval_range(y, self.interval_y_true) + + def in_y_pred_range(self, y): + """Return True if y is in the valid range of y_pred. + + Parameters + ---------- + y : ndarray + """ + return is_in_interval_range(y, self.interval_y_pred) + + def loss( + self, + y_true, + raw_prediction, + sample_weight=None, + loss=None, + n_threads=1, + ): + """Compute the pointwise loss value for each input. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + loss : None or C-contiguous array of shape (n_samples,) + A location into which the result is stored. If None, a new array + might be created. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + loss : array of shape (n_samples,) + Element-wise loss function. + """ + if loss is None: + loss = np.empty_like(y_true) + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + return self._loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=loss, + n_threads=n_threads, + ) + + def loss_gradient( + self, + y_true, + raw_prediction, + sample_weight=None, + loss=None, + gradient=None, + n_threads=1, + ): + """Compute loss and gradient w.r.t. raw_prediction for each input. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + loss : None or C-contiguous array of shape (n_samples,) + A location into which the loss is stored. If None, a new array + might be created. + gradient : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the gradient is stored. If None, a new array + might be created. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + loss : array of shape (n_samples,) + Element-wise loss function. + + gradient : array of shape (n_samples,) or (n_samples, n_classes) + Element-wise gradients. + """ + if loss is None: + if gradient is None: + loss = np.empty_like(y_true) + gradient = np.empty_like(raw_prediction) + else: + loss = np.empty_like(y_true, dtype=gradient.dtype) + elif gradient is None: + gradient = np.empty_like(raw_prediction, dtype=loss.dtype) + + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + if gradient.ndim == 2 and gradient.shape[1] == 1: + gradient = gradient.squeeze(1) + + return self._loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=loss, + gradient=gradient, + n_threads=n_threads, + ) + + def gradient( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient=None, + n_threads=1, + ): + """Compute gradient of loss w.r.t raw_prediction for each input. 
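As a usage sketch of the output buffers described above (assuming the private `sklearn._loss` extension from this patch is built; illustrative only, not part of the patch), a caller can preallocate `loss` and `gradient` once and reuse them across iterations instead of reallocating on every call::

    import numpy as np
    from sklearn._loss.loss import HalfSquaredError

    loss_fn = HalfSquaredError()
    y_true = np.array([0.5, 1.0, 2.0])
    raw_prediction = np.zeros_like(y_true)
    out_loss = np.empty_like(y_true)
    out_gradient = np.empty_like(raw_prediction)
    for _ in range(10):  # stand-in for boosting / solver iterations
        lval, grad = loss_fn.loss_gradient(
            y_true=y_true,
            raw_prediction=raw_prediction,
            loss=out_loss,
            gradient=out_gradient,
        )
        raw_prediction = raw_prediction - 0.5 * grad  # toy gradient step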
+ + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + gradient : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the result is stored. If None, a new array + might be created. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + gradient : array of shape (n_samples,) or (n_samples, n_classes) + Element-wise gradients. + """ + if gradient is None: + gradient = np.empty_like(raw_prediction) + + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + if gradient.ndim == 2 and gradient.shape[1] == 1: + gradient = gradient.squeeze(1) + + return self._gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=gradient, + n_threads=n_threads, + ) + + def gradient_hessian( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient=None, + hessian=None, + n_threads=1, + ): + """Compute gradient and hessian of loss w.r.t raw_prediction. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + gradient : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the gradient is stored. If None, a new array + might be created. + hessian : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the hessian is stored. If None, a new array + might be created. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + gradient : arrays of shape (n_samples,) or (n_samples, n_classes) + Element-wise gradients. + + hessian : arrays of shape (n_samples,) or (n_samples, n_classes) + Element-wise hessians. + """ + if gradient is None: + if hessian is None: + gradient = np.empty_like(raw_prediction) + hessian = np.empty_like(raw_prediction) + else: + gradient = np.empty_like(hessian) + elif hessian is None: + hessian = np.empty_like(gradient) + + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + if gradient.ndim == 2 and gradient.shape[1] == 1: + gradient = gradient.squeeze(1) + if hessian.ndim == 2 and hessian.shape[1] == 1: + hessian = hessian.squeeze(1) + + return self._gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=gradient, + hessian=hessian, + n_threads=n_threads, + ) + + def __call__( + self, y_true, raw_prediction, sample_weight=None, n_threads=1 + ): + """Compute the weighted average loss. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). 
+ sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + loss : float + Mean or averaged loss function. + """ + return np.average( + self.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + loss=None, + n_threads=n_threads, + ), + weights=sample_weight, + ) + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. + + This can be used as initial estimates of predictions, i.e. before the + first iteration in fit. + + Parameters + ---------- + y_true : array-like of shape (n_samples,) + Observed, true target values. + sample_weight : None or array of shape (n_samples,) + Sample weights. + + Returns + ------- + raw_prediction : float or (n_classes,) + Raw predictions of an intercept-only model. + """ + # As default, take weighted average of the target over the samples + # axis=0 and then transform into link-scale (raw_prediction). + y_pred = np.average(y_true, weights=sample_weight, axis=0) + eps = 10 * np.finfo(y_pred.dtype).eps + + if self.interval_y_pred.low == -np.inf: + a_min = None + elif self.interval_y_pred.low_inclusive: + a_min = self.interval_y_pred.low + else: + a_min = self.interval_y_pred.low + eps + + if self.interval_y_pred.high == np.inf: + a_max = None + elif self.interval_y_pred.high_inclusive: + a_max = self.interval_y_pred.high + else: + a_max = self.interval_y_pred.high - eps + + if a_min is None and a_max is None: + return self.link(y_pred) + else: + return self.link(np.clip(y_pred, a_min, a_max)) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + """Calculate term dropped in loss. + + With this term added, the loss of perfect predictions is zero. + """ + return np.zeros_like(y_true) + + +class HalfSquaredError(IdentityLink, BaseLoss, cHalfSquaredError): + """Half Squared Error with identity link, for regression. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + + For a given sample x_i, half squares error is defined as:: + + loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2 + + The factor of 0.5 simplifies the computation of gradients and results in a + unit hessian (and be consistent with what is done in LightGBM). 
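The formulas above translate to a few lines of NumPy; a minimal sketch (illustration only, not the Cython implementation) showing that the 0.5 factor makes the gradient the plain residual and the hessian identically one::

    import numpy as np

    def half_squared_error_sketch(y_true, raw_prediction):
        loss = 0.5 * (y_true - raw_prediction) ** 2
        gradient = raw_prediction - y_true
        hessian = np.ones_like(raw_prediction)
        return loss, gradient, hessian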
+ """ + + def __init__(self, sample_weight=None): + super().__init__() + if sample_weight is None: + self.constant_hessian = True + else: + self.constant_hessian = False + + def gradient( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient=None, + n_threads=1, + ): + # easier in numpy + # gradient = raw_prediction - y_true is easier in numpy + + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + if ( + gradient is not None + and gradient.ndim == 2 + and gradient.shape[1] == 1 + ): + gradient = gradient.squeeze(1) + + gradient = np.subtract(raw_prediction, y_true, out=gradient) + if sample_weight is None: + return gradient + else: + return np.multiply(sample_weight, gradient, out=gradient) + + def gradient_hessian( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient=None, + hessian=None, + n_threads=1, + ): + # easier in numpy + gradient = self.gradient( + y_true, raw_prediction, sample_weight, gradient, hessian + ) + if hessian is None: + hessian = np.empty_like(gradient) + elif hessian.ndim == 2 and hessian.shape[1] == 1: + # Be graceful to shape (n_samples, 1) -> (n_samples,) + hessian = hessian.squeeze(1) + if sample_weight is None: + hessian.fill(1) + else: + np.copyto(hessian, sample_weight) + return gradient, hessian + + +class AbsoluteError(IdentityLink, BaseLoss, cAbsoluteError): + """Least absolute error, for regression. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + + For a given sample x_i, the absolute error is defined as:: + + loss(x_i) = |y_true_i - raw_prediction_i| + """ + + differentiable = False + need_update_leaves_values = True + + def __init__(self, sample_weight=None): + super().__init__() + self.approx_hessian = True + if sample_weight is None: + self.constant_hessian = True + else: + self.constant_hessian = False + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. + + This is the weighted median of the target, i.e. over the samples + axis=0. + """ + if sample_weight is None: + return np.median(y_true, axis=0) + else: + return _weighted_percentile(y_true, sample_weight, 50) + + +class PinballLoss(IdentityLink, BaseLoss, cPinballLoss): + """Quantile Loss aka Pinball Loss, for regression. + + Domain: + y_true and y_pred all real numbers + quantile in (0, 1) + + Link: + y_pred = raw_prediction + + For a given sample x_i, the pinball loss loss is defined as:: + + loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i) + + rho_{quantile}(u) = u * (quantile - 1_{u<0}) + = -u (1 - quantile) if u < 0 + u * quantile if u >= 0 + + Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError(). + + Additional Attributes + --------------------- + quantile : float + The quantile to be estimated. Must be in range (0, 1). + """ + + differentiable = False + need_update_leaves_values = True + + def __init__(self, sample_weight=None, quantile=0.5): + BaseLoss.__init__(self) + cPinballLoss.__init__(self, quantile=float(quantile)) + self.approx_hessian = True + if sample_weight is None: + self.constant_hessian = True + else: + self.constant_hessian = False + if quantile <= 0 or quantile >= 1: + raise ValueError( + f"PinballLoss aka quantile loss only accepts " + f"0 < quantile < 1; {quantile} was given." + ) + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. 
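A NumPy rendering of the pinball loss defined above (illustration only), plus a quick check of the stated relation that `2 * PinballLoss(quantile=0.5)` equals `AbsoluteError()`::

    import numpy as np

    def pinball_loss_sketch(y_true, raw_prediction, quantile=0.5):
        u = y_true - raw_prediction
        return u * (quantile - (u < 0))

    u = np.array([-2.0, 0.0, 3.0])
    assert np.allclose(2 * pinball_loss_sketch(u, 0.0, quantile=0.5), np.abs(u))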
+ + This is the weighted median of the target, i.e. over the samples + axis=0. + """ + if sample_weight is None: + return np.percentile(y_true, 100 * self.quantile, axis=0) + else: + return _weighted_percentile( + y_true, sample_weight, 100 * self.quantile + ) + + +class HalfPoissonLoss(LogLink, BaseLoss, cHalfPoissonLoss): + """Poisson deviance loss with log-link, for regression. + + Domain: + y_true in non-negative real numbers + y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + For a given sample x_i, half the Poisson deviance is defined as:: + + loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i)) + - y_true_i + exp(raw_prediction_i) + + Half the Poisson deviance is actually the negative log likelihood up to + constant terms (not involving raw_prediction) and simplifies the + computation of the gradients. + We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`. + """ + + def __init__(self, sample_weight=None): + super().__init__() + self.interval_y_true = Interval(0, np.inf, True, False) + self.interval_y_pred = Interval(0, np.inf, False, False) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + term = xlogy(y_true, y_true) - y_true + if sample_weight is not None: + term *= sample_weight + return term + + +class HalfGammaLoss(LogLink, BaseLoss, cHalfGammaLoss): + """Gamma deviance loss with log-link, for regression. + + Domain: + y_true and y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + For a given sample x_i, half Gamma deviance loss is defined as:: + + loss(x_i) = log(exp(raw_prediction_i)/y_true_i) + + y_true/exp(raw_prediction_i) - 1 + + Half the Gamma deviance is actually proportional the negative log + likelihood up constant terms (not involving raw_prediction) and simplifies + the computation of the gradients. + We also skip the constant term `-log(y_true_i) - 1`. + """ + + def __init__(self, sample_weight=None): + super().__init__() + self.interval_y_true = Interval(0, np.inf, False, False) + self.interval_y_pred = Interval(0, np.inf, False, False) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + term = -np.log(y_true) - 1 + if sample_weight is not None: + term *= sample_weight + return term + + +class HalfTweedieLoss(LogLink, BaseLoss, cHalfTweedieLoss): + """Tweedie deviance loss with log-link, for regression. + + Domain: + y_true in real numbers for power <= 0 + y_true in non-negative real numbers for 0 < power < 2 + y_true in positive real numbers for 2 <= power + y_pred in positive real numbers + power in real numbers + + Link: + y_pred = exp(raw_prediction) + + For a given sample x_i, half Tweedie deviance loss with p=power is defined + as:: + + loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p) + - y_true_i * exp(raw_prediction_i)**(1-p) / (1-p) + + exp(raw_prediction_i)**(2-p) / (2-p) + + Taking the limits for p=0, 1, 2 gives HalfSquaredError with a log link, + HalfPoissonLoss and HalfGammaLoss. + + We also skip constant terms, but those are different for p=0, 1, 2. + Therefore, the loss is not continuous in `power`. + + Note furthermore that although no Tweedie distribution exists for + 0 < power < 1, it still gives a strictly consistent scoring function for + the expectation. 
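The half Tweedie deviance above maps directly to NumPy; a sketch (illustration only) valid for `power` outside {0, 1, 2}, where the dedicated limiting losses apply instead::

    import numpy as np

    def half_tweedie_loss_sketch(y_true, raw_prediction, power):
        p = power  # assumed not in {0, 1, 2}
        y_pred = np.exp(raw_prediction)
        return (
            np.maximum(y_true, 0) ** (2 - p) / ((1 - p) * (2 - p))
            - y_true * y_pred ** (1 - p) / (1 - p)
            + y_pred ** (2 - p) / (2 - p)
        )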
+ """ + + def __init__(self, sample_weight=None, power=1.5): + BaseLoss.__init__(self) + cHalfTweedieLoss.__init__(self, power=power) + self.interval_y_pred = Interval(0, np.inf, False, False) + if self.power <= 0: + self.interval_y_true = Interval(-np.inf, np.inf, False, False) + elif self.power < 2: + self.interval_y_true = Interval(0, np.inf, True, False) + else: + self.interval_y_true = Interval(0, np.inf, False, False) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + if self.power == 0: + return HalfSquaredError().constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + elif self.power == 1: + return HalfPoissonLoss().constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + elif self.power == 2: + return HalfGammaLoss().constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + else: + p = self.power + term = np.power(np.maximum(y_true, 0), 2 - p) / (1 - p) / (2 - p) + if sample_weight is not None: + term *= sample_weight + return term + + +class BinaryCrossEntropy(LogitLink, BaseLoss, cBinaryCrossEntropy): + """Binary cross entropy loss for binary classification. + + Domain: + y_true in [0, 1] + y_pred in (0, 1), i.e. boundaries excluded + + Link: + y_pred = expit(raw_prediction) + + For a given sample x_i, the binary cross-entropy, aka log loss, is defined + as the negative log-likelihood of the Bernoulli distributions and can be + expressed as:: + + loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i + + See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman, + section 4.4.1 (about logistic regression). + """ + + def __init__(self, sample_weight=None): + super().__init__(n_classes=2) + self.interval_y_true = Interval(0, 1, True, True) + self.interval_y_pred = Interval(0, 1, False, False) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + # This is non-zero only if y_true is neither 0 nor 1. + term = xlogy(y_true, y_true) + xlogy(1 - y_true, 1 - y_true) + if sample_weight is not None: + term *= sample_weight + return term + + def predict_proba(self, raw_prediction): + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + proba = np.empty( + (raw_prediction.shape[0], 2), dtype=raw_prediction.dtype + ) + proba[:, 1] = self.inverse(raw_prediction) + proba[:, 0] = 1 - proba[:, 1] + return proba + + +class CategoricalCrossEntropy( + MultinomialLogit, BaseLoss, cCategoricalCrossEntropy +): + """Categorical cross-entropy loss for multiclass classification. + + Domain: + y_true in {0, 1, 2, 3, .., n_classes - 1} + y_pred a n_classes array, each element in (0, 1) + + Link: + y_pred = softmax(raw_prediction) + + Note: We assume y_true to be already label encoded. + + For a given sample x_i, the categorical cross-entropy loss is defined as + the negative log-likelihood of the multinomial distribution, it generalizes + the binary cross-entropy to more than 2 classes:: + + loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1)) + - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1) + + See [1]. + + Note that for the hessian, we calculate only the diagonal part in the + classes: If the full hessian for classes k and l and sample i is H_i_k_l, + we calculate H_i_k_k, i.e. k=l. + + Reference + --------- + .. [1] Simon, Noah, J. Friedman and T. Hastie. + "A Blockwise Descent Algorithm for Group-penalized Multiresponse and + Multinomial Regression." 
+ https://arxiv.org/pdf/1311.6529.pdf + """ + + def __init__(self, sample_weight=None, n_classes=3): + super().__init__(n_classes=n_classes) + self.interval_y_true = Interval(0, np.inf, True, False) + self.interval_y_pred = Interval(0, 1, False, False) + + def in_y_true_range(self, y): + """Return True if y is in the valid range of y_true. + + Parameters + ---------- + y : ndarray + """ + return is_in_interval_range(y, self.interval_y_true) and np.all( + y.astype(np.int) == y + ) + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. + + This is the softmax of the weighted average of the target, i.e. over + the samples axis=0. + """ + out = np.zeros(self.n_classes, dtype=y_true.dtype) + eps = np.finfo(y_true.dtype).eps + for k in range(self.n_classes): + out[k] = np.average(y_true == k, weights=sample_weight, axis=0) + out[k] = np.clip(out[k], eps, 1 - eps) + return self.link(out[None, :]).reshape(-1) + + def predict_proba(self, raw_prediction): + return self.inverse(raw_prediction) + + def gradient_proba( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient=None, + proba=None, + n_threads=1, + ): + """Compute gradient and probabilities of loss w.r.t raw_prediction. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + gradient : None or array of shape (n_samples, n_classes) + A location into which the gradient is stored. If None, a new array + might be created. + proba : None or array of shape (n_samples, n_classes) + A location into which the class probabilities are stored. If None, + a new array might be created. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + gradient, proba : array of shape (n_samples, n_classes) + Element-wise gradients. + + proba : array of shape (n_samples, n_classes) + Element-wise class probabilites. 
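A plain-NumPy sketch (hypothetical helper name, illustration only) of what `fit_intercept_only` above computes: class frequencies mapped through the symmetric multinomial logit `log(p_k) - mean(log(p))`, which sums to zero over the classes::

    import numpy as np

    def intercept_only_sketch(y_true, n_classes):
        p = np.array([(y_true == k).mean() for k in range(n_classes)])
        eps = np.finfo(float).eps
        p = np.clip(p, eps, 1 - eps)  # guard against empty classes
        return np.log(p) - np.log(p).mean()

    raw = intercept_only_sketch(np.array([0.0, 0, 1, 2, 2, 2]), n_classes=3)
    assert abs(raw.sum()) < 1e-12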
+ """ + if gradient is None: + if proba is None: + gradient = np.empty_like(raw_prediction) + proba = np.empty_like(raw_prediction) + else: + gradient = np.empty_like(proba) + elif proba is None: + proba = np.empty_like(gradient) + + return self._gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=gradient, + proba=proba, + n_threads=n_threads, + ) + + +_LOSSES = { + "squared_error": HalfSquaredError, + "absolute_error": AbsoluteError, + "pinball_loss": PinballLoss, + "poisson_loss": HalfPoissonLoss, + "gamma_loss": HalfGammaLoss, + "tweedie_loss": HalfTweedieLoss, + "binary_crossentropy": BinaryCrossEntropy, + "categorical_crossentropy": CategoricalCrossEntropy, +} diff --git a/sklearn/_loss/setup.py b/sklearn/_loss/setup.py new file mode 100644 index 0000000000000..23d35439885ba --- /dev/null +++ b/sklearn/_loss/setup.py @@ -0,0 +1,20 @@ +import numpy +from numpy.distutils.misc_util import Configuration + + +def configuration(parent_package="", top_path=None): + config = Configuration("_loss", parent_package, top_path) + + config.add_extension( + "_loss", sources=["_loss.pyx"], include_dirs=[numpy.get_include()] + ) + + # config.add_subpackage("tests") + + return config + + +if __name__ == "__main__": + from numpy.distutils.core import setup + + setup(**configuration().todict()) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py new file mode 100644 index 0000000000000..5f7e001f2d6de --- /dev/null +++ b/sklearn/_loss/tests/test_loss.py @@ -0,0 +1,814 @@ +import numpy as np +from numpy.testing import assert_allclose, assert_array_equal +import pytest +from pytest import approx +from scipy.optimize import ( + minimize, + minimize_scalar, + newton, +) +from scipy.special import logit + +from sklearn._loss.link import _inclusive_low_high +from sklearn._loss.loss import ( + _LOSSES, + AbsoluteError, + BinaryCrossEntropy, + CategoricalCrossEntropy, + HalfGammaLoss, + HalfPoissonLoss, + HalfSquaredError, + HalfTweedieLoss, + PinballLoss, +) +from sklearn.utils import assert_all_finite +from sklearn.utils._testing import skip_if_32bit +from sklearn.utils.fixes import sp_version, parse_version + + +ALL_LOSSES = list(_LOSSES.values()) + +LOSS_INSTANCES = [loss() for loss in ALL_LOSSES] +# HalfTweedieLoss(power=1.5) is already there as default +LOSS_INSTANCES += [ + PinballLoss(quantile=0.25), + HalfTweedieLoss(power=-1.5), + HalfTweedieLoss(power=0), + HalfTweedieLoss(power=1), + HalfTweedieLoss(power=2), + HalfTweedieLoss(power=3.0), +] + + +def loss_instance_name(loss): + name = loss.__class__.__name__ + if hasattr(loss, "quantile"): + name += f"(quantile={loss.quantile})" + elif hasattr(loss, "power"): + name += f"(power={loss.power})" + return name + + +def random_y_true_raw_prediction( + loss, n_samples, y_bound=(-100, 100), raw_bound=(-5, 5), seed=42 +): + """Random generate y_true and raw_prediction in valid range.""" + rng = np.random.RandomState(seed) + if loss.n_classes <= 2: + raw_prediction = rng.uniform( + low=raw_bound[0], high=raw_bound[0], size=n_samples + ) + # generate a y_true in valid range + low, high = _inclusive_low_high(loss.interval_y_true) + low = max(low, y_bound[0]) + high = min(high, y_bound[1]) + y_true = rng.uniform(low, high, size=n_samples) + # set some values at special boundaries + if ( + loss.interval_y_true.low == 0 + and loss.interval_y_true.low_inclusive + ): + y_true[:: (n_samples // 3)] = 0 + if ( + loss.interval_y_true.high == 1 + and loss.interval_y_true.high_inclusive + 
): + y_true[1:: (n_samples // 3)] = 1 + else: + raw_prediction = np.empty((n_samples, loss.n_classes)) + raw_prediction.flat[:] = rng.uniform( + low=raw_bound[0], + high=raw_bound[1], + size=n_samples * loss.n_classes, + ) + y_true = np.arange(n_samples).astype(float) % loss.n_classes + + return y_true, raw_prediction + + +def numerical_derivative(func, x, eps): + """Helper function for numerical (first) derivatives. + + # For numerical derivatives, see + # https://en.wikipedia.org/wiki/Numerical_differentiation + # https://en.wikipedia.org/wiki/Finite_difference_coefficient + # We use central finite differences of accuracy 4. + """ + h = np.full_like(x, fill_value=eps) + f_minus_2h = func(x - 2 * h) + f_minus_1h = func(x - h) + f_plus_1h = func(x + h) + f_plus_2h = func(x + 2 * h) + return (-f_plus_2h + 8 * f_plus_1h - 8 * f_minus_1h + f_minus_2h) / ( + 12.0 * eps + ) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +def test_loss_boundary(loss): + # make sure low and high are always within the interval, used for linspace + if loss.n_classes is None or loss.n_classes <= 2: + low, high = _inclusive_low_high(loss.interval_y_true) + y_true = np.linspace(low, high, num=10) + else: + y_true = np.linspace(0, 9, num=10) + + # add boundaries if they are included + if loss.interval_y_true.low_inclusive: + y_true = np.r_[y_true, loss.interval_y_true.low] + if loss.interval_y_true.high_inclusive: + y_true = np.r_[y_true, loss.interval_y_true.high] + + assert loss.in_y_true_range(y_true) + + low, high = _inclusive_low_high(loss.interval_y_pred) + if loss.n_classes is None or loss.n_classes <= 2: + y_pred = np.linspace(low, high, num=10) + else: + y_pred = np.empty((10, 3)) + y_pred[:, 0] = np.linspace(low, high, num=10) + y_pred[:, 1] = 0.5 * (1 - y_pred[:, 0]) + y_pred[:, 2] = 0.5 * (1 - y_pred[:, 0]) + + assert loss.in_y_pred_range(y_pred) + + # calculating losses should not fail + raw_prediction = loss.link(y_pred) + loss.loss(y_true=y_true, raw_prediction=raw_prediction) + + +@pytest.mark.parametrize( + "loss, y_true_success, y_true_fail", + [ + (HalfSquaredError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (AbsoluteError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (PinballLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (HalfPoissonLoss(), [0, 0.1, 100], [-np.inf, -3, -0.1, np.inf]), + (HalfGammaLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLoss(power=-3), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLoss(power=0), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + ( + HalfTweedieLoss(power=1.5), + [0, 0.1, 100], + [-np.inf, -3, -0.1, np.inf], + ), + (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (BinaryCrossEntropy(), [0, 0.5, 1], [-np.inf, -1, 2, np.inf]), + (CategoricalCrossEntropy(), [0.0, 1.0, 2], [-np.inf, -1, 1.1, np.inf]), + ], +) +def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): + # Test boundaries of y_true for loss functions. 
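The fourth-order central difference used in `numerical_derivative` above can be sanity-checked against a function with a known derivative; a self-contained snippet (the stencil is duplicated here so it runs on its own)::

    import numpy as np

    def central_diff4(func, x, eps):
        h = np.full_like(x, eps)
        return (-func(x + 2 * h) + 8 * func(x + h)
                - 8 * func(x - h) + func(x - 2 * h)) / (12 * eps)

    x = np.linspace(-1.0, 1.0, 5)
    err = np.max(np.abs(central_diff4(np.sin, x, eps=1e-3) - np.cos(x)))
    assert err < 1e-10  # truncation error is O(eps**4)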
+ for y in y_true_success: + assert loss.in_y_true_range(np.array([y])) + for y in y_true_fail: + assert not loss.in_y_true_range(np.array([y])) + + +@pytest.mark.parametrize( + "loss, y_pred_success, y_pred_fail", + [ + (HalfSquaredError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (AbsoluteError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (PinballLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (HalfPoissonLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfGammaLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + ( + HalfTweedieLoss(power=-3), + [0.1, 100], + [-np.inf, -3, -0.1, 0, np.inf], + ), + (HalfTweedieLoss(power=0), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + ( + HalfTweedieLoss(power=1.5), + [0.1, 100], + [-np.inf, -3, -0.1, 0, np.inf], + ), + (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (BinaryCrossEntropy(), [0.1, 0.5], [-np.inf, 0, 1, np.inf]), + (CategoricalCrossEntropy(), [0.1, 0.5], [-np.inf, 0, 1, np.inf]), + ], +) +def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): + # Test boundaries of y_pred for loss functions. + for y in y_pred_success: + assert loss.in_y_pred_range(np.array([y])) + for y in y_pred_fail: + assert not loss.in_y_pred_range(np.array([y])) + + +@pytest.mark.parametrize("loss", ALL_LOSSES) +@pytest.mark.parametrize("dtype_in", [np.float32, np.float64]) +@pytest.mark.parametrize("dtype_out", [np.float32, np.float64]) +@pytest.mark.parametrize("sample_weight", [None, 1]) +@pytest.mark.parametrize("out1", [None, 1]) +@pytest.mark.parametrize("out2", [None, 1]) +@pytest.mark.parametrize("n_threads", [1, 2]) +def test_loss_dtype( + loss, dtype_in, dtype_out, sample_weight, out1, out2, n_threads +): + # Test that loss accepts if all input arrays are either all float32 or all + # float64, and all output arrays are either all float32 or all float64. 
+ loss = loss() + if loss.n_classes <= 2: + # generate a y_true in valid range + low, high = _inclusive_low_high(loss.interval_y_true, dtype=dtype_in) + y_true = np.array([0.5 * (high - low)], dtype=dtype_in) + raw_prediction = np.array([0.0], dtype=dtype_in) + else: + y_true = np.array([0], dtype=dtype_in) + raw_prediction = np.full( + shape=(1, loss.n_classes), fill_value=0.0, dtype=dtype_in + ) + + if sample_weight is not None: + sample_weight = np.array([2.0], dtype=dtype_in) + if out1 is not None: + out1 = np.empty_like(y_true, dtype=dtype_out) + if out2 is not None: + out2 = np.empty_like(raw_prediction, dtype=dtype_out) + + loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out1, + n_threads=n_threads, + ) + loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out2, + n_threads=n_threads, + ) + loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out1, + gradient=out2, + n_threads=n_threads, + ) + if out1 is not None and loss.n_classes >= 3: + out1 = np.empty_like(raw_prediction, dtype=dtype_out) + loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out1, + hessian=out2, + n_threads=n_threads, + ) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_same_as_C_functions(loss, sample_weight): + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=20, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=42, + ) + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + out_l1 = np.empty_like(y_true) + out_l2 = np.empty_like(y_true) + out_g1 = np.empty_like(raw_prediction) + out_g2 = np.empty_like(raw_prediction) + out_h1 = np.empty_like(raw_prediction) + out_h2 = np.empty_like(raw_prediction) + assert_allclose( + loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out_l1, + ), + loss._loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out_l2, + ), + ) + assert_allclose( + loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out_g1, + ), + loss._gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out_g2, + ), + ) + loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out_l1, + gradient=out_g1, + ) + loss._loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out_l2, + gradient=out_g2, + ) + assert_allclose(out_l1, out_l2) + assert_allclose(out_g1, out_g2) + loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out_g1, + hessian=out_h1, + ) + loss._gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out_g2, + hessian=out_h2, + ) + assert_allclose(out_g1, out_g2) + assert_allclose(out_h1, out_h2) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_gradients_are_the_same(loss, sample_weight): + # Test that loss and gradient are the same accross different functions + # Also test that output 
arguments contain correct result. + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=20, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=42, + ) + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + out_l1 = np.empty_like(y_true) + out_l2 = np.empty_like(y_true) + out_g1 = np.empty_like(raw_prediction) + out_g2 = np.empty_like(raw_prediction) + out_g3 = np.empty_like(raw_prediction) + out_h3 = np.empty_like(raw_prediction) + + l1 = loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out_l1, + ) + g1 = loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out_g1, + ) + l2, g2 = loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out_l2, + gradient=out_g2, + ) + g3, h3 = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out_g3, + hessian=out_h3, + ) + assert_allclose(l1, l2) + assert_array_equal(l1, out_l1) + assert np.shares_memory(l1, out_l1) + assert_array_equal(l2, out_l2) + assert np.shares_memory(l2, out_l2) + assert_allclose(g1, g2) + assert_allclose(g1, g3) + assert_array_equal(g1, out_g1) + assert np.shares_memory(g1, out_g1) + assert_array_equal(g2, out_g2) + assert np.shares_memory(g2, out_g2) + assert_array_equal(g3, out_g3) + assert np.shares_memory(g3, out_g3) + + if hasattr(loss, "gradient_proba"): + assert loss.n_classes >= 3 # only for CategoricalCrossEntropy + out_g4 = np.empty_like(raw_prediction) + out_proba = np.empty_like(raw_prediction) + g4, proba = loss.gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out_g4, + proba=out_proba, + ) + assert_allclose(g1, out_g4) + assert_allclose(g1, g4) + assert_allclose(proba, out_proba) + assert_allclose(np.sum(proba, axis=1), 1) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", ["ones", "random"]) +def test_sample_weight_multiplies_gradients(loss, sample_weight): + # Make sure that passing sample weights to the gradient and hessians + # computation methods is equivalent to multiplying by the weights. 
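The property checked in this test follows from linearity of differentiation; a self-contained finite-difference illustration for the binary cross-entropy (illustration only, not using the compiled loss classes)::

    import numpy as np
    from scipy.special import expit

    rng = np.random.RandomState(0)
    y = rng.randint(0, 2, size=5).astype(float)
    raw = rng.normal(size=5)
    sw = rng.uniform(1.0, 2.0, size=5)

    def weighted_loss(r):
        # sample_weight times the log loss, log(1 + exp(r)) - y * r
        return sw * (np.logaddexp(0.0, r) - y * r)

    eps = 1e-6
    num_grad = (weighted_loss(raw + eps) - weighted_loss(raw - eps)) / (2 * eps)
    assert np.allclose(num_grad, sw * (expit(raw) - y), atol=1e-7)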
+ + n_samples = 100 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=42, + ) + + if sample_weight == "ones": + sample_weight = np.ones(shape=n_samples, dtype=np.float64) + else: + rng = np.random.RandomState(42) + sample_weight = rng.normal(size=n_samples).astype(np.float64) + + baseline_prediction = loss.fit_intercept_only( + y_true=y_true, sample_weight=None + ) + + if loss.n_classes <= 2: + raw_prediction = np.zeros( + shape=(n_samples,), dtype=baseline_prediction.dtype + ) + else: + raw_prediction = np.zeros( + shape=(n_samples, loss.n_classes), dtype=baseline_prediction.dtype + ) + raw_prediction += baseline_prediction + + gradient, hessian = loss.gradient_hessian( + y_true=y_true, raw_prediction=raw_prediction, sample_weight=None + ) + + gradient_sw, hessian_sw = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + + if loss.n_classes <= 2: + assert_allclose(gradient * sample_weight, gradient_sw) + assert_allclose(hessian * sample_weight, hessian_sw) + else: + assert_allclose(gradient * sample_weight[:, None], gradient_sw) + assert_allclose(hessian * sample_weight[:, None], hessian_sw) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_of_perfect_prediction(loss, sample_weight): + # Test that loss of y_true = y_pred plus constant_to_optimal_zero sums up + # to zero. + if loss.n_classes <= 2: + # Use small values such that exp(value) is not nan. + raw_prediction = np.array([-10, -0.1, 0, 0.1, 3, 10]) + y_true = loss.inverse(raw_prediction) + else: + # CategoricalCrossEntropy + y_true = np.arange(loss.n_classes).astype(float) + # raw_prediction with entries -exp(10), but +exp(10) on the diagonal + # this is close enough to np.inf which would produce nan + raw_prediction = np.full( + shape=(loss.n_classes, loss.n_classes), + fill_value=-np.exp(10), + dtype=float, + ) + raw_prediction.flat[:: loss.n_classes + 1] = np.exp(10) + + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + loss_value = loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + constant_term = loss.constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + # Comparing loss_value + constant_term to zero would result in large + # round-off errors. + assert_allclose(loss_value, -constant_term, atol=1e-14, rtol=1e-15) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_gradients_hessians_numerically(loss, sample_weight): + # Test that gradients are computed correctly by comparing to numerical + # derivatives of loss functions. + # Test that hessians are correct by numerical derivative of gradients. 
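Related to `test_loss_of_perfect_prediction` above, a small NumPy illustration of the `constant_to_optimal_zero` convention for the Poisson deviance: adding back the dropped term `y*log(y) - y` makes the loss of a perfect prediction numerically zero::

    import numpy as np
    from scipy.special import xlogy

    y_true = np.array([0.5, 1.0, 3.5])
    raw = np.log(y_true)                  # perfect prediction: y_pred == y_true
    loss = np.exp(raw) - y_true * raw     # half Poisson deviance as implemented
    dropped = xlogy(y_true, y_true) - y_true
    assert np.allclose(loss + dropped, 0.0, atol=1e-12)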
+ n_samples = 20 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=42, + ) + + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + g, h = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + + assert g.shape == raw_prediction.shape + assert h.shape == raw_prediction.shape + + if loss.n_classes <= 2: + + def loss_func(x): + return loss.loss( + y_true=y_true, raw_prediction=x, sample_weight=sample_weight, + ) + + g_numeric = numerical_derivative(loss_func, raw_prediction, eps=1e-6) + assert_allclose(g, g_numeric, rtol=5e-6, atol=1e-10) + + def grad_func(x): + return loss.gradient( + y_true=y_true, raw_prediction=x, sample_weight=sample_weight, + ) + + h_numeric = numerical_derivative(grad_func, raw_prediction, eps=1e-6) + if loss.approx_hessian: + assert np.all(h >= h_numeric) + else: + assert_allclose(h, h_numeric, rtol=5e-6, atol=1e-10) + else: + # For multiclass loss, we should only change the predictions of the + # class for which the derivative is taken for, e.g. offset[:, k] = eps + # for class k. + # As a softmax is computed, offsetting the whole array by a constant + # would have no effect on the probabilities, and thus on the loss. + for k in range(loss.n_classes): + + def loss_func(x): + raw = raw_prediction.copy() + raw[:, k] = x + return loss.loss( + y_true=y_true, + raw_prediction=raw, + sample_weight=sample_weight, + ) + + g_numeric = numerical_derivative( + loss_func, raw_prediction[:, k], eps=1e-5 + ) + assert_allclose(g[:, k], g_numeric, rtol=5e-6, atol=1e-10) + + def grad_func(x): + raw = raw_prediction.copy() + raw[:, k] = x + return loss.gradient( + y_true=y_true, + raw_prediction=raw, + sample_weight=sample_weight, + )[:, k] + + h_numeric = numerical_derivative( + grad_func, raw_prediction[:, k], eps=1e-6 + ) + if loss.approx_hessian: + assert np.all(h >= h_numeric) + else: + assert_allclose(h[:, k], h_numeric, rtol=5e-6, atol=1e-10) + + +@pytest.mark.parametrize( + "loss, x0, y_true", + [ + ("squared_error", -2.0, 42), + ("squared_error", 117.0, 1.05), + ("squared_error", 0.0, 0.0), + # The argmin of binary_crossentropy for y_true=0 and y_true=1 is resp. + # -inf and +inf due to logit, cf. "complete separation". Therefore, we + # use 0 < y_true < 1. + ("binary_crossentropy", 0.3, 0.1), + ("binary_crossentropy", -12, 0.2), + ("binary_crossentropy", 30, 0.9), + ("poisson_loss", 12.0, 1.0), + ("poisson_loss", 0.0, 2.0), + ("poisson_loss", -22.0, 10.0), + ], +) +@pytest.mark.skipif( + sp_version == parse_version("1.2.0"), + reason="bug in scipy 1.2.0, see scipy issue #9608", +) +@skip_if_32bit +def test_derivatives(loss, x0, y_true): + # Check that gradients are zero when the loss is minimized on a single + # value/sample using Halley's method with the first and second order + # derivatives computed by the Loss instance. + # Note that methods of Loss instances operate on arrays while the newton + # root finder expects a scalar or a one-element array for this purpose. + + loss = _LOSSES[loss](sample_weight=None) + y_true = np.array([y_true], dtype=np.float64) + x0 = np.array([x0], dtype=np.float64) + + def func(x: np.ndarray) -> np.ndarray: + # Add constant term such that loss has its minimum at zero, which is + # required by the newton method. 
+ return loss.loss( + y_true=y_true, raw_prediction=x + ) + loss.constant_to_optimal_zero(y_true=y_true) + + def fprime(x: np.ndarray) -> np.ndarray: + return loss.gradient(y_true=y_true, raw_prediction=x) + + def fprime2(x: np.ndarray) -> np.ndarray: + return loss.gradient_hessian(y_true=y_true, raw_prediction=x)[1] + + optimum = newton( + func, + x0=x0, + fprime=fprime, + fprime2=fprime2, + maxiter=100, + tol=5e-8, + ) + + # Need to ravel arrays because assert_allclose requires matching dimensions + y_true = y_true.ravel() + optimum = optimum.ravel() + assert_allclose(loss.inverse(optimum), y_true) + assert_allclose(func(optimum), 0, atol=1e-14) + assert_allclose( + loss.gradient(y_true=y_true, raw_prediction=optimum), 0, atol=5e-7 + ) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_intercept_only(loss, sample_weight): + # Test that fit_intercept_only returns the argmin of the loss and that the + # gradient is zero. + n_samples = 50 + if loss.n_classes <= 2: + y_true = loss.inverse(np.linspace(-4, 4, num=n_samples)) + else: + y_true = np.arange(n_samples).astype(float) % loss.n_classes + y_true[::5] = 0 # exceedance of class 0 + + if sample_weight == "range": + sample_weight = np.linspace(0.1, 2, num=n_samples) + + a = loss.fit_intercept_only(y_true=y_true, sample_weight=sample_weight) + + # find minimum by optimization + def fun(x): + if loss.n_classes <= 2: + raw_prediction = np.full(shape=(n_samples), fill_value=x) + else: + raw_prediction = np.ascontiguousarray( + np.broadcast_to(x, shape=(n_samples, loss.n_classes)) + ) + return loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + + if loss.n_classes <= 2: + opt = minimize_scalar(fun, tol=1e-7, options={"maxiter": 100}) + grad = loss.gradient( + y_true=y_true, + raw_prediction=np.full_like(y_true, a), + sample_weight=sample_weight, + ) + assert a.shape == tuple() # scalar + assert a.dtype == y_true.dtype + assert_all_finite(a) + a == approx(opt.x, rel=1e-7) + grad.sum() == approx(0, abs=1e-12) + else: + # constraint corresponds to sum(raw_prediction) = 0 + # without the constraint, we would need to apply + # loss.symmetrize_raw_prediction to opt.x before comparing + # TODO: With scipy 1.1.0, one could use + # LinearConstraint(np.ones((1, loss.n_classes)), 0, 0) + opt = minimize( + fun, + np.empty((loss.n_classes)), + tol=1e-13, + options={"maxiter": 100}, + method="SLSQP", + constraints={ + "type": "eq", + "fun": lambda x: np.ones((1, loss.n_classes)) @ x + }, + ) + grad = loss.gradient( + y_true=y_true, + raw_prediction=np.tile(a, (n_samples, 1)), + sample_weight=sample_weight, + ) + assert a.dtype == y_true.dtype + assert_all_finite(a) + assert_allclose(a, opt.x, rtol=5e-6, atol=1e-12) + assert_allclose(grad.sum(axis=0), 0, atol=1e-12) + + +@pytest.mark.parametrize( + "loss, func, link, low, high, random_dist", + [ + (HalfSquaredError, np.mean, "identity", None, None, "normal"), + (AbsoluteError, np.median, "identity", None, None, "normal"), + (HalfPoissonLoss, np.mean, np.log, 0, None, "poisson"), + (BinaryCrossEntropy, np.mean, logit, 0, 1, "binomial"), + ], +) +def test_specific_fit_intercept_only(loss, func, link, low, high, random_dist): + rng = np.random.RandomState(0) + loss = loss() + if random_dist == "binomial": + y_train = rng.binomial(1, 0.5, size=100) + else: + y_train = getattr(rng, random_dist)(size=100) + baseline_prediction = loss.fit_intercept_only(y_true=y_train) + # 
Make sure baseline prediction is the expected one, i.e. func, e.g. + # mean or median. + assert_all_finite(baseline_prediction) + if link == "identity": + assert baseline_prediction == approx(func(y_train)) + assert_allclose(loss.inverse(baseline_prediction), baseline_prediction) + else: + assert baseline_prediction == approx(link(func(y_train))) + + # Test baseline at boundary + if low is not None: + y_train.fill(low) + baseline_prediction = loss.fit_intercept_only(y_true=y_train) + assert_all_finite(baseline_prediction) + if high is not None: + y_train.fill(high) + baseline_prediction = loss.fit_intercept_only(y_true=y_train) + assert_all_finite(baseline_prediction) + + +def test_categorical_crossentropy_fit_intercept_only(): + rng = np.random.RandomState(0) + n_classes = 4 + loss = CategoricalCrossEntropy(n_classes=n_classes) + # Same logic as test_single_fit_intercept_only. Here inverse link function + # = softmax and link function = log - symmetry term + y_train = rng.randint(0, n_classes + 1, size=100).astype(np.float64) + baseline_prediction = loss.fit_intercept_only(y_true=y_train) + assert baseline_prediction.shape == (n_classes,) + p = np.zeros(n_classes, dtype=y_train.dtype) + for k in range(n_classes): + p[k] = (y_train == k).mean() + assert_allclose(baseline_prediction, np.log(p) - np.mean(np.log(p))) + assert_allclose(baseline_prediction[None, :], loss.link(p[None, :])) + + for y_train in (np.zeros(shape=10), np.ones(shape=10)): + y_train = y_train.astype(np.float64) + baseline_prediction = loss.fit_intercept_only(y_true=y_train) + assert baseline_prediction.dtype == y_train.dtype + assert_all_finite(baseline_prediction) + + +def test_binary_and_categorical_crossentropy(): + # Test that CategoricalCrossEntropy with n_classes = 2 is the same as + # BinaryCrossEntropy + rng = np.random.RandomState(0) + n_samples = 20 + bce = BinaryCrossEntropy() + cce = CategoricalCrossEntropy(n_classes=2) + y_train = rng.randint(0, 2, size=n_samples).astype(np.float64) + raw_prediction = rng.normal(size=n_samples) + raw_cce = np.empty((n_samples, 2)) + raw_cce[:, 0] = -0.5 * raw_prediction + raw_cce[:, 1] = 0.5 * raw_prediction + assert_allclose( + bce.loss(y_true=y_train, raw_prediction=raw_prediction), + cce.loss(y_true=y_train, raw_prediction=raw_cce) + ) From 830b814184a02f78a1f74a7518268e80f4fd356a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 2 Apr 2021 10:57:52 +0200 Subject: [PATCH 003/143] CLN replace deprecated np.int by int --- sklearn/_loss/loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 49d968b6bd2af..2ac09628fed79 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -824,7 +824,7 @@ def in_y_true_range(self, y): y : ndarray """ return is_in_interval_range(y, self.interval_y_true) and np.all( - y.astype(np.int) == y + y.astype(int) == y ) def fit_intercept_only(self, y_true, sample_weight=None): From 9504c89899f428de11d43413822447f526b83c90 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 2 Apr 2021 11:05:37 +0200 Subject: [PATCH 004/143] DOC document default=1 for n_threads --- sklearn/_loss/loss.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 2ac09628fed79..37a04d88c5740 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -155,7 +155,7 @@ def loss( loss : None or C-contiguous array of shape (n_samples,) A location into which the result is stored. 
If None, a new array might be created. - n_threads : int + n_threads : int, default=1 Might use openmp thread parallelism. Returns @@ -203,7 +203,7 @@ def loss_gradient( of shape (n_samples, n_classes) A location into which the gradient is stored. If None, a new array might be created. - n_threads : int + n_threads : int, default=1 Might use openmp thread parallelism. Returns @@ -261,7 +261,7 @@ def gradient( of shape (n_samples, n_classes) A location into which the result is stored. If None, a new array might be created. - n_threads : int + n_threads : int, default=1 Might use openmp thread parallelism. Returns @@ -314,7 +314,7 @@ def gradient_hessian( of shape (n_samples, n_classes) A location into which the hessian is stored. If None, a new array might be created. - n_threads : int + n_threads : int, default=1 Might use openmp thread parallelism. Returns @@ -365,7 +365,7 @@ def __call__( Raw prediction values (in link space). sample_weight : None or C-contiguous array of shape (n_samples,) Sample weights. - n_threads : int + n_threads : int, default=1 Might use openmp thread parallelism. Returns @@ -868,7 +868,7 @@ def gradient_proba( proba : None or array of shape (n_samples, n_classes) A location into which the class probabilities are stored. If None, a new array might be created. - n_threads : int + n_threads : int, default=1 Might use openmp thread parallelism. Returns From fb3bce22177038a5540036818eeaf5c1bf057a52 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 14 Apr 2021 19:05:04 +0200 Subject: [PATCH 005/143] CLN comments and line wrapping --- sklearn/_loss/_loss.pyx | 127 +++++++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 35 deletions(-) diff --git a/sklearn/_loss/_loss.pyx b/sklearn/_loss/_loss.pyx index f94c4118119f9..2e10b1c6ec721 100644 --- a/sklearn/_loss/_loss.pyx +++ b/sklearn/_loss/_loss.pyx @@ -27,7 +27,8 @@ # input checking like None -> np.empty(). # # Note: We require 1-dim ndarrays to be contiguous. -# TODO: Use const memoryviews with Cython 3.0 where appropriate (# IN) +# TODO: Use const memoryviews with fused types with Cython 3.0 where +# appropriate (arguments marked by "# IN") cimport cython from cython.parallel import parallel, prange @@ -57,8 +58,9 @@ cdef inline double log1pexp(double x) nogil: cdef inline void sum_exp_minus_max( - const int i, Y_DTYPE_C[:, :] raw_prediction, # IN - Y_DTYPE_C *p # OUT + const int i, + Y_DTYPE_C[:, :] raw_prediction, # IN + Y_DTYPE_C *p # OUT ) nogil: # Store p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1 # p[-2] = max(raw_prediction_i_k, k = 0 to n_classes-1) @@ -70,7 +72,8 @@ cdef inline void sum_exp_minus_max( # - i needs to be passed (and stays constant) because otherwise Cython does # not generate optimal code, see # https://github.com/scikit-learn/scikit-learn/issues/17299 - # - We do not calculate p[k] = p[k] / sum_exps to save one loop over k. + # - We do not normalize p by calculating p[k] = p[k] / sum_exps. + # This helps to save one loop over k. 
cdef: int k int n_classes = raw_prediction.shape[1] @@ -93,18 +96,23 @@ cdef inline void sum_exp_minus_max( # Single point inline C functions # ------------------------------------- # Half Squared Error -cdef inline double closs_half_squared_error(double y_true, double raw_prediction) nogil: +cdef inline double closs_half_squared_error( + double y_true, + double raw_prediction +) nogil: return 0.5 * (raw_prediction - y_true) * (raw_prediction - y_true) cdef inline double cgradient_half_squared_error( - double y_true, double raw_prediction + double y_true, + double raw_prediction ) nogil: return raw_prediction - y_true cdef inline double2 cgrad_hess_half_squared_error( - double y_true, double raw_prediction + double y_true, + double raw_prediction ) nogil: cdef double2 gh gh.val1 = raw_prediction - y_true # gradient @@ -113,16 +121,23 @@ cdef inline double2 cgrad_hess_half_squared_error( # Absolute Error -cdef inline double closs_absolute_error(double y_true, double raw_prediction) nogil: +cdef inline double closs_absolute_error( + double y_true, + double raw_prediction +) nogil: return fabs(raw_prediction - y_true) -cdef inline double cgradient_absolute_error(double y_true, double raw_prediction) nogil: +cdef inline double cgradient_absolute_error( + double y_true, + double raw_prediction +) nogil: return 1. if raw_prediction > y_true else -1. cdef inline double2 cgrad_hess_absolute_error( - double y_true, double raw_prediction + double y_true, + double raw_prediction ) nogil: cdef double2 gh # Note that exact hessian = 0 almost everywhere. Optimization routines like @@ -134,20 +149,26 @@ cdef inline double2 cgrad_hess_absolute_error( # Quantile Loss / Pinball Loss cdef inline double closs_pinball_loss( - double y_true, double raw_prediction, double quantile + double y_true, + double raw_prediction, + double quantile ) nogil: return (quantile * (y_true - raw_prediction) if y_true >= raw_prediction else (1. - quantile) * (raw_prediction - y_true)) cdef inline double cgradient_pinball_loss( - double y_true, double raw_prediction, double quantile + double y_true, + double raw_prediction, + double quantile ) nogil: return -quantile if y_true >=raw_prediction else 1. - quantile cdef inline double2 cgrad_hess_pinball_loss( - double y_true, double raw_prediction, double quantile + double y_true, + double raw_prediction, + double quantile ) nogil: cdef double2 gh # Note that exact hessian = 0 almost everywhere. 
Optimization routines like @@ -158,24 +179,36 @@ cdef inline double2 cgrad_hess_pinball_loss( # Half Poisson Deviance with Log-Link, dropping constant terms -cdef inline double closs_half_poisson(double y_true, double raw_prediction) nogil: +cdef inline double closs_half_poisson( + double y_true, + double raw_prediction +) nogil: return exp(raw_prediction) - y_true * raw_prediction -cdef inline double cgradient_half_poisson(double y_true, double raw_prediction) nogil: +cdef inline double cgradient_half_poisson( + double y_true, + double raw_prediction +) nogil: # y_pred - y_true return exp(raw_prediction) - y_true -cdef inline double2 closs_grad_half_poisson(double y_true, double raw_prediction) nogil: +cdef inline double2 closs_grad_half_poisson( + double y_true, + double raw_prediction +) nogil: cdef double2 lg - lg.val2 = exp(raw_prediction) + lg.val2 = exp(raw_prediction) # used as temporary lg.val1 = lg.val2 - y_true * raw_prediction # loss lg.val2 -= y_true # gradient return lg -cdef inline double2 cgrad_hess_half_poisson(double y_true, double raw_prediction) nogil: +cdef inline double2 cgrad_hess_half_poisson( + double y_true, + double raw_prediction +) nogil: cdef double2 gh gh.val2 = exp(raw_prediction) # hessian gh.val1 = gh.val2 - y_true # gradient @@ -183,25 +216,37 @@ cdef inline double2 cgrad_hess_half_poisson(double y_true, double raw_prediction # Half Gamma Deviance with Log-Link, dropping constant terms -cdef inline double closs_half_gamma(double y_true, double raw_prediction) nogil: +cdef inline double closs_half_gamma( + double y_true, + double raw_prediction +) nogil: return raw_prediction + y_true * exp(-raw_prediction) -cdef inline double cgradient_half_gamma(double y_true, double raw_prediction) nogil: +cdef inline double cgradient_half_gamma( + double y_true, + double raw_prediction +) nogil: return 1. - y_true * exp(-raw_prediction) -cdef inline double2 closs_grad_half_gamma(double y_true, double raw_prediction) nogil: +cdef inline double2 closs_grad_half_gamma( + double y_true, + double raw_prediction +) nogil: cdef double2 lg - lg.val2 = exp(-raw_prediction) + lg.val2 = exp(-raw_prediction) # used as temporary lg.val1 = raw_prediction + y_true * lg.val2 # loss lg.val2 = 1. - y_true * lg.val2 # gradient return lg -cdef inline double2 cgrad_hess_half_gamma(double y_true, double raw_prediction) nogil: +cdef inline double2 cgrad_hess_half_gamma( + double y_true, + double raw_prediction +) nogil: cdef double2 gh - gh.val2 = exp(-raw_prediction) + gh.val2 = exp(-raw_prediction) # used as temporary gh.val1 = 1. - y_true * gh.val2 # gradient gh.val2 *= y_true # hessian return gh @@ -210,7 +255,9 @@ cdef inline double2 cgrad_hess_half_gamma(double y_true, double raw_prediction) # Half Tweedie Deviance with Log-Link, dropping constant terms # Note that by dropping constants this is no longer smooth in parameter power. 
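For readability, the half Gamma gradient/hessian kernel a few lines above in plain, vectorized NumPy (illustration only; the Cython version reuses `exp(-raw_prediction)` as a temporary)::

    import numpy as np

    def half_gamma_grad_hess_sketch(y_true, raw_prediction):
        tmp = np.exp(-raw_prediction)
        gradient = 1.0 - y_true * tmp
        hessian = y_true * tmp
        return gradient, hessian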
cdef inline double closs_half_tweedie( - double y_true, double raw_prediction, double power + double y_true, + double raw_prediction, + double power ) nogil: if power == 0.: return closs_half_squared_error(y_true, exp(raw_prediction)) @@ -224,7 +271,9 @@ cdef inline double closs_half_tweedie( cdef inline double cgradient_half_tweedie( - double y_true, double raw_prediction, double power + double y_true, + double raw_prediction, + double power ) nogil: cdef double exp1 if power == 0.: @@ -240,7 +289,9 @@ cdef inline double cgradient_half_tweedie( cdef inline double2 closs_grad_half_tweedie( - double y_true, double raw_prediction, double power + double y_true, + double raw_prediction, + double power ) nogil: cdef double2 lg cdef double exp1, exp2 @@ -261,7 +312,9 @@ cdef inline double2 closs_grad_half_tweedie( cdef inline double2 cgrad_hess_half_tweedie( - double y_true, double raw_prediction, double power + double y_true, + double raw_prediction, + double power ) nogil: cdef double2 gh cdef double exp1, exp2 @@ -283,14 +336,16 @@ cdef inline double2 cgrad_hess_half_tweedie( # Binary cross entropy aka log-loss cdef inline double closs_binary_crossentropy( - double y_true, double raw_prediction + double y_true, + double raw_prediction ) nogil: # log1p(exp(raw_prediction)) - y_true * raw_prediction return log1pexp(raw_prediction) - y_true * raw_prediction cdef inline double cgradient_binary_crossentropy( - double y_true, double raw_prediction + double y_true, + double raw_prediction ) nogil: # y_pred - y_true = expit(raw_prediction) - y_true # Numerically more stable, see @@ -314,18 +369,19 @@ cdef inline double cgradient_binary_crossentropy( cdef inline double2 closs_grad_binary_crossentropy( - double y_true, double raw_prediction + double y_true, + double raw_prediction ) nogil: cdef double2 lg if raw_prediction <= 0: - lg.val2 = exp(raw_prediction) + lg.val2 = exp(raw_prediction) # used as temporary if raw_prediction <= -37: lg.val1 = lg.val2 - y_true * raw_prediction # loss else: lg.val1 = log1p(lg.val2) - y_true * raw_prediction # loss lg.val2 = ((1 - y_true) * lg.val2 - y_true) / (1 + lg.val2) # gradient else: - lg.val2 = exp(-raw_prediction) + lg.val2 = exp(-raw_prediction) # used as temporary if raw_prediction <= 18: # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x)) lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction # loss @@ -336,13 +392,14 @@ cdef inline double2 closs_grad_binary_crossentropy( cdef inline double2 cgrad_hess_binary_crossentropy( - double y_true, double raw_prediction + double y_true, + double raw_prediction ) nogil: # with y_pred = expit(raw) # hessian = y_pred * (1 - y_pred) = exp(raw) / (1 + exp(raw))**2 # = exp(-raw) / (1 + exp(-raw))**2 cdef double2 gh - gh.val2 = exp(-raw_prediction) + gh.val2 = exp(-raw_prediction) # used as temporary gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2) # gradient gh.val2 = gh.val2 / (1 + gh.val2)**2 # hessian return gh From 2c86bf465a220b763d7b96e3110c7428fc833655 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 14 Apr 2021 19:16:18 +0200 Subject: [PATCH 006/143] CLN comments and doc --- sklearn/_loss/_loss.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/_loss/_loss.pyx b/sklearn/_loss/_loss.pyx index 2e10b1c6ec721..c57965cb5f59c 100644 --- a/sklearn/_loss/_loss.pyx +++ b/sklearn/_loss/_loss.pyx @@ -358,7 +358,7 @@ cdef inline double cgradient_binary_crossentropy( # return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) # Note that optimal speed would 
be achieved, at the cost of precision, by # return expit(raw_prediction) - y_true - # i.e. no if else, and an own inline implemention of expit instead of + # i.e. no "if else" and an own inline implemention of expit instead of # from scipy.special.cython_special cimport expit # The case distinction raw_prediction < 0 in the stable implementation # does not provide significant better precision. Therefore we go without @@ -465,7 +465,7 @@ cdef class cLossFunction: Returns ------- - grad_hess_pair + double2 Gradient and hessian of the loss function w.r.t. `raw_prediction`. """ pass @@ -495,7 +495,7 @@ cdef class cLossFunction: loss : array of shape (n_samples,) A location into which the result is stored. n_threads : int - Might use openmp thread parallelism. + Number of threads used by OpenMP (if any). Returns ------- @@ -525,7 +525,7 @@ cdef class cLossFunction: gradient : array of shape (n_samples,) A location into which the result is stored. n_threads : int - Might use openmp thread parallelism. + Number of threads used by OpenMP (if any). Returns ------- @@ -558,7 +558,7 @@ cdef class cLossFunction: gradient : array of shape (n_samples,) A location into which the gradient is stored. n_threads : int - Might use openmp thread parallelism. + Number of threads used by OpenMP (if any). Returns ------- @@ -598,7 +598,7 @@ cdef class cLossFunction: hessian : array of shape (n_samples,) A location into which the hessian is stored. n_threads : int - Might use openmp thread parallelism. + Number of threads used by OpenMP (if any). Returns ------- From d68c07ee0fa9d9739cd8ef68f9676105de84f932 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 14 Apr 2021 19:17:49 +0200 Subject: [PATCH 007/143] BUG remove useless line of code --- sklearn/_loss/_loss.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/_loss/_loss.pyx b/sklearn/_loss/_loss.pyx index c57965cb5f59c..59a46dcab522b 100644 --- a/sklearn/_loss/_loss.pyx +++ b/sklearn/_loss/_loss.pyx @@ -1601,7 +1601,6 @@ cdef class cCategoricalCrossEntropy(cLossFunction): for i in prange(n_samples, schedule='static'): sum_exp_minus_max(i, raw_prediction, p) - max_value = raw_prediction[i, 0] max_value = p[n_classes] # p[-2] sum_exps = p[n_classes + 1] # p[-1] loss[i] = log(sum_exps) + max_value From 3d9c800d2e8eb0be89f68d873ccf237797ea093f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 14 Apr 2021 19:21:52 +0200 Subject: [PATCH 008/143] CLN remove line that was commented out --- sklearn/_loss/setup.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/_loss/setup.py b/sklearn/_loss/setup.py index 23d35439885ba..227eee5e47a64 100644 --- a/sklearn/_loss/setup.py +++ b/sklearn/_loss/setup.py @@ -4,13 +4,9 @@ def configuration(parent_package="", top_path=None): config = Configuration("_loss", parent_package, top_path) - config.add_extension( "_loss", sources=["_loss.pyx"], include_dirs=[numpy.get_include()] ) - - # config.add_subpackage("tests") - return config From aba1b67437bf4435355b6a1a5b54d6e351a0c215 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 14 Apr 2021 19:41:16 +0200 Subject: [PATCH 009/143] CLN nitpicks in comments and docstrings --- sklearn/_loss/loss.py | 4 +--- sklearn/_loss/tests/test_link.py | 6 +++--- sklearn/_loss/tests/test_loss.py | 6 ++---- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 37a04d88c5740..4a3e0dbdcde5e 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -466,9 +466,6 @@ def gradient( 
gradient=None, n_threads=1, ): - # easier in numpy - # gradient = raw_prediction - y_true is easier in numpy - # Be graceful to shape (n_samples, 1) -> (n_samples,) if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) @@ -479,6 +476,7 @@ def gradient( ): gradient = gradient.squeeze(1) + # gradient = raw_prediction - y_true is easier in numpy gradient = np.subtract(raw_prediction, y_true, out=gradient) if sample_weight is None: return gradient diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index a8dbbff511373..b049f5ac637d6 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -54,7 +54,7 @@ def test_is_in_range(interval): @pytest.mark.parametrize("link", LINK_FUNCTIONS) def test_link_inverse_identity(link): - # Test that link of inverse gives idendity. + # Test that link of inverse gives identity. rng = np.random.RandomState(42) link = link() n_samples, n_classes = 100, None @@ -67,7 +67,7 @@ def test_link_inverse_identity(link): raw_prediction = link.symmetrize_raw_prediction(raw_prediction) else: # So far, the valid interval of raw_prediction is (-inf, inf) and - # we do not need to distinguish + # we do not need to distinguish. raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples)) assert_allclose(link.link(link.inverse(raw_prediction)), raw_prediction) @@ -90,7 +90,7 @@ def test_link_out_argument(link): raw_prediction = link.symmetrize_raw_prediction(raw_prediction) else: # So far, the valid interval of raw_prediction is (-inf, inf) and - # we do not need to distinguish + # we do not need to distinguish. raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples)) y_pred = link.inverse(raw_prediction, out=None) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 5f7e001f2d6de..68832a3cdc273 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -87,13 +87,11 @@ def random_y_true_raw_prediction( def numerical_derivative(func, x, eps): - """Helper function for numerical (first) derivatives. - + """Helper function for numerical (first) derivatives.""" # For numerical derivatives, see # https://en.wikipedia.org/wiki/Numerical_differentiation # https://en.wikipedia.org/wiki/Finite_difference_coefficient # We use central finite differences of accuracy 4. - """ h = np.full_like(x, fill_value=eps) f_minus_2h = func(x - 2 * h) f_minus_1h = func(x - h) @@ -348,7 +346,7 @@ def test_loss_same_as_C_functions(loss, sample_weight): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_loss_gradients_are_the_same(loss, sample_weight): - # Test that loss and gradient are the same accross different functions + # Test that loss and gradient are the same across different functions. # Also test that output arguments contain correct result. 
y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, From 022e4185b02fa7b029687b995f8ee681c82f1453 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 14 Apr 2021 19:50:23 +0200 Subject: [PATCH 010/143] ENH set NPY_NO_DEPRECATED_API --- sklearn/_loss/setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/_loss/setup.py b/sklearn/_loss/setup.py index 227eee5e47a64..ad380fc8e429b 100644 --- a/sklearn/_loss/setup.py +++ b/sklearn/_loss/setup.py @@ -5,7 +5,10 @@ def configuration(parent_package="", top_path=None): config = Configuration("_loss", parent_package, top_path) config.add_extension( - "_loss", sources=["_loss.pyx"], include_dirs=[numpy.get_include()] + "_loss", + sources=["_loss.pyx"], + include_dirs=[numpy.get_include()], + define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_13_API_VERSION")], ) return config From 49bb402ac42f4e3c4a37e36be49b515b41ac7bcb Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 15 Apr 2021 08:36:43 +0200 Subject: [PATCH 011/143] MNT change NPY_1_13_API_VERSION to NPY_1_7_API_VERSION --- sklearn/_loss/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/setup.py b/sklearn/_loss/setup.py index ad380fc8e429b..63546bd29c90b 100644 --- a/sklearn/_loss/setup.py +++ b/sklearn/_loss/setup.py @@ -8,7 +8,7 @@ def configuration(parent_package="", top_path=None): "_loss", sources=["_loss.pyx"], include_dirs=[numpy.get_include()], - define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_13_API_VERSION")], + define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], ) return config From 6d77090572870bb5c31105e121f71123727e2645 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 15 Apr 2021 08:58:10 +0200 Subject: [PATCH 012/143] MNT comment out NPY_NO_DEPRECATED_API --- sklearn/_loss/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/setup.py b/sklearn/_loss/setup.py index 63546bd29c90b..c7f11afe9e30a 100644 --- a/sklearn/_loss/setup.py +++ b/sklearn/_loss/setup.py @@ -8,7 +8,7 @@ def configuration(parent_package="", top_path=None): "_loss", sources=["_loss.pyx"], include_dirs=[numpy.get_include()], - define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], + # define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], ) return config From ceda6731925be40ade91c927f42f63ece3b19098 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 8 May 2021 15:21:35 +0200 Subject: [PATCH 013/143] TST restructure domain test cases --- sklearn/_loss/tests/test_loss.py | 82 ++++++++++++++++---------------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 68832a3cdc273..8ca75d9a966e2 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -135,26 +135,46 @@ def test_loss_boundary(loss): loss.loss(y_true=y_true, raw_prediction=raw_prediction) +# Fixture to test valid value ranges. 
+Y_COMMON_PARAMS = [ + # (loss, [y success], [y fail]) + (HalfSquaredError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (AbsoluteError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (PinballLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (HalfPoissonLoss(), [0.1, 100], [-np.inf, -3, -0.1, np.inf]), + (HalfGammaLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLoss(power=-3), [0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLoss(power=0), [0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLoss(power=1.5), [0.1, 100], [-np.inf, -3, -0.1, np.inf]), + (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (BinaryCrossEntropy(), [0.1, 0.5, 0.9], [-np.inf, -1, 2, np.inf]), + (CategoricalCrossEntropy(), [], [-np.inf, -1, 1.1, np.inf]), +] +# y_pred and y_true do not always have the same domain (valid value range). +# Hence, we define extra sets of parameters for each of them. +Y_TRUE_PARAMS = [ + # (loss, [y success], [y fail]) + (HalfPoissonLoss(), [0], []), + (HalfTweedieLoss(power=-3), [-100, -0.1, 0], []), + (HalfTweedieLoss(power=0), [-100, 0], []), + (HalfTweedieLoss(power=1.5), [0], []), + (BinaryCrossEntropy(), [0, 1], []), + (CategoricalCrossEntropy(), [0.0, 1.0, 2], []), +] +Y_PRED_PARAMS = [ + # (loss, [y success], [y fail]) + (HalfPoissonLoss(), [], [0]), + (HalfTweedieLoss(power=-3), [], [-3, -0.1, 0]), + (HalfTweedieLoss(power=0), [], [-3, -0.1, 0]), + (HalfTweedieLoss(power=1.5), [], [0]), + (BinaryCrossEntropy(), [], [0, 1]), + (CategoricalCrossEntropy(), [0.1, 0.5], [0, 1]), +] + + @pytest.mark.parametrize( - "loss, y_true_success, y_true_fail", - [ - (HalfSquaredError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - (AbsoluteError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - (PinballLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - (HalfPoissonLoss(), [0, 0.1, 100], [-np.inf, -3, -0.1, np.inf]), - (HalfGammaLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - (HalfTweedieLoss(power=-3), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - (HalfTweedieLoss(power=0), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - ( - HalfTweedieLoss(power=1.5), - [0, 0.1, 100], - [-np.inf, -3, -0.1, np.inf], - ), - (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - (BinaryCrossEntropy(), [0, 0.5, 1], [-np.inf, -1, 2, np.inf]), - (CategoricalCrossEntropy(), [0.0, 1.0, 2], [-np.inf, -1, 1.1, np.inf]), - ], + "loss, y_true_success, y_true_fail", Y_COMMON_PARAMS + Y_TRUE_PARAMS ) def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): # Test boundaries of y_true for loss functions. 
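# A short usage sketch of the range checks these parameter tables feed,
# mirroring the imports of this test module; the chosen values are taken
# from Y_COMMON_PARAMS, Y_TRUE_PARAMS and Y_PRED_PARAMS above.

import numpy as np
from sklearn._loss.loss import HalfPoissonLoss

loss = HalfPoissonLoss()
# y_true = 0 lies on the closed lower boundary of the Poisson target range.
assert loss.in_y_true_range(np.array([0.0, 0.1, 100.0]))
# y_pred must be strictly positive, so 0 is rejected.
assert not loss.in_y_pred_range(np.array([0.0]))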
@@ -165,29 +185,7 @@ def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): @pytest.mark.parametrize( - "loss, y_pred_success, y_pred_fail", - [ - (HalfSquaredError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - (AbsoluteError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - (PinballLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - (HalfPoissonLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - (HalfGammaLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - ( - HalfTweedieLoss(power=-3), - [0.1, 100], - [-np.inf, -3, -0.1, 0, np.inf], - ), - (HalfTweedieLoss(power=0), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - ( - HalfTweedieLoss(power=1.5), - [0.1, 100], - [-np.inf, -3, -0.1, 0, np.inf], - ), - (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - (BinaryCrossEntropy(), [0.1, 0.5], [-np.inf, 0, 1, np.inf]), - (CategoricalCrossEntropy(), [0.1, 0.5], [-np.inf, 0, 1, np.inf]), - ], + "loss, y_pred_success, y_pred_fail", Y_COMMON_PARAMS + Y_PRED_PARAMS ) def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): # Test boundaries of y_pred for loss functions. From c73e3fad114123a7859fe765329c69dfa3948b60 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 9 May 2021 18:36:03 +0200 Subject: [PATCH 014/143] DOC add losses to API reference --- doc/modules/classes.rst | 24 ++++++++++++++++++++++++ sklearn/_loss/__init__.py | 4 ++++ 2 files changed, 28 insertions(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index ddcbe36bb1b33..3f6083a4a29e8 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1647,3 +1647,27 @@ Recently deprecated To be removed in 1.0 (renaming of 0.25) --------------------------------------- + +.. _loss_function_ref: + +:mod:`sklearn._loss`: Non-public Loss Function Classes +=========================================================== + +.. automodule:: sklearn._loss + :no-members: + :no-inherited-members: + +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + _loss.HalfSquaredError + _loss.AbsoluteError + _loss.PinballLoss + _loss.HalfPoissonLoss + _loss.HalfGammaLoss + _loss.HalfTweedieLoss + _loss.BinaryCrossEntropy + _loss.CategoricalCrossEntropy diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py index e69de29bb2d1d..bb71abe0ad48a 100644 --- a/sklearn/_loss/__init__.py +++ b/sklearn/_loss/__init__.py @@ -0,0 +1,4 @@ +""" +The :mod:`sklearn._loss` module includes loss function classes suitable for +fitting classification and regression tasks. +""" From e6505224a9372bd74901590eaded0c4a1e5db51a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 9 May 2021 20:22:43 +0200 Subject: [PATCH 015/143] MNT add classes to __init__ --- sklearn/_loss/__init__.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py index bb71abe0ad48a..282a3df9bdb93 100644 --- a/sklearn/_loss/__init__.py +++ b/sklearn/_loss/__init__.py @@ -2,3 +2,26 @@ The :mod:`sklearn._loss` module includes loss function classes suitable for fitting classification and regression tasks. 
""" + +from ._loss import ( + HalfSquaredError, + AbsoluteError, + PinballLoss, + HalfPoissonLoss, + HalfGammaLoss, + HalfTweedieLoss, + BinaryCrossEntropy, + CategoricalCrossEntropy, +) + + +__all__ = [ + "HalfSquaredError", + "AbsoluteError", + "PinballLoss", + "HalfPoissonLoss", + "HalfGammaLoss", + "HalfTweedieLoss", + "BinaryCrossEntropy", + "CategoricalCrossEntropy", +] From a31d8fbf9baa5623e581e6b46de213f072231859 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 9 May 2021 20:54:28 +0200 Subject: [PATCH 016/143] CLN fix import --- sklearn/_loss/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py index 282a3df9bdb93..ae7bac5f1a8d8 100644 --- a/sklearn/_loss/__init__.py +++ b/sklearn/_loss/__init__.py @@ -3,7 +3,7 @@ fitting classification and regression tasks. """ -from ._loss import ( +from .loss import ( HalfSquaredError, AbsoluteError, PinballLoss, From e5b626678555b267a22c1036445b03a23f4daacf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 9 May 2021 22:05:16 +0200 Subject: [PATCH 017/143] DOC minor docstring changes --- sklearn/_loss/loss.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 4a3e0dbdcde5e..a636d9fa29c6a 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -560,8 +560,8 @@ class PinballLoss(IdentityLink, BaseLoss, cPinballLoss): loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i) rho_{quantile}(u) = u * (quantile - 1_{u<0}) - = -u (1 - quantile) if u < 0 - u * quantile if u >= 0 + = -u *(1 - quantile) if u < 0 + u * quantile if u >= 0 Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError(). @@ -649,9 +649,9 @@ class HalfGammaLoss(LogLink, BaseLoss, cHalfGammaLoss): loss(x_i) = log(exp(raw_prediction_i)/y_true_i) + y_true/exp(raw_prediction_i) - 1 - Half the Gamma deviance is actually proportional the negative log - likelihood up constant terms (not involving raw_prediction) and simplifies - the computation of the gradients. + Half the Gamma deviance is actually proportional to the negative log + likelihood up to constant terms (not involving raw_prediction) and + simplifies the computation of the gradients. We also skip the constant term `-log(y_true_i) - 1`. 
""" From 349238388699571ff3242899f88fd31322772111 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 10 May 2021 17:25:58 +0200 Subject: [PATCH 018/143] TST prefer docstring over comment --- sklearn/_loss/tests/test_loss.py | 117 +++++++++++++++++++------------ 1 file changed, 71 insertions(+), 46 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 8ca75d9a966e2..26c936e428d97 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -7,9 +7,8 @@ minimize_scalar, newton, ) -from scipy.special import logit -from sklearn._loss.link import _inclusive_low_high +from sklearn._loss.link import _inclusive_low_high, IdentityLink from sklearn._loss.loss import ( _LOSSES, AbsoluteError, @@ -104,6 +103,7 @@ def numerical_derivative(func, x, eps): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) def test_loss_boundary(loss): + """Test interval ranges of y_true and y_pred in losses.""" # make sure low and high are always within the interval, used for linspace if loss.n_classes is None or loss.n_classes <= 2: low, high = _inclusive_low_high(loss.interval_y_true) @@ -177,7 +177,7 @@ def test_loss_boundary(loss): "loss, y_true_success, y_true_fail", Y_COMMON_PARAMS + Y_TRUE_PARAMS ) def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): - # Test boundaries of y_true for loss functions. + """Test boundaries of y_true for loss functions.""" for y in y_true_success: assert loss.in_y_true_range(np.array([y])) for y in y_true_fail: @@ -188,7 +188,7 @@ def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): "loss, y_pred_success, y_pred_fail", Y_COMMON_PARAMS + Y_PRED_PARAMS ) def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): - # Test boundaries of y_pred for loss functions. + """Test boundaries of y_pred for loss functions.""" for y in y_pred_success: assert loss.in_y_pred_range(np.array([y])) for y in y_pred_fail: @@ -205,8 +205,11 @@ def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): def test_loss_dtype( loss, dtype_in, dtype_out, sample_weight, out1, out2, n_threads ): - # Test that loss accepts if all input arrays are either all float32 or all - # float64, and all output arrays are either all float32 or all float64. + """Test acceptance of dtypes in loss functions. + + Check that loss accepts if all input arrays are either all float32 or all + float64, and all output arrays are either all float32 or all float64. + """ loss = loss() if loss.n_classes <= 2: # generate a y_true in valid range @@ -263,6 +266,7 @@ def test_loss_dtype( @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_loss_same_as_C_functions(loss, sample_weight): + """Test that Python and Cython functions return same results.""" y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, n_samples=20, @@ -344,8 +348,10 @@ def test_loss_same_as_C_functions(loss, sample_weight): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_loss_gradients_are_the_same(loss, sample_weight): - # Test that loss and gradient are the same across different functions. - # Also test that output arguments contain correct result. + """Test that loss and gradient are the same across different functions. + + Also test that output arguments contain correct result. 
+ """ y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, n_samples=20, @@ -423,9 +429,11 @@ def test_loss_gradients_are_the_same(loss, sample_weight): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", ["ones", "random"]) def test_sample_weight_multiplies_gradients(loss, sample_weight): - # Make sure that passing sample weights to the gradient and hessians - # computation methods is equivalent to multiplying by the weights. + """Test sample weights in gradients and hessians. + Make sure that passing sample weights to the gradient and hessians + computation methods is equivalent to multiplying by the weights. + """ n_samples = 100 y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, @@ -476,8 +484,11 @@ def test_sample_weight_multiplies_gradients(loss, sample_weight): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_loss_of_perfect_prediction(loss, sample_weight): - # Test that loss of y_true = y_pred plus constant_to_optimal_zero sums up - # to zero. + """Test value of perfect predictions. + + Loss of y_pred = y_true plus constant_to_optimal_zero should sums up to + zero. + """ if loss.n_classes <= 2: # Use small values such that exp(value) is not nan. raw_prediction = np.array([-10, -0.1, 0, 0.1, 3, 10]) @@ -513,9 +524,11 @@ def test_loss_of_perfect_prediction(loss, sample_weight): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_gradients_hessians_numerically(loss, sample_weight): - # Test that gradients are computed correctly by comparing to numerical - # derivatives of loss functions. - # Test that hessians are correct by numerical derivative of gradients. + """Test gradients and hessians with numerical derivatives. + + Gradient should equal the numerical derivatives of the loss function. + Hessians should equal the numerical derivatives of gradients. + """ n_samples = 20 y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, @@ -620,19 +633,23 @@ def grad_func(x): ) @skip_if_32bit def test_derivatives(loss, x0, y_true): - # Check that gradients are zero when the loss is minimized on a single - # value/sample using Halley's method with the first and second order - # derivatives computed by the Loss instance. - # Note that methods of Loss instances operate on arrays while the newton - # root finder expects a scalar or a one-element array for this purpose. + """Test that gradients are zero at the minimum of the loss. + We check this on a single value/sample using Halley's method with the + first and second order derivatives computed by the Loss instance. + Note that methods of Loss instances operate on arrays while the newton + root finder expects a scalar or a one-element array for this purpose. + """ loss = _LOSSES[loss](sample_weight=None) y_true = np.array([y_true], dtype=np.float64) x0 = np.array([x0], dtype=np.float64) def func(x: np.ndarray) -> np.ndarray: - # Add constant term such that loss has its minimum at zero, which is - # required by the newton method. + """Compute loss plus constant term. + + The constant term is such that the minimum function value is zero, + which is required by the Newton method. 
+ """ return loss.loss( y_true=y_true, raw_prediction=x ) + loss.constant_to_optimal_zero(y_true=y_true) @@ -652,7 +669,8 @@ def fprime2(x: np.ndarray) -> np.ndarray: tol=5e-8, ) - # Need to ravel arrays because assert_allclose requires matching dimensions + # Need to ravel arrays because assert_allclose requires matching + # dimensions. y_true = y_true.ravel() optimum = optimum.ravel() assert_allclose(loss.inverse(optimum), y_true) @@ -665,8 +683,10 @@ def fprime2(x: np.ndarray) -> np.ndarray: @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_loss_intercept_only(loss, sample_weight): - # Test that fit_intercept_only returns the argmin of the loss and that the - # gradient is zero. + """Test that fit_intercept_only returns the argmin of the loss. + + Also test that the gradient is zero at the minimum. + """ n_samples = 50 if loss.n_classes <= 2: y_true = loss.inverse(np.linspace(-4, 4, num=n_samples)) @@ -734,15 +754,20 @@ def fun(x): @pytest.mark.parametrize( - "loss, func, link, low, high, random_dist", + "loss, func, random_dist", [ - (HalfSquaredError, np.mean, "identity", None, None, "normal"), - (AbsoluteError, np.median, "identity", None, None, "normal"), - (HalfPoissonLoss, np.mean, np.log, 0, None, "poisson"), - (BinaryCrossEntropy, np.mean, logit, 0, 1, "binomial"), + (HalfSquaredError, np.mean, "normal"), + (AbsoluteError, np.median, "normal"), + (HalfPoissonLoss, np.mean, "poisson"), + (BinaryCrossEntropy, np.mean, "binomial"), ], ) -def test_specific_fit_intercept_only(loss, func, link, low, high, random_dist): +def test_specific_fit_intercept_only(loss, func, random_dist): + """Test that fit_intercept_only returns the correct functional. + + We test the functional for specific, meaningful distributions, e.g. + squared error estimates the expectation of a probability distribution. + """ rng = np.random.RandomState(0) loss = loss() if random_dist == "binomial": @@ -750,32 +775,33 @@ def test_specific_fit_intercept_only(loss, func, link, low, high, random_dist): else: y_train = getattr(rng, random_dist)(size=100) baseline_prediction = loss.fit_intercept_only(y_true=y_train) - # Make sure baseline prediction is the expected one, i.e. func, e.g. - # mean or median. + # Make sure baseline prediction is the expected functional=func, e.g. mean + # or median. 
assert_all_finite(baseline_prediction) - if link == "identity": - assert baseline_prediction == approx(func(y_train)) - assert_allclose(loss.inverse(baseline_prediction), baseline_prediction) - else: - assert baseline_prediction == approx(link(func(y_train))) + assert baseline_prediction == approx(loss.link(func(y_train))) + if isinstance(loss, IdentityLink): + assert_allclose( + loss.inverse(baseline_prediction), baseline_prediction + ) # Test baseline at boundary - if low is not None: - y_train.fill(low) + if loss.interval_y_true.low_inclusive: + y_train.fill(loss.interval_y_true.low) baseline_prediction = loss.fit_intercept_only(y_true=y_train) assert_all_finite(baseline_prediction) - if high is not None: - y_train.fill(high) + if loss.interval_y_true.high_inclusive: + y_train.fill(loss.interval_y_true.high) baseline_prediction = loss.fit_intercept_only(y_true=y_train) assert_all_finite(baseline_prediction) def test_categorical_crossentropy_fit_intercept_only(): + """Test that fit_intercept_only returns the mean functional for CCE.""" rng = np.random.RandomState(0) n_classes = 4 loss = CategoricalCrossEntropy(n_classes=n_classes) - # Same logic as test_single_fit_intercept_only. Here inverse link function - # = softmax and link function = log - symmetry term + # Same logic as test_specific_fit_intercept_only. Here inverse link + # function = softmax and link function = log - symmetry term. y_train = rng.randint(0, n_classes + 1, size=100).astype(np.float64) baseline_prediction = loss.fit_intercept_only(y_true=y_train) assert baseline_prediction.shape == (n_classes,) @@ -793,8 +819,7 @@ def test_categorical_crossentropy_fit_intercept_only(): def test_binary_and_categorical_crossentropy(): - # Test that CategoricalCrossEntropy with n_classes = 2 is the same as - # BinaryCrossEntropy + """Test that CCE with n_classes = 2 is the same as BinaryCrossEntropy.""" rng = np.random.RandomState(0) n_samples = 20 bce = BinaryCrossEntropy() From 9d86d82950a950874ce6144d992ebf6918d17ecd Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 10 May 2021 17:58:15 +0200 Subject: [PATCH 019/143] ENH define loss.is_multiclass --- sklearn/_loss/loss.py | 5 +++ sklearn/_loss/tests/test_loss.py | 53 ++++++++++++++++---------------- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index a636d9fa29c6a..bb7aa2f29c69e 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -87,6 +87,8 @@ class BaseLoss(BaseLink, cLossFunction): approximated, it should be larger or equal to the exact one. constant_hessian : bool Indicates whether the hessian is one for this loss. + is_multiclass : bool + Indicates whether n_classes > 2 is allowed. """ # Inherited methods from BaseLink: @@ -107,6 +109,7 @@ class BaseLoss(BaseLink, cLossFunction): # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. 
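    # For example, a concrete subclass overrides these class attributes where
    # needed: CategoricalCrossEntropy (further down in this patch) sets
    # `is_multiclass = True`, and test_loss.py uses that flag to build
    # raw_prediction with shape (n_samples, n_classes) instead of
    # (n_samples,).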
need_update_leaves_values = False differentiable = True + is_multiclass = False def __init__(self, n_classes=1): self.approx_hessian = False @@ -809,6 +812,8 @@ class CategoricalCrossEntropy( https://arxiv.org/pdf/1311.6529.pdf """ + is_multiclass = True + def __init__(self, sample_weight=None, n_classes=3): super().__init__(n_classes=n_classes) self.interval_y_true = Interval(0, np.inf, True, False) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 26c936e428d97..2b2c9ea22aaca 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -53,7 +53,15 @@ def random_y_true_raw_prediction( ): """Random generate y_true and raw_prediction in valid range.""" rng = np.random.RandomState(seed) - if loss.n_classes <= 2: + if loss.is_multiclass: + raw_prediction = np.empty((n_samples, loss.n_classes)) + raw_prediction.flat[:] = rng.uniform( + low=raw_bound[0], + high=raw_bound[1], + size=n_samples * loss.n_classes, + ) + y_true = np.arange(n_samples).astype(float) % loss.n_classes + else: raw_prediction = rng.uniform( low=raw_bound[0], high=raw_bound[0], size=n_samples ) @@ -73,14 +81,6 @@ def random_y_true_raw_prediction( and loss.interval_y_true.high_inclusive ): y_true[1:: (n_samples // 3)] = 1 - else: - raw_prediction = np.empty((n_samples, loss.n_classes)) - raw_prediction.flat[:] = rng.uniform( - low=raw_bound[0], - high=raw_bound[1], - size=n_samples * loss.n_classes, - ) - y_true = np.arange(n_samples).astype(float) % loss.n_classes return y_true, raw_prediction @@ -105,11 +105,11 @@ def numerical_derivative(func, x, eps): def test_loss_boundary(loss): """Test interval ranges of y_true and y_pred in losses.""" # make sure low and high are always within the interval, used for linspace - if loss.n_classes is None or loss.n_classes <= 2: + if loss.is_multiclass: + y_true = np.linspace(0, 9, num=10) + else: low, high = _inclusive_low_high(loss.interval_y_true) y_true = np.linspace(low, high, num=10) - else: - y_true = np.linspace(0, 9, num=10) # add boundaries if they are included if loss.interval_y_true.low_inclusive: @@ -120,13 +120,13 @@ def test_loss_boundary(loss): assert loss.in_y_true_range(y_true) low, high = _inclusive_low_high(loss.interval_y_pred) - if loss.n_classes is None or loss.n_classes <= 2: - y_pred = np.linspace(low, high, num=10) - else: + if loss.is_multiclass: y_pred = np.empty((10, 3)) y_pred[:, 0] = np.linspace(low, high, num=10) y_pred[:, 1] = 0.5 * (1 - y_pred[:, 0]) y_pred[:, 2] = 0.5 * (1 - y_pred[:, 0]) + else: + y_pred = np.linspace(low, high, num=10) assert loss.in_y_pred_range(y_pred) @@ -153,7 +153,7 @@ def test_loss_boundary(loss): ] # y_pred and y_true do not always have the same domain (valid value range). # Hence, we define extra sets of parameters for each of them. -Y_TRUE_PARAMS = [ +Y_TRUE_PARAMS = [ # type: ignore # (loss, [y success], [y fail]) (HalfPoissonLoss(), [0], []), (HalfTweedieLoss(power=-3), [-100, -0.1, 0], []), @@ -185,7 +185,8 @@ def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): @pytest.mark.parametrize( - "loss, y_pred_success, y_pred_fail", Y_COMMON_PARAMS + Y_PRED_PARAMS + "loss, y_pred_success, y_pred_fail", + Y_COMMON_PARAMS + Y_PRED_PARAMS # type: ignore ) def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): """Test boundaries of y_pred for loss functions.""" @@ -211,16 +212,16 @@ def test_loss_dtype( float64, and all output arrays are either all float32 or all float64. 
""" loss = loss() - if loss.n_classes <= 2: - # generate a y_true in valid range - low, high = _inclusive_low_high(loss.interval_y_true, dtype=dtype_in) - y_true = np.array([0.5 * (high - low)], dtype=dtype_in) - raw_prediction = np.array([0.0], dtype=dtype_in) - else: + # generate a y_true and raw_prediction in valid range + if loss.is_multiclass: y_true = np.array([0], dtype=dtype_in) raw_prediction = np.full( shape=(1, loss.n_classes), fill_value=0.0, dtype=dtype_in ) + else: + low, high = _inclusive_low_high(loss.interval_y_true, dtype=dtype_in) + y_true = np.array([0.5 * (high - low)], dtype=dtype_in) + raw_prediction = np.array([0.0], dtype=dtype_in) if sample_weight is not None: sample_weight = np.array([2.0], dtype=dtype_in) @@ -251,7 +252,7 @@ def test_loss_dtype( gradient=out2, n_threads=n_threads, ) - if out1 is not None and loss.n_classes >= 3: + if out1 is not None and loss.is_multiclass: out1 = np.empty_like(raw_prediction, dtype=dtype_out) loss.gradient_hessian( y_true=y_true, @@ -350,7 +351,7 @@ def test_loss_same_as_C_functions(loss, sample_weight): def test_loss_gradients_are_the_same(loss, sample_weight): """Test that loss and gradient are the same across different functions. - Also test that output arguments contain correct result. + Also test that output arguments contain correct results. """ y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, @@ -410,7 +411,7 @@ def test_loss_gradients_are_the_same(loss, sample_weight): assert np.shares_memory(g3, out_g3) if hasattr(loss, "gradient_proba"): - assert loss.n_classes >= 3 # only for CategoricalCrossEntropy + assert loss.is_multiclass # only for CategoricalCrossEntropy out_g4 = np.empty_like(raw_prediction) out_proba = np.empty_like(raw_prediction) g4, proba = loss.gradient_proba( From cc90e4d45d4b0638091f07df9ee38006ec6f4965 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 10 May 2021 22:02:08 +0200 Subject: [PATCH 020/143] DOC fix typos --- sklearn/_loss/loss.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index bb7aa2f29c69e..14210b2a6202b 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -438,7 +438,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): class HalfSquaredError(IdentityLink, BaseLoss, cHalfSquaredError): - """Half Squared Error with identity link, for regression. + """Half squared error with identity link, for regression. Domain: y_true and y_pred all real numbers @@ -446,7 +446,7 @@ class HalfSquaredError(IdentityLink, BaseLoss, cHalfSquaredError): Link: y_pred = raw_prediction - For a given sample x_i, half squares error is defined as:: + For a given sample x_i, half squared error is defined as:: loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2 @@ -549,7 +549,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): class PinballLoss(IdentityLink, BaseLoss, cPinballLoss): - """Quantile Loss aka Pinball Loss, for regression. + """Quantile loss aka pinball loss, for regression. 
Domain: y_true and y_pred all real numbers @@ -558,7 +558,7 @@ class PinballLoss(IdentityLink, BaseLoss, cPinballLoss): Link: y_pred = raw_prediction - For a given sample x_i, the pinball loss loss is defined as:: + For a given sample x_i, the pinball loss is defined as:: loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i) @@ -620,7 +620,7 @@ class HalfPoissonLoss(LogLink, BaseLoss, cHalfPoissonLoss): loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i)) - y_true_i + exp(raw_prediction_i) - Half the Poisson deviance is actually the negative log likelihood up to + Half the Poisson deviance is actually the negative log-likelihood up to constant terms (not involving raw_prediction) and simplifies the computation of the gradients. We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`. @@ -652,7 +652,7 @@ class HalfGammaLoss(LogLink, BaseLoss, cHalfGammaLoss): loss(x_i) = log(exp(raw_prediction_i)/y_true_i) + y_true/exp(raw_prediction_i) - 1 - Half the Gamma deviance is actually proportional to the negative log + Half the Gamma deviance is actually proportional to the negative log- likelihood up to constant terms (not involving raw_prediction) and simplifies the computation of the gradients. We also skip the constant term `-log(y_true_i) - 1`. @@ -744,7 +744,7 @@ class BinaryCrossEntropy(LogitLink, BaseLoss, cBinaryCrossEntropy): y_pred = expit(raw_prediction) For a given sample x_i, the binary cross-entropy, aka log loss, is defined - as the negative log-likelihood of the Bernoulli distributions and can be + as the negative log-likelihood of the Bernoulli distribution and can be expressed as:: loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i @@ -784,7 +784,7 @@ class CategoricalCrossEntropy( Domain: y_true in {0, 1, 2, 3, .., n_classes - 1} - y_pred a n_classes array, each element in (0, 1) + y_pred has n_classes elements, each element in (0, 1) Link: y_pred = softmax(raw_prediction) @@ -792,8 +792,8 @@ class CategoricalCrossEntropy( Note: We assume y_true to be already label encoded. For a given sample x_i, the categorical cross-entropy loss is defined as - the negative log-likelihood of the multinomial distribution, it generalizes - the binary cross-entropy to more than 2 classes:: + the negative log-likelihood of the multinomial distribution, it + generalizes the binary cross-entropy to more than 2 classes:: loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1)) - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1) @@ -855,7 +855,7 @@ def gradient_proba( proba=None, n_threads=1, ): - """Compute gradient and probabilities of loss w.r.t raw_prediction. + """Compute gradient and probabilities fow raw_prediction. Parameters ---------- From d0b48ac300daf990cd99cac4c765491982d832d6 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 18 May 2021 15:25:40 +0200 Subject: [PATCH 021/143] CLN address review comments --- doc/modules/classes.rst | 2 +- sklearn/_loss/_loss.pxd | 18 ++++---- sklearn/_loss/_loss.pyx | 94 +++++++++++++++++++++-------------------- 3 files changed, 58 insertions(+), 56 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 3f6083a4a29e8..103cceaed485a 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1650,7 +1650,7 @@ To be removed in 1.0 (renaming of 0.25) .. _loss_function_ref: -:mod:`sklearn._loss`: Non-public Loss Function Classes +:mod:`sklearn._loss`: Private Loss Function Classes =========================================================== .. 
automodule:: sklearn._loss diff --git a/sklearn/_loss/_loss.pxd b/sklearn/_loss/_loss.pxd index 1528ab28741fd..8ad45f3bed389 100644 --- a/sklearn/_loss/_loss.pxd +++ b/sklearn/_loss/_loss.pxd @@ -19,7 +19,7 @@ ctypedef fused G_DTYPE_C: # Struct to return 2 doubles -ctypedef struct double2: +ctypedef struct double_pair: double val1 double val2 @@ -28,48 +28,48 @@ ctypedef struct double2: cdef class cLossFunction: cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil cdef class cHalfSquaredError(cLossFunction): cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil cdef class cAbsoluteError(cLossFunction): cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil cdef class cPinballLoss(cLossFunction): cdef readonly double quantile # readonly makes it inherited by children cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil cdef class cHalfPoissonLoss(cLossFunction): cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil cdef class cHalfGammaLoss(cLossFunction): cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil cdef class cHalfTweedieLoss(cLossFunction): cdef readonly double power # readonly makes it inherited by children cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil cdef class cBinaryCrossEntropy(cLossFunction): cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil diff --git a/sklearn/_loss/_loss.pyx b/sklearn/_loss/_loss.pyx index 59a46dcab522b..df1c7ec8e8e79 100644 --- a/sklearn/_loss/_loss.pyx +++ b/sklearn/_loss/_loss.pyx @@ -28,7 +28,7 @@ # # Note: We require 1-dim ndarrays to be contiguous. 
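# Convention for the double_pair struct declared in _loss.pxd: the
# single-point helpers return (loss, gradient) or (gradient, hessian) in
# val1/val2. An illustrative sketch of how the loops further down unpack it:
#
#     cdef double_pair dbl2
#     dbl2 = self.cgrad_hess(y_true[i], raw_prediction[i])
#     gradient[i] = dbl2.val1  # gradient
#     hessian[i] = dbl2.val2   # hessian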
# TODO: Use const memoryviews with fused types with Cython 3.0 where -# appropriate (arguments marked by "# IN") +# appropriate (arguments marked by "# IN"). cimport cython from cython.parallel import parallel, prange @@ -62,9 +62,11 @@ cdef inline void sum_exp_minus_max( Y_DTYPE_C[:, :] raw_prediction, # IN Y_DTYPE_C *p # OUT ) nogil: - # Store p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1 - # p[-2] = max(raw_prediction_i_k, k = 0 to n_classes-1) - # p[-1] = sum(p[k], k = 0 to n_classes-1) = sum of exponentials + # Thread local buffers are used to stores results of this function via p. + # The results are stored as follows: + # p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1 + # p[-2] = max(raw_prediction_i_k, k = 0 to n_classes-1) + # p[-1] = sum(p[k], k = 0 to n_classes-1) = sum of exponentials # len(p) must be n_classes + 2 # Notes: # - Using "by reference" arguments doesn't work well, therefore we use a @@ -110,11 +112,11 @@ cdef inline double cgradient_half_squared_error( return raw_prediction - y_true -cdef inline double2 cgrad_hess_half_squared_error( +cdef inline double_pair cgrad_hess_half_squared_error( double y_true, double raw_prediction ) nogil: - cdef double2 gh + cdef double_pair gh gh.val1 = raw_prediction - y_true # gradient gh.val2 = 1. # hessian return gh @@ -135,11 +137,11 @@ cdef inline double cgradient_absolute_error( return 1. if raw_prediction > y_true else -1. -cdef inline double2 cgrad_hess_absolute_error( +cdef inline double_pair cgrad_hess_absolute_error( double y_true, double raw_prediction ) nogil: - cdef double2 gh + cdef double_pair gh # Note that exact hessian = 0 almost everywhere. Optimization routines like # in HGBT, however, need a hessian > 0. Therefore, we assign 1. gh.val1 = 1. if raw_prediction > y_true else -1. # gradient @@ -165,12 +167,12 @@ cdef inline double cgradient_pinball_loss( return -quantile if y_true >=raw_prediction else 1. - quantile -cdef inline double2 cgrad_hess_pinball_loss( +cdef inline double_pair cgrad_hess_pinball_loss( double y_true, double raw_prediction, double quantile ) nogil: - cdef double2 gh + cdef double_pair gh # Note that exact hessian = 0 almost everywhere. Optimization routines like # in HGBT, however, need a hessian > 0. Therefore, we assign 1. gh.val1 = -quantile if y_true >=raw_prediction else 1. - quantile # gradient @@ -194,22 +196,22 @@ cdef inline double cgradient_half_poisson( return exp(raw_prediction) - y_true -cdef inline double2 closs_grad_half_poisson( +cdef inline double_pair closs_grad_half_poisson( double y_true, double raw_prediction ) nogil: - cdef double2 lg + cdef double_pair lg lg.val2 = exp(raw_prediction) # used as temporary lg.val1 = lg.val2 - y_true * raw_prediction # loss lg.val2 -= y_true # gradient return lg -cdef inline double2 cgrad_hess_half_poisson( +cdef inline double_pair cgrad_hess_half_poisson( double y_true, double raw_prediction ) nogil: - cdef double2 gh + cdef double_pair gh gh.val2 = exp(raw_prediction) # hessian gh.val1 = gh.val2 - y_true # gradient return gh @@ -230,22 +232,22 @@ cdef inline double cgradient_half_gamma( return 1. - y_true * exp(-raw_prediction) -cdef inline double2 closs_grad_half_gamma( +cdef inline double_pair closs_grad_half_gamma( double y_true, double raw_prediction ) nogil: - cdef double2 lg + cdef double_pair lg lg.val2 = exp(-raw_prediction) # used as temporary lg.val1 = raw_prediction + y_true * lg.val2 # loss lg.val2 = 1. 
- y_true * lg.val2 # gradient return lg -cdef inline double2 cgrad_hess_half_gamma( +cdef inline double_pair cgrad_hess_half_gamma( double y_true, double raw_prediction ) nogil: - cdef double2 gh + cdef double_pair gh gh.val2 = exp(-raw_prediction) # used as temporary gh.val1 = 1. - y_true * gh.val2 # gradient gh.val2 *= y_true # hessian @@ -288,12 +290,12 @@ cdef inline double cgradient_half_tweedie( - y_true * exp((1. - power) * raw_prediction)) -cdef inline double2 closs_grad_half_tweedie( +cdef inline double_pair closs_grad_half_tweedie( double y_true, double raw_prediction, double power ) nogil: - cdef double2 lg + cdef double_pair lg cdef double exp1, exp2 if power == 0.: exp1 = exp(raw_prediction) @@ -311,12 +313,12 @@ cdef inline double2 closs_grad_half_tweedie( return lg -cdef inline double2 cgrad_hess_half_tweedie( +cdef inline double_pair cgrad_hess_half_tweedie( double y_true, double raw_prediction, double power ) nogil: - cdef double2 gh + cdef double_pair gh cdef double exp1, exp2 if power == 0.: exp1 = exp(raw_prediction) @@ -368,11 +370,11 @@ cdef inline double cgradient_binary_crossentropy( return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) -cdef inline double2 closs_grad_binary_crossentropy( +cdef inline double_pair closs_grad_binary_crossentropy( double y_true, double raw_prediction ) nogil: - cdef double2 lg + cdef double_pair lg if raw_prediction <= 0: lg.val2 = exp(raw_prediction) # used as temporary if raw_prediction <= -37: @@ -391,14 +393,14 @@ cdef inline double2 closs_grad_binary_crossentropy( return lg -cdef inline double2 cgrad_hess_binary_crossentropy( +cdef inline double_pair cgrad_hess_binary_crossentropy( double y_true, double raw_prediction ) nogil: # with y_pred = expit(raw) # hessian = y_pred * (1 - y_pred) = exp(raw) / (1 + exp(raw))**2 # = exp(-raw) / (1 + exp(-raw))**2 - cdef double2 gh + cdef double_pair gh gh.val2 = exp(-raw_prediction) # used as temporary gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2) # gradient gh.val2 = gh.val2 / (1 + gh.val2)**2 # hessian @@ -445,7 +447,7 @@ cdef class cLossFunction: """ pass - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: """Compute gradient and hessian. Gradient and hessian of loss w.r.t. raw_prediction for a single sample. @@ -465,7 +467,7 @@ cdef class cLossFunction: Returns ------- - double2 + double_pair Gradient and hessian of the loss function w.r.t. `raw_prediction`. 
""" pass @@ -627,7 +629,7 @@ cdef class cHalfSquaredError(cLossFunction): cdef double cgradient(self, double y_true, double raw_prediction) nogil: return cgradient_half_squared_error(y_true, raw_prediction) - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: return cgrad_hess_half_squared_error(y_true, raw_prediction) def _loss( @@ -699,7 +701,7 @@ cdef class cHalfSquaredError(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -735,7 +737,7 @@ cdef class cAbsoluteError(cLossFunction): cdef double cgradient(self, double y_true, double raw_prediction) nogil: return cgradient_absolute_error(y_true, raw_prediction) - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: return cgrad_hess_absolute_error(y_true, raw_prediction) def _loss( @@ -804,7 +806,7 @@ cdef class cAbsoluteError(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -846,7 +848,7 @@ cdef class cPinballLoss(cLossFunction): cdef double cgradient(self, double y_true, double raw_prediction) nogil: return cgradient_pinball_loss(y_true, raw_prediction, self.quantile) - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: return cgrad_hess_pinball_loss(y_true, raw_prediction, self.quantile) def _loss( @@ -919,7 +921,7 @@ cdef class cPinballLoss(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -968,7 +970,7 @@ cdef class cHalfPoissonLoss(cLossFunction): cdef double cgradient(self, double y_true, double raw_prediction) nogil: return cgradient_half_poisson(y_true, raw_prediction) - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: return cgrad_hess_half_poisson(y_true, raw_prediction) def _loss( @@ -1011,7 +1013,7 @@ cdef class cHalfPoissonLoss(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -1070,7 +1072,7 @@ cdef class cHalfPoissonLoss(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -1113,7 +1115,7 @@ cdef class cHalfGammaLoss(cLossFunction): cdef double cgradient(self, double y_true, double raw_prediction) nogil: return cgradient_half_gamma(y_true, raw_prediction) - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: return cgrad_hess_half_gamma(y_true, raw_prediction) def _loss( @@ -1156,7 +1158,7 @@ cdef class cHalfGammaLoss(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -1215,7 +1217,7 @@ cdef class cHalfGammaLoss(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -1276,7 +1278,7 @@ cdef class cHalfTweedieLoss(cLossFunction): cdef double cgradient(self, double y_true, double raw_prediction) nogil: 
return cgradient_half_tweedie(y_true, raw_prediction, self.power) - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: return cgrad_hess_half_tweedie(y_true, raw_prediction, self.power) def _loss( @@ -1319,7 +1321,7 @@ cdef class cHalfTweedieLoss(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -1380,7 +1382,7 @@ cdef class cHalfTweedieLoss(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -1417,7 +1419,7 @@ cdef class cBinaryCrossEntropy(cLossFunction): cdef double cgradient(self, double y_true, double raw_prediction) nogil: return cgradient_binary_crossentropy(y_true, raw_prediction) - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: return cgrad_hess_binary_crossentropy(y_true, raw_prediction) def _loss( @@ -1460,7 +1462,7 @@ cdef class cBinaryCrossEntropy(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -1519,7 +1521,7 @@ cdef class cBinaryCrossEntropy(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( From 7794617d394a71a7ac04c88f280970079bfe153e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 15 Jun 2021 17:34:20 +0200 Subject: [PATCH 022/143] DOC small docstring improvements --- sklearn/_loss/loss.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 14210b2a6202b..321d000636fe5 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -451,7 +451,8 @@ class HalfSquaredError(IdentityLink, BaseLoss, cHalfSquaredError): loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2 The factor of 0.5 simplifies the computation of gradients and results in a - unit hessian (and be consistent with what is done in LightGBM). + unit hessian (and is consistent with what is done in LightGBM). It is also + half the Normal distribution deviance. """ def __init__(self, sample_weight=None): @@ -512,7 +513,7 @@ def gradient_hessian( class AbsoluteError(IdentityLink, BaseLoss, cAbsoluteError): - """Least absolute error, for regression. + """Absolute error with identity link, for regression. Domain: y_true and y_pred all real numbers @@ -734,7 +735,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): class BinaryCrossEntropy(LogitLink, BaseLoss, cBinaryCrossEntropy): - """Binary cross entropy loss for binary classification. + """Binary cross entropy loss with logit link, for binary classification. 
Domain: y_true in [0, 1] @@ -743,14 +744,20 @@ class BinaryCrossEntropy(LogitLink, BaseLoss, cBinaryCrossEntropy): Link: y_pred = expit(raw_prediction) - For a given sample x_i, the binary cross-entropy, aka log loss, is defined - as the negative log-likelihood of the Bernoulli distribution and can be - expressed as:: + For a given sample x_i, the binary cross-entropy, is defined as the + negative log-likelihood of the Bernoulli distribution and can be expressed + as:: loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman, section 4.4.1 (about logistic regression). + + This loss is also known as log loss or logistic loss. + Note that the formulation works for classification, y = {0, 1}, as well as + logistic regression, y = [0, 1]. + If you add `constant_to_optimal_zero` to the loss, you get half the + Bernoulli/binomial deviance. """ def __init__(self, sample_weight=None): @@ -780,7 +787,7 @@ def predict_proba(self, raw_prediction): class CategoricalCrossEntropy( MultinomialLogit, BaseLoss, cCategoricalCrossEntropy ): - """Categorical cross-entropy loss for multiclass classification. + """Categorical cross-entropy loss, for multiclass classification. Domain: y_true in {0, 1, 2, 3, .., n_classes - 1} @@ -789,7 +796,9 @@ class CategoricalCrossEntropy( Link: y_pred = softmax(raw_prediction) - Note: We assume y_true to be already label encoded. + Note: We assume y_true to be already label encoded. The inverse link is + softmax. But the full link function is the symmetric multinomial logit + function. For a given sample x_i, the categorical cross-entropy loss is defined as the negative log-likelihood of the multinomial distribution, it From 35b74234a666bad026999b8d66b5bf783c449a79 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 15 Jun 2021 20:15:51 +0200 Subject: [PATCH 023/143] TST test more losses in test_specific_fit_intercept_only --- sklearn/_loss/tests/test_loss.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 2b2c9ea22aaca..f7228b722dbb1 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -757,10 +757,13 @@ def fun(x): @pytest.mark.parametrize( "loss, func, random_dist", [ - (HalfSquaredError, np.mean, "normal"), - (AbsoluteError, np.median, "normal"), - (HalfPoissonLoss, np.mean, "poisson"), - (BinaryCrossEntropy, np.mean, "binomial"), + (HalfSquaredError(), np.mean, "normal"), + (AbsoluteError(), np.median, "normal"), + (PinballLoss(quantile=0.25), lambda x: np.quantile(x, q=0.25), "normal"), + (HalfPoissonLoss(), np.mean, "poisson"), + (HalfGammaLoss(), np.mean, "exponential"), + (HalfTweedieLoss(), np.mean, "exponential"), + (BinaryCrossEntropy(), np.mean, "binomial"), ], ) def test_specific_fit_intercept_only(loss, func, random_dist): @@ -770,7 +773,6 @@ def test_specific_fit_intercept_only(loss, func, random_dist): squared error estimates the expectation of a probability distribution. """ rng = np.random.RandomState(0) - loss = loss() if random_dist == "binomial": y_train = rng.binomial(1, 0.5, size=100) else: @@ -780,6 +782,7 @@ def test_specific_fit_intercept_only(loss, func, random_dist): # or median. 
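# A minimal illustration of the relationship checked below (a sketch using the
# classes added in sklearn/_loss/loss.py in this series; variable names here are
# illustrative only): the intercept-only fit returns the optimal constant in
# link space, e.g. log(mean(y)) for the Poisson deviance with its log link.
#
import numpy as np
from sklearn._loss.loss import HalfPoissonLoss

rng_demo = np.random.RandomState(0)
y_demo = rng_demo.poisson(lam=3.0, size=100).astype(np.float64)
poisson_loss = HalfPoissonLoss()
baseline = poisson_loss.fit_intercept_only(y_true=y_demo)
np.testing.assert_allclose(baseline, np.log(y_demo.mean()))  # link of the mean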
assert_all_finite(baseline_prediction) assert baseline_prediction == approx(loss.link(func(y_train))) + assert loss.inverse(baseline_prediction) == approx(func(y_train)) if isinstance(loss, IdentityLink): assert_allclose( loss.inverse(baseline_prediction), baseline_prediction From b3900024ba2395b32269642103af3dce0a817601 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Jul 2021 22:14:59 +0200 Subject: [PATCH 024/143] FIX test_loss_boundary --- sklearn/_loss/tests/test_loss.py | 66 ++++++++++++-------------------- 1 file changed, 24 insertions(+), 42 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index f7228b722dbb1..5e674ccc00942 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -71,16 +71,10 @@ def random_y_true_raw_prediction( high = min(high, y_bound[1]) y_true = rng.uniform(low, high, size=n_samples) # set some values at special boundaries - if ( - loss.interval_y_true.low == 0 - and loss.interval_y_true.low_inclusive - ): + if loss.interval_y_true.low == 0 and loss.interval_y_true.low_inclusive: y_true[:: (n_samples // 3)] = 0 - if ( - loss.interval_y_true.high == 1 - and loss.interval_y_true.high_inclusive - ): - y_true[1:: (n_samples // 3)] = 1 + if loss.interval_y_true.high == 1 and loss.interval_y_true.high_inclusive: + y_true[1 :: (n_samples // 3)] = 1 return y_true, raw_prediction @@ -96,9 +90,7 @@ def numerical_derivative(func, x, eps): f_minus_1h = func(x - h) f_plus_1h = func(x + h) f_plus_2h = func(x + 2 * h) - return (-f_plus_2h + 8 * f_plus_1h - 8 * f_minus_1h + f_minus_2h) / ( - 12.0 * eps - ) + return (-f_plus_2h + 8 * f_plus_1h - 8 * f_minus_1h + f_minus_2h) / (12.0 * eps) @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @@ -119,14 +111,15 @@ def test_loss_boundary(loss): assert loss.in_y_true_range(y_true) + n = y_true.shape[0] low, high = _inclusive_low_high(loss.interval_y_pred) if loss.is_multiclass: - y_pred = np.empty((10, 3)) - y_pred[:, 0] = np.linspace(low, high, num=10) + y_pred = np.empty((n, 3)) + y_pred[:, 0] = np.linspace(low, high, num=n) y_pred[:, 1] = 0.5 * (1 - y_pred[:, 0]) y_pred[:, 2] = 0.5 * (1 - y_pred[:, 0]) else: - y_pred = np.linspace(low, high, num=10) + y_pred = np.linspace(low, high, num=n) assert loss.in_y_pred_range(y_pred) @@ -185,8 +178,7 @@ def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): @pytest.mark.parametrize( - "loss, y_pred_success, y_pred_fail", - Y_COMMON_PARAMS + Y_PRED_PARAMS # type: ignore + "loss, y_pred_success, y_pred_fail", Y_COMMON_PARAMS + Y_PRED_PARAMS # type: ignore ) def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): """Test boundaries of y_pred for loss functions.""" @@ -203,9 +195,7 @@ def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): @pytest.mark.parametrize("out1", [None, 1]) @pytest.mark.parametrize("out2", [None, 1]) @pytest.mark.parametrize("n_threads", [1, 2]) -def test_loss_dtype( - loss, dtype_in, dtype_out, sample_weight, out1, out2, n_threads -): +def test_loss_dtype(loss, dtype_in, dtype_out, sample_weight, out1, out2, n_threads): """Test acceptance of dtypes in loss functions. 
Check that loss accepts if all input arrays are either all float32 or all @@ -450,14 +440,10 @@ def test_sample_weight_multiplies_gradients(loss, sample_weight): rng = np.random.RandomState(42) sample_weight = rng.normal(size=n_samples).astype(np.float64) - baseline_prediction = loss.fit_intercept_only( - y_true=y_true, sample_weight=None - ) + baseline_prediction = loss.fit_intercept_only(y_true=y_true, sample_weight=None) if loss.n_classes <= 2: - raw_prediction = np.zeros( - shape=(n_samples,), dtype=baseline_prediction.dtype - ) + raw_prediction = np.zeros(shape=(n_samples,), dtype=baseline_prediction.dtype) else: raw_prediction = np.zeros( shape=(n_samples, loss.n_classes), dtype=baseline_prediction.dtype @@ -555,7 +541,9 @@ def test_gradients_hessians_numerically(loss, sample_weight): def loss_func(x): return loss.loss( - y_true=y_true, raw_prediction=x, sample_weight=sample_weight, + y_true=y_true, + raw_prediction=x, + sample_weight=sample_weight, ) g_numeric = numerical_derivative(loss_func, raw_prediction, eps=1e-6) @@ -563,7 +551,9 @@ def loss_func(x): def grad_func(x): return loss.gradient( - y_true=y_true, raw_prediction=x, sample_weight=sample_weight, + y_true=y_true, + raw_prediction=x, + sample_weight=sample_weight, ) h_numeric = numerical_derivative(grad_func, raw_prediction, eps=1e-6) @@ -588,9 +578,7 @@ def loss_func(x): sample_weight=sample_weight, ) - g_numeric = numerical_derivative( - loss_func, raw_prediction[:, k], eps=1e-5 - ) + g_numeric = numerical_derivative(loss_func, raw_prediction[:, k], eps=1e-5) assert_allclose(g[:, k], g_numeric, rtol=5e-6, atol=1e-10) def grad_func(x): @@ -602,9 +590,7 @@ def grad_func(x): sample_weight=sample_weight, )[:, k] - h_numeric = numerical_derivative( - grad_func, raw_prediction[:, k], eps=1e-6 - ) + h_numeric = numerical_derivative(grad_func, raw_prediction[:, k], eps=1e-6) if loss.approx_hessian: assert np.all(h >= h_numeric) else: @@ -676,9 +662,7 @@ def fprime2(x: np.ndarray) -> np.ndarray: optimum = optimum.ravel() assert_allclose(loss.inverse(optimum), y_true) assert_allclose(func(optimum), 0, atol=1e-14) - assert_allclose( - loss.gradient(y_true=y_true, raw_prediction=optimum), 0, atol=5e-7 - ) + assert_allclose(loss.gradient(y_true=y_true, raw_prediction=optimum), 0, atol=5e-7) @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @@ -740,7 +724,7 @@ def fun(x): method="SLSQP", constraints={ "type": "eq", - "fun": lambda x: np.ones((1, loss.n_classes)) @ x + "fun": lambda x: np.ones((1, loss.n_classes)) @ x, }, ) grad = loss.gradient( @@ -784,9 +768,7 @@ def test_specific_fit_intercept_only(loss, func, random_dist): assert baseline_prediction == approx(loss.link(func(y_train))) assert loss.inverse(baseline_prediction) == approx(func(y_train)) if isinstance(loss, IdentityLink): - assert_allclose( - loss.inverse(baseline_prediction), baseline_prediction - ) + assert_allclose(loss.inverse(baseline_prediction), baseline_prediction) # Test baseline at boundary if loss.interval_y_true.low_inclusive: @@ -835,5 +817,5 @@ def test_binary_and_categorical_crossentropy(): raw_cce[:, 1] = 0.5 * raw_prediction assert_allclose( bce.loss(y_true=y_train, raw_prediction=raw_prediction), - cce.loss(y_true=y_train, raw_prediction=raw_cce) + cce.loss(y_true=y_train, raw_prediction=raw_cce), ) From 12b4634ea5380b9a4c19c62e6e5f5ba1f21698b8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Jul 2021 22:17:04 +0200 Subject: [PATCH 025/143] ENH Tempita for losses --- .gitignore | 1 + setup.cfg | 1 + 
sklearn/_loss/{_loss.pyx => _loss.pyx.tp} | 1021 ++++----------------- sklearn/_loss/setup.py | 6 + 4 files changed, 206 insertions(+), 823 deletions(-) rename sklearn/_loss/{_loss.pyx => _loss.pyx.tp} (60%) diff --git a/.gitignore b/.gitignore index 3ebd8e2bb1699..e8f2a354a3e26 100644 --- a/.gitignore +++ b/.gitignore @@ -76,6 +76,7 @@ _configtest.o.d .mypy_cache/ # files generated from a template +sklearn/_loss/_loss.pyx sklearn/utils/_seq_dataset.pyx sklearn/utils/_seq_dataset.pxd sklearn/linear_model/_sag_fast.pyx diff --git a/setup.cfg b/setup.cfg index 8ee90da7436c0..3c11d832d6719 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,6 +65,7 @@ allow_redefinition = True [check-manifest] # ignore files missing in VCS ignore = + sklearn/_loss/_loss.pyx sklearn/linear_model/_sag_fast.pyx sklearn/utils/_seq_dataset.pxd sklearn/utils/_seq_dataset.pyx diff --git a/sklearn/_loss/_loss.pyx b/sklearn/_loss/_loss.pyx.tp similarity index 60% rename from sklearn/_loss/_loss.pyx rename to sklearn/_loss/_loss.pyx.tp index df1c7ec8e8e79..92dcf57e9f1fb 100644 --- a/sklearn/_loss/_loss.pyx +++ b/sklearn/_loss/_loss.pyx.tp @@ -1,3 +1,164 @@ +{{py: + +""" +Template file for easily generate fused types consistent code using Tempita +(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). + +Generated file: _loss.pyx + +Each loss class is generated by a cdef functions on single samples. +The keywords between double braces are substituted in setup.py. +""" + +doc_SquaredError = ( + """Half Squared Error with identity link. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + """ +) + +doc_AbsoluteError = ( + """Absolute Error with identity link. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + """ +) + +doc_PinballLoss = ( + """Quantile Loss aka Pinball Loss with identity link. + + Domain: + y_true and y_pred all real numbers + quantile in (0, 1) + + Link: + y_pred = raw_prediction + + Note: 2 * cPinballLoss(quantile=0.5) equals cAbsoluteError() + """ +) + +doc_PoissonLoss = ( + """Half Poisson deviance loss with log-link. + + Domain: + y_true in non-negative real numbers + y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Poisson deviance with log-link is + y_true * log(y_true/y_pred) + y_pred - y_true + = y_true * log(y_true) - y_true * raw_prediction + + exp(raw_prediction) - y_true + + Dropping constant terms, this gives: + exp(raw_prediction) - y_true * raw_prediction + """ +) + +doc_GammaLoss = ( + """Half Gamma deviance loss with log-link. + + Domain: + y_true and y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Gamma deviance with log-link is + log(y_pred/y_true) + y_true/y_pred - 1 + = raw_prediction - log(y_true) + y_true * exp(-raw_prediction) - 1 + + Dropping constant terms, this gives: + raw_prediction + y_true * exp(-raw_prediction) + """ +) + +doc_TweedieLoss = ( + """Half Tweedie deviance loss with log-link. 
+ + Domain: + y_true in real numbers if p <= 0 + y_true in non-negative real numbers if 0 < p < 2 + y_true in positive real numbers if p >= 2 + y_pred and power in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Tweedie deviance with log-link and p=power is + max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * y_pred**(1-p) / (1-p) + + y_pred**(2-p) / (2-p) + = max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * exp((1-p) * raw_prediction) / (1-p) + + exp((2-p) * raw_prediction) / (2-p) + + Dropping constant terms, this gives: + exp((2-p) * raw_prediction) / (2-p) + - y_true * exp((1-p) * raw_prediction) / (1-p) + + Notes: + - Poisson with p=1 and and Gamma with p=2 have different terms dropped such + that cHalfTweedieLoss is not continuous in p=power at p=1 and p=2. + - While the Tweedie distribution only exists for p<=0 or p>=1, the range + 0= 2 - y_pred and power in positive real numbers - - Link: - y_pred = exp(raw_prediction) - - Half Tweedie deviance with log-link and p=power is - max(y_true, 0)**(2-p) / (1-p) / (2-p) - - y_true * y_pred**(1-p) / (1-p) - + y_pred**(2-p) / (2-p) - = max(y_true, 0)**(2-p) / (1-p) / (2-p) - - y_true * exp((1-p) * raw_prediction) / (1-p) - + exp((2-p) * raw_prediction) / (2-p) - - Dropping constant terms, this gives: - exp((2-p) * raw_prediction) / (2-p) - - y_true * exp((1-p) * raw_prediction) / (1-p) - - Notes: - - Poisson with p=1 and and Gamma with p=2 have different terms dropped such - that cHalfTweedieLoss is not continuous in p=power at p=1 and p=2. - - While the Tweedie distribution only exists for p<=0 or p>=1, the range - 0 Date: Mon, 19 Jul 2021 22:59:15 +0200 Subject: [PATCH 026/143] MNT apply black --- sklearn/_loss/link.py | 4 +--- sklearn/_loss/loss.py | 24 ++++++------------------ sklearn/_loss/tests/test_link.py | 17 ++++++----------- 3 files changed, 13 insertions(+), 32 deletions(-) diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index f5567e6dd7b49..ed9f12b577c62 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -12,9 +12,7 @@ from ..utils.extmath import softmax -Interval = namedtuple( - "Interval", ("low", "high", "low_inclusive", "high_inclusive") -) +Interval = namedtuple("Interval", ("low", "high", "low_inclusive", "high_inclusive")) def is_in_interval_range(x, interval): diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 321d000636fe5..37818c33b3978 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -354,9 +354,7 @@ def gradient_hessian( n_threads=n_threads, ) - def __call__( - self, y_true, raw_prediction, sample_weight=None, n_threads=1 - ): + def __call__(self, y_true, raw_prediction, sample_weight=None, n_threads=1): """Compute the weighted average loss. Parameters @@ -473,11 +471,7 @@ def gradient( # Be graceful to shape (n_samples, 1) -> (n_samples,) if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) - if ( - gradient is not None - and gradient.ndim == 2 - and gradient.shape[1] == 1 - ): + if gradient is not None and gradient.ndim == 2 and gradient.shape[1] == 1: gradient = gradient.squeeze(1) # gradient = raw_prediction - y_true is easier in numpy @@ -588,7 +582,7 @@ def __init__(self, sample_weight=None, quantile=0.5): self.constant_hessian = False if quantile <= 0 or quantile >= 1: raise ValueError( - f"PinballLoss aka quantile loss only accepts " + "PinballLoss aka quantile loss only accepts " f"0 < quantile < 1; {quantile} was given." 
) @@ -601,9 +595,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): if sample_weight is None: return np.percentile(y_true, 100 * self.quantile, axis=0) else: - return _weighted_percentile( - y_true, sample_weight, 100 * self.quantile - ) + return _weighted_percentile(y_true, sample_weight, 100 * self.quantile) class HalfPoissonLoss(LogLink, BaseLoss, cHalfPoissonLoss): @@ -776,17 +768,13 @@ def predict_proba(self, raw_prediction): # Be graceful to shape (n_samples, 1) -> (n_samples,) if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) - proba = np.empty( - (raw_prediction.shape[0], 2), dtype=raw_prediction.dtype - ) + proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype) proba[:, 1] = self.inverse(raw_prediction) proba[:, 0] = 1 - proba[:, 1] return proba -class CategoricalCrossEntropy( - MultinomialLogit, BaseLoss, cCategoricalCrossEntropy -): +class CategoricalCrossEntropy(MultinomialLogit, BaseLoss, cCategoricalCrossEntropy): """Categorical cross-entropy loss, for multiclass classification. Domain: diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index b049f5ac637d6..f2846c17b3f1d 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -36,8 +36,7 @@ def test_is_in_range(interval): # x contains lower bound assert ( - is_in_interval_range(np.r_[x, interval.low], interval) - == interval.low_inclusive + is_in_interval_range(np.r_[x, interval.low], interval) == interval.low_inclusive ) # x contains upper bound @@ -47,9 +46,9 @@ def test_is_in_range(interval): ) # x contains upper and lower bound - assert is_in_interval_range( - np.r_[x, interval.low, interval.high], interval - ) == (interval.low_inclusive and interval.high_inclusive) + assert is_in_interval_range(np.r_[x, interval.low, interval.high], interval) == ( + interval.low_inclusive and interval.high_inclusive + ) @pytest.mark.parametrize("link", LINK_FUNCTIONS) @@ -60,9 +59,7 @@ def test_link_inverse_identity(link): n_samples, n_classes = 100, None if link.multiclass: n_classes = 10 - raw_prediction = rng.normal( - loc=0, scale=10, size=(n_samples, n_classes) - ) + raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples, n_classes)) if isinstance(link, MultinomialLogit): raw_prediction = link.symmetrize_raw_prediction(raw_prediction) else: @@ -83,9 +80,7 @@ def test_link_out_argument(link): n_samples, n_classes = 100, None if link.multiclass: n_classes = 10 - raw_prediction = rng.normal( - loc=0, scale=10, size=(n_samples, n_classes) - ) + raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples, n_classes)) if isinstance(link, MultinomialLogit): raw_prediction = link.symmetrize_raw_prediction(raw_prediction) else: From 98f88778c3ab0d40955a9ded83460955620e1116 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 20 Jul 2021 20:22:44 +0200 Subject: [PATCH 027/143] TST replace np.quantile by np.percentile --- sklearn/_loss/tests/test_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 5e674ccc00942..1673382114378 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -743,7 +743,7 @@ def fun(x): [ (HalfSquaredError(), np.mean, "normal"), (AbsoluteError(), np.median, "normal"), - (PinballLoss(quantile=0.25), lambda x: np.quantile(x, q=0.25), "normal"), + (PinballLoss(quantile=0.25), lambda x: np.percentile(x, q=25), "normal"), 
(HalfPoissonLoss(), np.mean, "poisson"), (HalfGammaLoss(), np.mean, "exponential"), (HalfTweedieLoss(), np.mean, "exponential"), From 3f8ffe972c23a88e0449743c43e06e0202df39f9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 13:51:30 +0200 Subject: [PATCH 028/143] ENH make Interval a dataclass - function is_in_interval_range -> method Interval.includes --- sklearn/_loss/link.py | 65 +++++++++++++++++--------------- sklearn/_loss/loss.py | 9 ++--- sklearn/_loss/tests/test_link.py | 14 ++----- 3 files changed, 42 insertions(+), 46 deletions(-) diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index ed9f12b577c62..a172ac4d9e49c 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -4,7 +4,7 @@ # Author: Christian Lorentzen from abc import ABC, abstractmethod -from collections import namedtuple +from dataclasses import dataclass import numpy as np from scipy.special import expit, logit @@ -12,41 +12,46 @@ from ..utils.extmath import softmax -Interval = namedtuple("Interval", ("low", "high", "low_inclusive", "high_inclusive")) +@dataclass +class Interval: + low: float + high: float + low_inclusive: bool + high_inclusive: bool + def includes(self, x): + """Test whether values of x are in interval range. -def is_in_interval_range(x, interval): - """Test whether values of x are in interval range from Interval. - - Parameters - ---------- - x : ndarray - Array whose elements are tested to be in interval range. - interval: Interval - An Interval range. - """ - if interval.low_inclusive: - low = np.greater_equal(x, interval.low) - else: - low = np.greater(x, interval.low) + Parameters + ---------- + x : ndarray + Array whose elements are tested to be in interval range. + """ + if self.low_inclusive: + low = np.greater_equal(x, self.low) + else: + low = np.greater(x, self.low) - if not np.all(low): - return False + if not np.all(low): + return False - if interval.high_inclusive: - high = np.less_equal(x, interval.high) - else: - high = np.less(x, interval.high) + if self.high_inclusive: + high = np.less_equal(x, self.high) + else: + high = np.less(x, self.high) - # Note: np.all returns numpy.bool_ - if np.all(high): - return True - else: - return False + # Note: np.all returns numpy.bool_ + if np.all(high): + return True + else: + return False def _inclusive_low_high(interval, dtype=float): - """Generate values low and high to be within the interval range.""" + """Generate values low and high to be within the interval range. + + This is used in tests only. + """ eps = 10 * np.finfo(dtype).eps if interval.low == -np.inf: low = -1e10 @@ -76,8 +81,8 @@ class BaseLink(ABC): called linear predictor, and `y_pred = h(raw_prediction)` is the predicted conditional (on X) expected value of the target `y_true`. - In case a link function needs parameters, the methods are not implemented - as staticmethods. + The methods are not implemented as staticmethods in case a link function needs + parameters. """ multiclass = False diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 37818c33b3978..1608bd27902a8 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -30,7 +30,6 @@ ) from .link import ( Interval, - is_in_interval_range, BaseLink, IdentityLink, LogLink, @@ -125,7 +124,7 @@ def in_y_true_range(self, y): ---------- y : ndarray """ - return is_in_interval_range(y, self.interval_y_true) + return self.interval_y_true.includes(y) def in_y_pred_range(self, y): """Return True if y is in the valid range of y_pred. 
@@ -134,7 +133,7 @@ def in_y_pred_range(self, y): ---------- y : ndarray """ - return is_in_interval_range(y, self.interval_y_pred) + return self.interval_y_pred.includes(y) def loss( self, @@ -823,9 +822,7 @@ def in_y_true_range(self, y): ---------- y : ndarray """ - return is_in_interval_range(y, self.interval_y_true) and np.all( - y.astype(int) == y - ) + return self.interval_y_true.includes(y) and np.all(y.astype(int) == y) def fit_intercept_only(self, y_true, sample_weight=None): """Compute raw_prediction of an intercept-only model. diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index f2846c17b3f1d..d9b1e36e68a19 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -7,7 +7,6 @@ _inclusive_low_high, MultinomialLogit, Interval, - is_in_interval_range, ) @@ -32,21 +31,16 @@ def test_is_in_range(interval): low, high = _inclusive_low_high(interval) x = np.linspace(low, high, num=10) - assert is_in_interval_range(x, interval) + assert interval.includes(x) # x contains lower bound - assert ( - is_in_interval_range(np.r_[x, interval.low], interval) == interval.low_inclusive - ) + assert interval.includes(np.r_[x, interval.low]) == interval.low_inclusive # x contains upper bound - assert ( - is_in_interval_range(np.r_[x, interval.high], interval) - == interval.high_inclusive - ) + assert interval.includes(np.r_[x, interval.high]) == interval.high_inclusive # x contains upper and lower bound - assert is_in_interval_range(np.r_[x, interval.low, interval.high], interval) == ( + assert interval.includes(np.r_[x, interval.low, interval.high]) == ( interval.low_inclusive and interval.high_inclusive ) From b5e61d273c8bf2b8ecd1055e7ed433deefa52cb7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 13:58:55 +0200 Subject: [PATCH 029/143] DOC improve docstrings in link.py --- sklearn/_loss/link.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index a172ac4d9e49c..b3ba52d7c3bce 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -20,12 +20,16 @@ class Interval: high_inclusive: bool def includes(self, x): - """Test whether values of x are in interval range. + """Test whether all values of x are in interval range. Parameters ---------- x : ndarray Array whose elements are tested to be in interval range. + + Returns + ------- + result : bool """ if self.low_inclusive: low = np.greater_equal(x, self.low) @@ -51,6 +55,11 @@ def _inclusive_low_high(interval, dtype=float): """Generate values low and high to be within the interval range. This is used in tests only. + + Returns + ------- + low, high : tuple + The returned values low and high lie within the interval. """ eps = 10 * np.finfo(dtype).eps if interval.low == -np.inf: From cfdd67c95ea98bb59c4c95414e5ac4a2190465e9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 14:00:58 +0200 Subject: [PATCH 030/143] MNT use numpy dtype instead of Python type --- sklearn/_loss/link.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index b3ba52d7c3bce..9bb223eb0dca6 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -51,7 +51,7 @@ def includes(self, x): return False -def _inclusive_low_high(interval, dtype=float): +def _inclusive_low_high(interval, dtype=np.float64): """Generate values low and high to be within the interval range. This is used in tests only. 
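As a quick reference for the Interval helper refined in the last few patches, a minimal usage sketch (the class and its includes() method live in sklearn/_loss/link.py as shown above; the interval bounds and test values below are made up for illustration):

    import numpy as np
    from sklearn._loss.link import Interval

    interval = Interval(0, 1, False, False)            # open unit interval (0, 1)
    print(interval.includes(np.array([0.25, 0.75])))   # True: all values strictly inside
    print(interval.includes(np.array([0.0, 0.75])))    # False: 0 is excluded since low_inclusive=False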
From 73311e9a3ba8df4a7acd5d9e8b0f2ea7140bd1e5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 14:08:28 +0200 Subject: [PATCH 031/143] TST add negative intervals --- sklearn/_loss/tests/test_link.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index d9b1e36e68a19..6a7f1b7598b73 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -24,6 +24,10 @@ Interval(-np.inf, np.inf, False, True), Interval(-np.inf, np.inf, True, False), Interval(-np.inf, np.inf, True, True), + Interval(-10, -1, False, False), + Interval(-10, -1, False, True), + Interval(-10, -1, True, False), + Interval(-10, -1, True, True), ], ) def test_is_in_range(interval): From 36922808018031a23e8ac37510b6d54c1f04ceae Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 14:14:02 +0200 Subject: [PATCH 032/143] ENH add __post_init__ to class Interval --- sklearn/_loss/link.py | 5 +++++ sklearn/_loss/tests/test_link.py | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index 9bb223eb0dca6..b756e275c6d0e 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -19,6 +19,11 @@ class Interval: low_inclusive: bool high_inclusive: bool + def __post_init__(self): + """Check that low <= high""" + if self.low > self.high: + raise ValueError("On must have low <= high; got low={low}, high={high}.") + def includes(self, x): """Test whether all values of x are in interval range. diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index 6a7f1b7598b73..3239ade25f3c7 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -13,6 +13,12 @@ LINK_FUNCTIONS = list(_LINKS.values()) +def test_interval_raises(): + """Test that interval with low > high raises ValueError.""" + with pytest.raises(ValueError, match="On must have low <= high"): + Interval(1, 0, False, False) + + @pytest.mark.parametrize( "interval", [ From 4cb4d3d4636061dbafcd9d759c46ffbc3cda9d1d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 14:54:25 +0200 Subject: [PATCH 033/143] MNT rename cython losses - class name from cLoss to CyLoss - single sample methods from closs to cy_loss, cgradient to cy_gradient, .. 
--- sklearn/_loss/_loss.pxd | 64 +++++++++++++++++++------------------- sklearn/_loss/_loss.pyx.tp | 57 +++++++++++++++++---------------- sklearn/_loss/loss.py | 42 ++++++++++++------------- 3 files changed, 83 insertions(+), 80 deletions(-) diff --git a/sklearn/_loss/_loss.pxd b/sklearn/_loss/_loss.pxd index 8ad45f3bed389..b00379a1e793d 100644 --- a/sklearn/_loss/_loss.pxd +++ b/sklearn/_loss/_loss.pxd @@ -25,51 +25,51 @@ ctypedef struct double_pair: # C base class for loss functions -cdef class cLossFunction: - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil +cdef class CyLossFunction: + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class cHalfSquaredError(cLossFunction): - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil +cdef class CyHalfSquaredError(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class cAbsoluteError(cLossFunction): - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil +cdef class CyAbsoluteError(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class cPinballLoss(cLossFunction): +cdef class CyPinballLoss(CyLossFunction): cdef readonly double quantile # readonly makes it inherited by children - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class cHalfPoissonLoss(cLossFunction): - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil +cdef class CyHalfPoissonLoss(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class cHalfGammaLoss(cLossFunction): - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil +cdef class 
CyHalfGammaLoss(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class cHalfTweedieLoss(cLossFunction): +cdef class CyHalfTweedieLoss(CyLossFunction): cdef readonly double power # readonly makes it inherited by children - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class cBinaryCrossEntropy(cLossFunction): - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil +cdef class CyBinaryCrossEntropy(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 92dcf57e9f1fb..63ada42133dcf 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -1,12 +1,12 @@ {{py: """ -Template file for easily generate fused types consistent code using Tempita +Template file for easily generate loops over samples using Tempita (https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). Generated file: _loss.pyx -Each loss class is generated by a cdef functions on single samples. +Each loss class is generated by a cdef functions on single samples. The keywords between double braces are substituted in setup.py. 
""" @@ -129,33 +129,36 @@ doc_BinaryCrossEntropy = ( """ ) -# loss class name, docstring, param, closs, closs_grad, cgrad, cgrad_hess, +# loss class name, docstring, param, +# cy_loss, cy_loss_grad, +# cy_grad, cy_grad_hess, class_list = [ - ("cHalfSquaredError", doc_SquaredError, None, + ("CyHalfSquaredError", doc_SquaredError, None, "closs_half_squared_error", None, "cgradient_half_squared_error", "cgrad_hess_half_squared_error"), - ("cAbsoluteError", doc_AbsoluteError, None, + ("CyAbsoluteError", doc_AbsoluteError, None, "closs_absolute_error", None, "cgradient_absolute_error", "cgrad_hess_absolute_error"), - ("cPinballLoss", doc_PinballLoss, "quantile", + ("CyPinballLoss", doc_PinballLoss, "quantile", "closs_pinball_loss", None, "cgradient_pinball_loss", "cgrad_hess_pinball_loss"), - ("cHalfPoissonLoss", doc_PoissonLoss, None, + ("CyHalfPoissonLoss", doc_PoissonLoss, None, "closs_half_poisson", "closs_grad_half_poisson", "cgradient_half_poisson", "cgrad_hess_half_poisson"), - ("cHalfGammaLoss", doc_GammaLoss, None, + ("CyHalfGammaLoss", doc_GammaLoss, None, "closs_half_gamma", "closs_grad_half_gamma", "cgradient_half_gamma", "cgrad_hess_half_gamma"), - ("cHalfTweedieLoss", doc_TweedieLoss, "power", + ("CyHalfTweedieLoss", doc_TweedieLoss, "power", "closs_half_tweedie", "closs_grad_half_tweedie", "cgradient_half_tweedie", "cgrad_hess_half_tweedie"), - ("cBinaryCrossEntropy", doc_BinaryCrossEntropy, None, + ("CyBinaryCrossEntropy", doc_BinaryCrossEntropy, None, "closs_binary_crossentropy", "closs_grad_binary_crossentropy", "cgradient_binary_crossentropy", "cgrad_hess_binary_crossentropy"), ] }} """ -WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp +WARNING: Do not edit `sklearn/_loss/_loss.pyx` file directly, as it is generated from +`sklearn/_loss/_loss.pyx.tp`. Changes must be made there. """ #------------------------------------------------------------------------------ @@ -170,22 +173,22 @@ WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp # stability, i.e. use raw_prediction instead of y_pred in signature. # b) Pure C functions (nogil) calculate single points (single sample) # c) Wrap C functions in a loop to get Python functions operating on ndarrays. -# - Write loops manually. +# - Write loops manually---use Tempita for this. # Reason: There is still some performance overhead when using a wrapper # function "wrap" that carries out the loop and gets as argument a function # pointer to one of the C functions from b), e.g. # wrap(closs_half_poisson, y_true, ...) # - Pass n_threads as argument to prange and propagate option to all callers. -# d) Provide classes (Cython extension types) per loss in order to have -# semantical structured objects. -# - Member function for single points just call the C function from b). +# d) Provide classes (Cython extension types) per loss (names start with Cy) in +# order to have semantical structured objects. +# - Member functions for single points just call the C function from b). # These are used e.g. in SGD `_plain_sgd`. -# - Member functions operating on ndarrays looping, see c), over calls to C +# - Member functions operating on ndarrays, see c), looping over calls to C # functions from b). # e) Provide convenience Python classes that inherit from these extension types # elsewhere (see loss.py) -# - Example: loss.gradient calls extension_type._gradient but does some -# input checking like None -> np.empty(). 
+# - Example: loss.gradient calls CyLoss._gradient but does some input +# checking like None -> np.empty(). # # Note: We require 1-dim ndarrays to be contiguous. # TODO: Use const memoryviews with fused types with Cython 3.0 where @@ -571,10 +574,10 @@ cdef inline double_pair cgrad_hess_binary_crossentropy( # --------------------------------------------------- # Extension Types for Loss Functions of 1-dim targets # --------------------------------------------------- -cdef class cLossFunction: +cdef class CyLossFunction: """Base class for convex loss functions.""" - cdef double closs(self, double y_true, double raw_prediction) nogil: + cdef double cy_loss(self, double y_true, double raw_prediction) nogil: """Compute the loss for a single sample. Parameters @@ -591,7 +594,7 @@ cdef class cLossFunction: """ pass - cdef double cgradient(self, double y_true, double raw_prediction) nogil: + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil: """Compute gradient of loss w.r.t. raw_prediction for a single sample. Parameters @@ -608,7 +611,7 @@ cdef class cLossFunction: """ pass - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil: """Compute gradient and hessian. Gradient and hessian of loss w.r.t. raw_prediction for a single sample. @@ -782,7 +785,7 @@ else: with_param = ", self." + param }} -cdef class {{name}}(cLossFunction): +cdef class {{name}}(CyLossFunction): """{{docstring}}""" {{if param is not None}} @@ -790,13 +793,13 @@ cdef class {{name}}(cLossFunction): self.{{param}} = {{param}} {{endif}} - cdef inline double closs(self, double y_true, double raw_prediction) nogil: + cdef inline double cy_loss(self, double y_true, double raw_prediction) nogil: return {{closs}}(y_true, raw_prediction{{with_param}}) - cdef inline double cgradient(self, double y_true, double raw_prediction) nogil: + cdef inline double cy_gradient(self, double y_true, double raw_prediction) nogil: return {{cgrad}}(y_true, raw_prediction{{with_param}}) - cdef inline double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef inline double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil: return {{cgrad_hess}}(y_true, raw_prediction{{with_param}}) def _loss( @@ -916,7 +919,7 @@ cdef class {{name}}(cLossFunction): {{endfor}} -cdef class cCategoricalCrossEntropy(cLossFunction): +cdef class CyCategoricalCrossEntropy(CyLossFunction): """CategoricalCrossEntropy with multinomial logit link. 
Domain: diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 1608bd27902a8..53add11d9c89e 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -18,15 +18,15 @@ import numpy as np from scipy.special import xlogy from ._loss import ( - cLossFunction, - cHalfSquaredError, - cAbsoluteError, - cPinballLoss, - cHalfPoissonLoss, - cHalfGammaLoss, - cHalfTweedieLoss, - cBinaryCrossEntropy, - cCategoricalCrossEntropy, + CyLossFunction, + CyHalfSquaredError, + CyAbsoluteError, + CyPinballLoss, + CyHalfPoissonLoss, + CyHalfGammaLoss, + CyHalfTweedieLoss, + CyBinaryCrossEntropy, + CyCategoricalCrossEntropy, ) from .link import ( Interval, @@ -42,7 +42,7 @@ # Note: The shape of raw_prediction for multiclass classifications are # - GradientBoostingClassifier: (n_samples, n_classes) # - HistGradientBoostingClassifier: (n_classes, n_samples) -class BaseLoss(BaseLink, cLossFunction): +class BaseLoss(BaseLink, CyLossFunction): """Base class for a loss function of 1-dimensional targets. Conventions: @@ -94,7 +94,7 @@ class BaseLoss(BaseLink, cLossFunction): # - link # - inverse # - # Inherited methods from cLossFunction: + # Inherited methods from CyLossFunction: # - _loss, _loss_gradient, _gradient, _gradient_hessian # For decision trees: @@ -434,7 +434,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return np.zeros_like(y_true) -class HalfSquaredError(IdentityLink, BaseLoss, cHalfSquaredError): +class HalfSquaredError(IdentityLink, BaseLoss, CyHalfSquaredError): """Half squared error with identity link, for regression. Domain: @@ -505,7 +505,7 @@ def gradient_hessian( return gradient, hessian -class AbsoluteError(IdentityLink, BaseLoss, cAbsoluteError): +class AbsoluteError(IdentityLink, BaseLoss, CyAbsoluteError): """Absolute error with identity link, for regression. Domain: @@ -542,7 +542,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): return _weighted_percentile(y_true, sample_weight, 50) -class PinballLoss(IdentityLink, BaseLoss, cPinballLoss): +class PinballLoss(IdentityLink, BaseLoss, CyPinballLoss): """Quantile loss aka pinball loss, for regression. Domain: @@ -573,7 +573,7 @@ class PinballLoss(IdentityLink, BaseLoss, cPinballLoss): def __init__(self, sample_weight=None, quantile=0.5): BaseLoss.__init__(self) - cPinballLoss.__init__(self, quantile=float(quantile)) + CyPinballLoss.__init__(self, quantile=float(quantile)) self.approx_hessian = True if sample_weight is None: self.constant_hessian = True @@ -597,7 +597,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): return _weighted_percentile(y_true, sample_weight, 100 * self.quantile) -class HalfPoissonLoss(LogLink, BaseLoss, cHalfPoissonLoss): +class HalfPoissonLoss(LogLink, BaseLoss, CyHalfPoissonLoss): """Poisson deviance loss with log-link, for regression. Domain: @@ -630,7 +630,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class HalfGammaLoss(LogLink, BaseLoss, cHalfGammaLoss): +class HalfGammaLoss(LogLink, BaseLoss, CyHalfGammaLoss): """Gamma deviance loss with log-link, for regression. Domain: @@ -662,7 +662,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class HalfTweedieLoss(LogLink, BaseLoss, cHalfTweedieLoss): +class HalfTweedieLoss(LogLink, BaseLoss, CyHalfTweedieLoss): """Tweedie deviance loss with log-link, for regression. 
Domain: @@ -695,7 +695,7 @@ class HalfTweedieLoss(LogLink, BaseLoss, cHalfTweedieLoss): def __init__(self, sample_weight=None, power=1.5): BaseLoss.__init__(self) - cHalfTweedieLoss.__init__(self, power=power) + CyHalfTweedieLoss.__init__(self, power=power) self.interval_y_pred = Interval(0, np.inf, False, False) if self.power <= 0: self.interval_y_true = Interval(-np.inf, np.inf, False, False) @@ -725,7 +725,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class BinaryCrossEntropy(LogitLink, BaseLoss, cBinaryCrossEntropy): +class BinaryCrossEntropy(LogitLink, BaseLoss, CyBinaryCrossEntropy): """Binary cross entropy loss with logit link, for binary classification. Domain: @@ -773,7 +773,7 @@ def predict_proba(self, raw_prediction): return proba -class CategoricalCrossEntropy(MultinomialLogit, BaseLoss, cCategoricalCrossEntropy): +class CategoricalCrossEntropy(MultinomialLogit, BaseLoss, CyCategoricalCrossEntropy): """Categorical cross-entropy loss, for multiclass classification. Domain: From c13e3b1dfc72658b31044296d7336c10d33979bc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 15:32:55 +0200 Subject: [PATCH 034/143] TST loss.predict_proba --- sklearn/_loss/tests/test_loss.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 1673382114378..4d977d4078071 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -819,3 +819,24 @@ def test_binary_and_categorical_crossentropy(): bce.loss(y_true=y_train, raw_prediction=raw_prediction), cce.loss(y_true=y_train, raw_prediction=raw_cce), ) + + +@pytest.mark.parametrize( + "loss", + [loss for loss in LOSS_INSTANCES if hasattr(loss, "predict_proba")], + ids=loss_instance_name, +) +def test_predict_proba(loss): + """Test that predict_proba works as expected.""" + n_samples = 20 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=42, + ) + proba = loss.predict_proba(raw_prediction) + + assert proba.shape == (n_samples, loss.n_classes) + assert np.sum(proba, axis=1) == approx(1) From 390ff19478563b7736dfc1808a0efa8e4266791c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 15:44:05 +0200 Subject: [PATCH 035/143] TST predict_proba and gradient_proba --- sklearn/_loss/tests/test_loss.py | 41 +++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 4d977d4078071..47a212b77a34d 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -821,13 +821,9 @@ def test_binary_and_categorical_crossentropy(): ) -@pytest.mark.parametrize( - "loss", - [loss for loss in LOSS_INSTANCES if hasattr(loss, "predict_proba")], - ids=loss_instance_name, -) +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) def test_predict_proba(loss): - """Test that predict_proba works as expected.""" + """Test that predict_proba and gradient_proba work as expected.""" n_samples = 20 y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, @@ -836,7 +832,34 @@ def test_predict_proba(loss): raw_bound=(-5, 5), seed=42, ) - proba = loss.predict_proba(raw_prediction) - assert proba.shape == (n_samples, loss.n_classes) - assert np.sum(proba, axis=1) == approx(1) + if hasattr(loss, "predict_proba"): + proba = 
loss.predict_proba(raw_prediction) + assert proba.shape == (n_samples, loss.n_classes) + assert np.sum(proba, axis=1) == approx(1) + + if hasattr(loss, "gradient_proba"): + for grad, proba in ( + (None, None), + (None, np.empty_like(raw_prediction)), + (np.empty_like(raw_prediction), None), + (np.empty_like(raw_prediction), np.empty_like(raw_prediction)), + ): + grad, proba = loss.gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + gradient=grad, + proba=proba, + ) + assert proba.shape == (n_samples, loss.n_classes) + assert np.sum(proba, axis=1) == approx(1) + assert_allclose( + grad, + loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + gradient=None, + ), + ) From ebd9f409025501e25d9717cf825e1d85b281febd Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 15:56:05 +0200 Subject: [PATCH 036/143] MNT use is_multiclass in tests instead of n_classes <= 2 --- sklearn/_loss/tests/test_loss.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 47a212b77a34d..4527b58a09ea7 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -442,7 +442,7 @@ def test_sample_weight_multiplies_gradients(loss, sample_weight): baseline_prediction = loss.fit_intercept_only(y_true=y_true, sample_weight=None) - if loss.n_classes <= 2: + if not loss.is_multiclass: raw_prediction = np.zeros(shape=(n_samples,), dtype=baseline_prediction.dtype) else: raw_prediction = np.zeros( @@ -460,7 +460,7 @@ def test_sample_weight_multiplies_gradients(loss, sample_weight): sample_weight=sample_weight, ) - if loss.n_classes <= 2: + if not loss.is_multiclass: assert_allclose(gradient * sample_weight, gradient_sw) assert_allclose(hessian * sample_weight, hessian_sw) else: @@ -476,7 +476,7 @@ def test_loss_of_perfect_prediction(loss, sample_weight): Loss of y_pred = y_true plus constant_to_optimal_zero should sums up to zero. """ - if loss.n_classes <= 2: + if not loss.is_multiclass: # Use small values such that exp(value) is not nan. raw_prediction = np.array([-10, -0.1, 0, 0.1, 3, 10]) y_true = loss.inverse(raw_prediction) @@ -537,7 +537,7 @@ def test_gradients_hessians_numerically(loss, sample_weight): assert g.shape == raw_prediction.shape assert h.shape == raw_prediction.shape - if loss.n_classes <= 2: + if not loss.is_multiclass: def loss_func(x): return loss.loss( @@ -673,7 +673,7 @@ def test_loss_intercept_only(loss, sample_weight): Also test that the gradient is zero at the minimum. 
""" n_samples = 50 - if loss.n_classes <= 2: + if not loss.is_multiclass: y_true = loss.inverse(np.linspace(-4, 4, num=n_samples)) else: y_true = np.arange(n_samples).astype(float) % loss.n_classes @@ -686,7 +686,7 @@ def test_loss_intercept_only(loss, sample_weight): # find minimum by optimization def fun(x): - if loss.n_classes <= 2: + if not loss.is_multiclass: raw_prediction = np.full(shape=(n_samples), fill_value=x) else: raw_prediction = np.ascontiguousarray( @@ -698,7 +698,7 @@ def fun(x): sample_weight=sample_weight, ) - if loss.n_classes <= 2: + if not loss.is_multiclass: opt = minimize_scalar(fun, tol=1e-7, options={"maxiter": 100}) grad = loss.gradient( y_true=y_true, From 60e0fc46ab44fd82ec9738ffbabeac49feff0059 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 15:58:00 +0200 Subject: [PATCH 037/143] DOC docstring predict_proba and more --- sklearn/_loss/_loss.pyx.tp | 82 +++++++++++++++++++------------------- sklearn/_loss/loss.py | 28 ++++++++++++- 2 files changed, 67 insertions(+), 43 deletions(-) diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 63ada42133dcf..573b1bba8d47b 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -804,10 +804,10 @@ cdef class {{name}}(CyLossFunction): def _loss( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[::1] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[::1] loss, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] loss, # OUT int n_threads=1 ): cdef: @@ -830,11 +830,11 @@ cdef class {{name}}(CyLossFunction): {{if closs_grad is not None}} def _loss_gradient( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[::1] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[::1] loss, - G_DTYPE_C[::1] gradient, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] loss, # OUT + G_DTYPE_C[::1] gradient, # OUT int n_threads=1 ): cdef: @@ -862,10 +862,10 @@ cdef class {{name}}(CyLossFunction): def _gradient( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[::1] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[::1] gradient, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] gradient, # OUT int n_threads=1 ): cdef: @@ -887,11 +887,11 @@ cdef class {{name}}(CyLossFunction): def _gradient_hessian( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[::1] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[::1] gradient, - G_DTYPE_C[::1] hessian, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] gradient, # OUT + G_DTYPE_C[::1] hessian, # OUT int n_threads=1 ): cdef: @@ -938,10 +938,10 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): # opposite are welcome. 
def _loss( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[:, :] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[::1] loss, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[:, :] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] loss, # OUT int n_threads=1 ): cdef: @@ -998,11 +998,11 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): def _loss_gradient( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[:, :] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[::1] loss, - G_DTYPE_C[:, :] gradient, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[:, :] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] loss, # OUT + G_DTYPE_C[:, :] gradient, # OUT int n_threads=1 ): cdef: @@ -1060,10 +1060,10 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): def _gradient( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[:, :] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[:, :] gradient, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[:, :] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[:, :] gradient, # OUT int n_threads=1 ): cdef: @@ -1109,11 +1109,11 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): def _gradient_hessian( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[:, :] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[:, :] gradient, - G_DTYPE_C[:, :] hessian, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[:, :] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[:, :] gradient, # OUT + G_DTYPE_C[:, :] hessian, # OUT int n_threads=1 ): cdef: @@ -1167,11 +1167,11 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): # diagonal (in the classes) approximation as implemented above. def _gradient_proba( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[:, :] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[:, :] gradient, - G_DTYPE_C[:, :] proba, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[:, :] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[:, :] gradient, # OUT + G_DTYPE_C[:, :] proba, # OUT int n_threads=1 ): cdef: diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 53add11d9c89e..4530fd90a5212 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -764,6 +764,18 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term def predict_proba(self, raw_prediction): + """Predict probabilities. + + Parameters + ---------- + raw_prediction : array of shape (n_samples,) or (n_samples, 1) + Raw prediction values (in link space). + + Returns + ------- + proba : array of shape (n_samples, 2) + Element-wise class probabilites. + """ # Be graceful to shape (n_samples, 1) -> (n_samples,) if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) @@ -838,6 +850,18 @@ def fit_intercept_only(self, y_true, sample_weight=None): return self.link(out[None, :]).reshape(-1) def predict_proba(self, raw_prediction): + """Predict probabilities. + + Parameters + ---------- + raw_prediction : array of shape (n_samples, n_classes) + Raw prediction values (in link space). + + Returns + ------- + proba : array of shape (n_samples, n_classes) + Element-wise class probabilites. + """ return self.inverse(raw_prediction) def gradient_proba( @@ -849,7 +873,7 @@ def gradient_proba( proba=None, n_threads=1, ): - """Compute gradient and probabilities fow raw_prediction. + """Compute gradient and class probabilities fow raw_prediction. 
Parameters ---------- @@ -870,7 +894,7 @@ def gradient_proba( Returns ------- - gradient, proba : array of shape (n_samples, n_classes) + gradient : array of shape (n_samples, n_classes) Element-wise gradients. proba : array of shape (n_samples, n_classes) From 70199a44073e9ef4958c46eb4a3c0f6bb79a26d1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 17:01:44 +0200 Subject: [PATCH 038/143] MNT remove top_path from gen_from_templates --- sklearn/_loss/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/setup.py b/sklearn/_loss/setup.py index b80584d8707c8..2a2d2b5f13b8a 100644 --- a/sklearn/_loss/setup.py +++ b/sklearn/_loss/setup.py @@ -8,7 +8,7 @@ def configuration(parent_package="", top_path=None): # generate _loss.pyx from template templates = ["sklearn/_loss/_loss.pyx.tp"] - gen_from_templates(templates, top_path) + gen_from_templates(templates) config.add_extension( "_loss", From 5721077dbaf430b25ba9d7a1516491066189d945 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 17:26:23 +0200 Subject: [PATCH 039/143] CI add --allow-releaseinfo-change in circleci --- build_tools/circle/build_doc.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index 42067732b8bad..8facdc59a6c56 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -134,7 +134,7 @@ make_args="SPHINXOPTS=-T $make_args" # show full traceback on exception # Installing required system packages to support the rendering of math # notation in the HTML documentation and to optimize the image files -sudo -E apt-get -yq update +sudo -E apt-get -yq update --allow-releaseinfo-change sudo -E apt-get -yq --no-install-suggests --no-install-recommends \ install dvipng gsfonts ccache zip optipng From f53e3f831ab08decf1776e0d4996ad85fa0b64c2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 17 Aug 2021 10:27:40 +0200 Subject: [PATCH 040/143] TST test graceful squeezing --- sklearn/_loss/tests/test_loss.py | 75 +++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 12 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 4527b58a09ea7..e179c310b2d2c 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -419,10 +419,10 @@ def test_loss_gradients_are_the_same(loss, sample_weight): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", ["ones", "random"]) -def test_sample_weight_multiplies_gradients(loss, sample_weight): - """Test sample weights in gradients and hessians. +def test_sample_weight_multiplies(loss, sample_weight): + """Test sample weights in loss, gradients and hessians. - Make sure that passing sample weights to the gradient and hessians + Make sure that passing sample weights to loss, gradient and hessian computation methods is equivalent to multiplying by the weights. 
""" n_samples = 100 @@ -440,26 +440,46 @@ def test_sample_weight_multiplies_gradients(loss, sample_weight): rng = np.random.RandomState(42) sample_weight = rng.normal(size=n_samples).astype(np.float64) - baseline_prediction = loss.fit_intercept_only(y_true=y_true, sample_weight=None) + assert_allclose( + loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ), + sample_weight + * loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + ), + ) + losses, gradient = loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + ) + losses_sw, gradient_sw = loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + assert_allclose(losses * sample_weight, losses_sw) if not loss.is_multiclass: - raw_prediction = np.zeros(shape=(n_samples,), dtype=baseline_prediction.dtype) + assert_allclose(gradient * sample_weight, gradient_sw) else: - raw_prediction = np.zeros( - shape=(n_samples, loss.n_classes), dtype=baseline_prediction.dtype - ) - raw_prediction += baseline_prediction + assert_allclose(gradient * sample_weight[:, None], gradient_sw) gradient, hessian = loss.gradient_hessian( - y_true=y_true, raw_prediction=raw_prediction, sample_weight=None + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, ) - gradient_sw, hessian_sw = loss.gradient_hessian( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, ) - if not loss.is_multiclass: assert_allclose(gradient * sample_weight, gradient_sw) assert_allclose(hessian * sample_weight, hessian_sw) @@ -468,6 +488,37 @@ def test_sample_weight_multiplies_gradients(loss, sample_weight): assert_allclose(hessian * sample_weight[:, None], hessian_sw) +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +def test_graceful_squeezing(loss): + """Test that Python and Cython functions return same results.""" + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=20, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=42, + ) + + if raw_prediction.ndim == 1: + raw_prediction_2d = raw_prediction[:, None] + assert_allclose( + loss.loss(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.loss(y_true=y_true, raw_prediction=raw_prediction), + ) + assert_allclose( + loss.loss_gradient(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.loss_gradient(y_true=y_true, raw_prediction=raw_prediction), + ) + assert_allclose( + loss.gradient(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.gradient(y_true=y_true, raw_prediction=raw_prediction), + ) + assert_allclose( + loss.gradient_hessian(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.gradient_hessian(y_true=y_true, raw_prediction=raw_prediction), + ) + + @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_loss_of_perfect_prediction(loss, sample_weight): From a5c1d3ca9c95c8b28e98af0616af700fafd51c8c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 17 Aug 2021 13:23:21 +0200 Subject: [PATCH 041/143] CLN no extra methods for HalfSquaredError --- sklearn/_loss/loss.py | 45 ------------------------------------------- 1 file changed, 45 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 4530fd90a5212..35b6e0903eb73 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -459,51 +459,6 @@ def __init__(self, sample_weight=None): else: 
self.constant_hessian = False - def gradient( - self, - y_true, - raw_prediction, - sample_weight=None, - gradient=None, - n_threads=1, - ): - # Be graceful to shape (n_samples, 1) -> (n_samples,) - if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: - raw_prediction = raw_prediction.squeeze(1) - if gradient is not None and gradient.ndim == 2 and gradient.shape[1] == 1: - gradient = gradient.squeeze(1) - - # gradient = raw_prediction - y_true is easier in numpy - gradient = np.subtract(raw_prediction, y_true, out=gradient) - if sample_weight is None: - return gradient - else: - return np.multiply(sample_weight, gradient, out=gradient) - - def gradient_hessian( - self, - y_true, - raw_prediction, - sample_weight=None, - gradient=None, - hessian=None, - n_threads=1, - ): - # easier in numpy - gradient = self.gradient( - y_true, raw_prediction, sample_weight, gradient, hessian - ) - if hessian is None: - hessian = np.empty_like(gradient) - elif hessian.ndim == 2 and hessian.shape[1] == 1: - # Be graceful to shape (n_samples, 1) -> (n_samples,) - hessian = hessian.squeeze(1) - if sample_weight is None: - hessian.fill(1) - else: - np.copyto(hessian, sample_weight) - return gradient, hessian - class AbsoluteError(IdentityLink, BaseLoss, CyAbsoluteError): """Absolute error with identity link, for regression. From 7bee26b9dba71439af7a830d3a24753e586464bb Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 19 Aug 2021 21:30:43 +0200 Subject: [PATCH 042/143] TST remove testing if approx_hessian=True --- sklearn/_loss/tests/test_loss.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index e179c310b2d2c..2bf41bca8adb6 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -609,7 +609,8 @@ def grad_func(x): h_numeric = numerical_derivative(grad_func, raw_prediction, eps=1e-6) if loss.approx_hessian: - assert np.all(h >= h_numeric) + # TODO: What could we test if loss.approx_hessian? + pass else: assert_allclose(h, h_numeric, rtol=5e-6, atol=1e-10) else: @@ -643,7 +644,8 @@ def grad_func(x): h_numeric = numerical_derivative(grad_func, raw_prediction[:, k], eps=1e-6) if loss.approx_hessian: - assert np.all(h >= h_numeric) + # TODO: What could we test if loss.approx_hessian? + pass else: assert_allclose(h[:, k], h_numeric, rtol=5e-6, atol=1e-10) From 15b7c992c81618c557a97c7d22a5c982e87758fd Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 23 Aug 2021 19:36:46 +0200 Subject: [PATCH 043/143] DOC remove loss module for classes.rst --- doc/modules/classes.rst | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index e6fe6ade407d8..3edd8adee8191 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1646,27 +1646,3 @@ Recently deprecated To be removed in 1.0 (renaming of 0.25) --------------------------------------- - -.. _loss_function_ref: - -:mod:`sklearn._loss`: Private Loss Function Classes -=========================================================== - -.. automodule:: sklearn._loss - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -.. 
autosummary:: - :toctree: generated/ - :template: class.rst - - _loss.HalfSquaredError - _loss.AbsoluteError - _loss.PinballLoss - _loss.HalfPoissonLoss - _loss.HalfGammaLoss - _loss.HalfTweedieLoss - _loss.BinaryCrossEntropy - _loss.CategoricalCrossEntropy From 67a30693e81a37e8ba5e0f8353ff1b56ed879bac Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 27 Aug 2021 17:29:04 +0200 Subject: [PATCH 044/143] TST that losses can be pickled --- sklearn/_loss/tests/test_loss.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 2bf41bca8adb6..b867437ecb0f4 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np from numpy.testing import assert_allclose, assert_array_equal import pytest @@ -916,3 +918,21 @@ def test_predict_proba(loss): gradient=None, ), ) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +def test_loss_pickle(loss): + """Test that losses can be pickled.""" + n_samples = 20 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=42, + ) + pickled_loss = pickle.dumps(loss) + unpickled_loss = pickle.loads(pickled_loss) + assert loss(y_true=y_true, raw_prediction=raw_prediction) == approx( + unpickled_loss(y_true=y_true, raw_prediction=raw_prediction) + ) From 696d18fe39a011d2c90dc876f14bade1bd6789f7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 28 Aug 2021 13:37:49 +0200 Subject: [PATCH 045/143] TST add test_loss_on_specific_values --- sklearn/_loss/loss.py | 2 +- sklearn/_loss/tests/test_loss.py | 60 ++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 35b6e0903eb73..209f1352a2662 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -49,7 +49,7 @@ class BaseLoss(BaseLink, CyLossFunction): - y_true.shape = sample_weight.shape = (n_samples,) - y_pred.shape = raw_prediction.shape = (n_samples,) - - If n_classes >= 3 (multiclass classification), then + - If is_multiclass is true (multiclass classification), then y_pred.shape = raw_prediction.shape = (n_samples, n_classes) Note that this corresponds to the return value of decision_function. 
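A small sketch may make these shape conventions concrete, assuming the loss classes added in this series can be imported from `sklearn._loss.loss`; the numbers mirror `test_loss_on_specific_values` below:

import numpy as np
from scipy.special import logsumexp
from sklearn._loss.loss import CategoricalCrossEntropy, HalfSquaredError

# Regression and binary classification: raw_prediction has shape (n_samples,).
sq = HalfSquaredError()
print(sq(y_true=np.array([1.0]), raw_prediction=np.array([5.0])))  # 0.5 * (5 - 1)**2 = 8

# Multiclass: raw_prediction has shape (n_samples, n_classes) and y_true holds
# class indices encoded as floats.
cce = CategoricalCrossEntropy(n_classes=3)
raw = np.array([[0.2, 0.5, 0.3]])
print(cce(y_true=np.array([1.0]), raw_prediction=raw))  # loss for class index 1
print(logsumexp(raw, axis=1) - raw[:, 1])               # same value, computed by hand
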
diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index b867437ecb0f4..19d7dbf484455 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -9,10 +9,12 @@ minimize_scalar, newton, ) +from scipy.special import logsumexp from sklearn._loss.link import _inclusive_low_high, IdentityLink from sklearn._loss.loss import ( _LOSSES, + BaseLoss, AbsoluteError, BinaryCrossEntropy, CategoricalCrossEntropy, @@ -41,13 +43,17 @@ ] -def loss_instance_name(loss): - name = loss.__class__.__name__ - if hasattr(loss, "quantile"): - name += f"(quantile={loss.quantile})" - elif hasattr(loss, "power"): - name += f"(power={loss.power})" - return name +def loss_instance_name(param): + if isinstance(param, BaseLoss): + loss = param + name = loss.__class__.__name__ + if hasattr(loss, "quantile"): + name += f"(quantile={loss.quantile})" + elif hasattr(loss, "power"): + name += f"(power={loss.power})" + return name + else: + return str(param) def random_y_true_raw_prediction( @@ -190,6 +196,46 @@ def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): assert not loss.in_y_pred_range(np.array([y])) +@pytest.mark.parametrize( + "loss, y_true, raw_prediction, loss_true", + [ + (HalfSquaredError(), 1.0, 5.0, 8), + (AbsoluteError(), 1.0, 5.0, 4), + (PinballLoss(quantile=0.5), 1.0, 5.0, 2), + (PinballLoss(quantile=0.25), 1.0, 5.0, 4 * (1 - 0.25)), + (PinballLoss(quantile=0.25), 5.0, 1.0, 4 * 0.25), + (HalfPoissonLoss(), 2.0, np.log(4), 4 - 2 * np.log(4)), + (HalfGammaLoss(), 2.0, np.log(4), np.log(4) + 2 / 4), + (HalfTweedieLoss(power=3), 2.0, np.log(4), -1 / 4 + 1 / 4 ** 2), + (BinaryCrossEntropy(), 0.25, np.log(4), np.log(5) - 0.25 * np.log(4)), + ( + CategoricalCrossEntropy(n_classes=3), + 0.0, + [0.2, 0.5, 0.3], + logsumexp([0.2, 0.5, 0.3]) - 0.2, + ), + ( + CategoricalCrossEntropy(n_classes=3), + 1.0, + [0.2, 0.5, 0.3], + logsumexp([0.2, 0.5, 0.3]) - 0.5, + ), + ( + CategoricalCrossEntropy(n_classes=3), + 2.0, + [0.2, 0.5, 0.3], + logsumexp([0.2, 0.5, 0.3]) - 0.3, + ), + ], + ids=loss_instance_name, +) +def test_loss_on_specific_values(loss, y_true, raw_prediction, loss_true): + """Test losses at specific values.""" + assert loss( + y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction]) + ) == approx(loss_true) + + @pytest.mark.parametrize("loss", ALL_LOSSES) @pytest.mark.parametrize("dtype_in", [np.float32, np.float64]) @pytest.mark.parametrize("dtype_out", [np.float32, np.float64]) From cef6e24cc8a12a1a9605da06d7462cbd8fc8e5c1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 28 Aug 2021 13:45:01 +0200 Subject: [PATCH 046/143] FIX make cython inheritance happy and losses pickable --- sklearn/_loss/loss.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 209f1352a2662..e6bc17609c9f7 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -434,7 +434,11 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return np.zeros_like(y_true) -class HalfSquaredError(IdentityLink, BaseLoss, CyHalfSquaredError): +# Note: Naturally, we would inherit in the following order +# class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss) +# But because of https://github.com/cython/cython/issues/4350 we +# set BaseLoss as the last one. This, of course, changes the MRO. +class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss): """Half squared error with identity link, for regression. 
Domain: @@ -460,7 +464,7 @@ def __init__(self, sample_weight=None): self.constant_hessian = False -class AbsoluteError(IdentityLink, BaseLoss, CyAbsoluteError): +class AbsoluteError(IdentityLink, CyAbsoluteError, BaseLoss): """Absolute error with identity link, for regression. Domain: @@ -497,7 +501,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): return _weighted_percentile(y_true, sample_weight, 50) -class PinballLoss(IdentityLink, BaseLoss, CyPinballLoss): +class PinballLoss(IdentityLink, CyPinballLoss, BaseLoss): """Quantile loss aka pinball loss, for regression. Domain: @@ -552,7 +556,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): return _weighted_percentile(y_true, sample_weight, 100 * self.quantile) -class HalfPoissonLoss(LogLink, BaseLoss, CyHalfPoissonLoss): +class HalfPoissonLoss(LogLink, CyHalfPoissonLoss, BaseLoss): """Poisson deviance loss with log-link, for regression. Domain: @@ -585,7 +589,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class HalfGammaLoss(LogLink, BaseLoss, CyHalfGammaLoss): +class HalfGammaLoss(LogLink, CyHalfGammaLoss, BaseLoss): """Gamma deviance loss with log-link, for regression. Domain: @@ -617,7 +621,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class HalfTweedieLoss(LogLink, BaseLoss, CyHalfTweedieLoss): +class HalfTweedieLoss(LogLink, CyHalfTweedieLoss, BaseLoss): """Tweedie deviance loss with log-link, for regression. Domain: @@ -680,7 +684,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class BinaryCrossEntropy(LogitLink, BaseLoss, CyBinaryCrossEntropy): +class BinaryCrossEntropy(LogitLink, CyBinaryCrossEntropy, BaseLoss): """Binary cross entropy loss with logit link, for binary classification. Domain: @@ -740,7 +744,7 @@ def predict_proba(self, raw_prediction): return proba -class CategoricalCrossEntropy(MultinomialLogit, BaseLoss, CyCategoricalCrossEntropy): +class CategoricalCrossEntropy(MultinomialLogit, CyCategoricalCrossEntropy, BaseLoss): """Categorical cross-entropy loss, for multiclass classification. Domain: From 130306c5bf69708f5785cfd8d338b894c84ceb0a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 29 Aug 2021 16:09:19 +0200 Subject: [PATCH 047/143] ENH support const memoryviews by ReadonlyWrapper --- sklearn/_loss/_loss.pyx.tp | 32 ++++++++++++++++++++ sklearn/_loss/loss.py | 32 ++++++++++++++++++++ sklearn/_loss/tests/test_loss.py | 51 ++++++++++++++++++++++++-------- 3 files changed, 102 insertions(+), 13 deletions(-) diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 573b1bba8d47b..c32303c12b0c5 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -205,6 +205,38 @@ from libc.stdlib cimport malloc, free np.import_array() +# ------------------------------------- +# Readonly array wrapper +# ------------------------------------- +# TODO: Remove with Cython >= 3.0 which supports const memoryviews for fused types. +# +# This class supports the buffer protocol, thus can wrap arrays and memoryvies. +# All it does is LIE about the readonly attribute: tell it's false! +# This way, we can use it on arrays that we don't touch. +# !!! USE CAREFULLY !!! 
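The situation this wrapper works around can be reproduced at the Python level; in Cython, a typed memoryview that requests a writable buffer from such an array fails with "ValueError: buffer source array is read-only" instead, which is what the tests added later in this series assert:

import numpy as np

x = np.arange(5, dtype=np.float64)
x.flags["WRITEABLE"] = False  # e.g. what a file memory-mapped read-only looks like

m = memoryview(x)
print(m.readonly)  # True
try:
    m[0] = 1.0     # a write through the buffer is refused
except TypeError as exc:
    print(exc)     # cannot modify read-only memory
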
+ +from cpython cimport Py_buffer +from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, PyBUF_WRITABLE + +cdef class ReadonlyWrapper: + cdef object wraps + + def __init__(self, wraps): + self.wraps = wraps + + def __getbuffer__(self, Py_buffer *buffer, int flags): + request_for_writeable = False + if flags & PyBUF_WRITABLE: + flags ^= PyBUF_WRITABLE + request_for_writeable = True + PyObject_GetBuffer(self.wraps, buffer, flags) + if request_for_writeable: + buffer.readonly = False # This is a lie! + + def __releasebuffer__(self, Py_buffer *buffer): + PyBuffer_Release(buffer) + + # ------------------------------------- # Helper functions # ------------------------------------- diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index e6bc17609c9f7..4031be741dc65 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -27,6 +27,7 @@ CyHalfTweedieLoss, CyBinaryCrossEntropy, CyCategoricalCrossEntropy, + ReadonlyWrapper, # TODO: Remove with Cython >= 3.0 ) from .link import ( Interval, @@ -170,6 +171,13 @@ def loss( # Be graceful to shape (n_samples, 1) -> (n_samples,) if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) + + if not y_true.flags["WRITEABLE"]: + y_true = ReadonlyWrapper(y_true) + if not raw_prediction.flags["WRITEABLE"]: + raw_prediction = ReadonlyWrapper(raw_prediction) + if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: + sample_weight = ReadonlyWrapper(sample_weight) return self._loss( y_true=y_true, raw_prediction=raw_prediction, @@ -231,6 +239,12 @@ def loss_gradient( if gradient.ndim == 2 and gradient.shape[1] == 1: gradient = gradient.squeeze(1) + if not y_true.flags["WRITEABLE"]: + y_true = ReadonlyWrapper(y_true) + if not raw_prediction.flags["WRITEABLE"]: + raw_prediction = ReadonlyWrapper(raw_prediction) + if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: + sample_weight = ReadonlyWrapper(sample_weight) return self._loss_gradient( y_true=y_true, raw_prediction=raw_prediction, @@ -280,6 +294,12 @@ def gradient( if gradient.ndim == 2 and gradient.shape[1] == 1: gradient = gradient.squeeze(1) + if not y_true.flags["WRITEABLE"]: + y_true = ReadonlyWrapper(y_true) + if not raw_prediction.flags["WRITEABLE"]: + raw_prediction = ReadonlyWrapper(raw_prediction) + if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: + sample_weight = ReadonlyWrapper(sample_weight) return self._gradient( y_true=y_true, raw_prediction=raw_prediction, @@ -344,6 +364,12 @@ def gradient_hessian( if hessian.ndim == 2 and hessian.shape[1] == 1: hessian = hessian.squeeze(1) + if not y_true.flags["WRITEABLE"]: + y_true = ReadonlyWrapper(y_true) + if not raw_prediction.flags["WRITEABLE"]: + raw_prediction = ReadonlyWrapper(raw_prediction) + if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: + sample_weight = ReadonlyWrapper(sample_weight) return self._gradient_hessian( y_true=y_true, raw_prediction=raw_prediction, @@ -868,6 +894,12 @@ def gradient_proba( elif proba is None: proba = np.empty_like(gradient) + if not y_true.flags["WRITEABLE"]: + y_true = ReadonlyWrapper(y_true) + if not raw_prediction.flags["WRITEABLE"]: + raw_prediction = ReadonlyWrapper(raw_prediction) + if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: + sample_weight = ReadonlyWrapper(sample_weight) return self._gradient_proba( y_true=y_true, raw_prediction=raw_prediction, diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 
19d7dbf484455..f1d3ca76ef094 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -25,7 +25,7 @@ PinballLoss, ) from sklearn.utils import assert_all_finite -from sklearn.utils._testing import skip_if_32bit +from sklearn.utils._testing import create_memmap_backed_data, skip_if_32bit from sklearn.utils.fixes import sp_version, parse_version @@ -237,37 +237,48 @@ def test_loss_on_specific_values(loss, y_true, raw_prediction, loss_true): @pytest.mark.parametrize("loss", ALL_LOSSES) +@pytest.mark.parametrize("readonly_memmap", [False, True]) @pytest.mark.parametrize("dtype_in", [np.float32, np.float64]) @pytest.mark.parametrize("dtype_out", [np.float32, np.float64]) @pytest.mark.parametrize("sample_weight", [None, 1]) @pytest.mark.parametrize("out1", [None, 1]) @pytest.mark.parametrize("out2", [None, 1]) @pytest.mark.parametrize("n_threads", [1, 2]) -def test_loss_dtype(loss, dtype_in, dtype_out, sample_weight, out1, out2, n_threads): - """Test acceptance of dtypes in loss functions. +def test_loss_dtype_readonly( + loss, readonly_memmap, dtype_in, dtype_out, sample_weight, out1, out2, n_threads +): + """Test acceptance of dtypes and readonly arrays in loss functions. Check that loss accepts if all input arrays are either all float32 or all float64, and all output arrays are either all float32 or all float64. + + Also check that input arrays can be readonly, e.g. memory mapped. """ loss = loss() # generate a y_true and raw_prediction in valid range - if loss.is_multiclass: - y_true = np.array([0], dtype=dtype_in) - raw_prediction = np.full( - shape=(1, loss.n_classes), fill_value=0.0, dtype=dtype_in - ) - else: - low, high = _inclusive_low_high(loss.interval_y_true, dtype=dtype_in) - y_true = np.array([0.5 * (high - low)], dtype=dtype_in) - raw_prediction = np.array([0.0], dtype=dtype_in) + n_samples = 5 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=42, + ) + y_true = y_true.astype(dtype_in) + raw_prediction = raw_prediction.astype(dtype_in) if sample_weight is not None: - sample_weight = np.array([2.0], dtype=dtype_in) + sample_weight = np.array([2.0] * n_samples, dtype=dtype_in) if out1 is not None: out1 = np.empty_like(y_true, dtype=dtype_out) if out2 is not None: out2 = np.empty_like(raw_prediction, dtype=dtype_out) + if readonly_memmap: + y_true, raw_prediction = create_memmap_backed_data([y_true, raw_prediction]) + if sample_weight is not None: + sample_weight = create_memmap_backed_data(sample_weight) + loss.loss( y_true=y_true, raw_prediction=raw_prediction, @@ -300,6 +311,20 @@ def test_loss_dtype(loss, dtype_in, dtype_out, sample_weight, out1, out2, n_thre hessian=out2, n_threads=n_threads, ) + loss(y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight) + loss.fit_intercept_only(y_true=y_true, sample_weight=sample_weight) + loss.constant_to_optimal_zero(y_true=y_true, sample_weight=sample_weight) + if hasattr(loss, "predict_proba"): + loss.predict_proba(raw_prediction=raw_prediction) + if hasattr(loss, "gradient_proba"): + loss.gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out1, + proba=out2, + n_threads=n_threads, + ) @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) From 27f3dea4380a27a3ecef61edc52655c59441b2c9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 30 Aug 2021 22:19:44 +0200 Subject: [PATCH 048/143] address review comments 
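The read-only inputs exercised by the tests above can be produced in two ways, both used in this series; a standalone sketch (the array here is only an example):

import numpy as np
from sklearn.utils._testing import create_memmap_backed_data

x = np.arange(10, dtype=np.float64)

# Variant 1: same data, only the WRITEABLE flag is flipped.
x_flag = x.copy()
x_flag.flags["WRITEABLE"] = False

# Variant 2: data written to a temporary file and memory-mapped read-only.
x_memmap = create_memmap_backed_data(x)

print(x_flag.flags["WRITEABLE"], x_memmap.flags["WRITEABLE"])  # False False
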
--- sklearn/_loss/_loss.pxd | 4 ++-- sklearn/_loss/link.py | 4 +++- sklearn/_loss/tests/test_link.py | 4 +++- sklearn/_loss/tests/test_loss.py | 10 +++++----- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/sklearn/_loss/_loss.pxd b/sklearn/_loss/_loss.pxd index b00379a1e793d..e2e44ca712b35 100644 --- a/sklearn/_loss/_loss.pxd +++ b/sklearn/_loss/_loss.pxd @@ -44,7 +44,7 @@ cdef class CyAbsoluteError(CyLossFunction): cdef class CyPinballLoss(CyLossFunction): - cdef readonly double quantile # readonly makes it inherited by children + cdef readonly double quantile # readonly makes it accessible from Python cdef double cy_loss(self, double y_true, double raw_prediction) nogil cdef double cy_gradient(self, double y_true, double raw_prediction) nogil cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil @@ -63,7 +63,7 @@ cdef class CyHalfGammaLoss(CyLossFunction): cdef class CyHalfTweedieLoss(CyLossFunction): - cdef readonly double power # readonly makes it inherited by children + cdef readonly double power # readonly makes it accessible from Python cdef double cy_loss(self, double y_true, double raw_prediction) nogil cdef double cy_gradient(self, double y_true, double raw_prediction) nogil cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index b756e275c6d0e..7dd40876a5683 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -22,7 +22,9 @@ class Interval: def __post_init__(self): """Check that low <= high""" if self.low > self.high: - raise ValueError("On must have low <= high; got low={low}, high={high}.") + raise ValueError( + f"On must have low <= high; got low={self.low}, high={self.high}." + ) def includes(self, x): """Test whether all values of x are in interval range. diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index 3239ade25f3c7..4c0fc44060cbb 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -15,7 +15,9 @@ def test_interval_raises(): """Test that interval with low > high raises ValueError.""" - with pytest.raises(ValueError, match="On must have low <= high"): + with pytest.raises( + ValueError, match="On must have low <= high; got low=1, high=0." 
+ ): Interval(1, 0, False, False) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index f1d3ca76ef094..98416099a28b3 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -233,7 +233,7 @@ def test_loss_on_specific_values(loss, y_true, raw_prediction, loss_true): """Test losses at specific values.""" assert loss( y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction]) - ) == approx(loss_true) + ) == approx(loss_true, rel=1e-11, abs=1e-12) @pytest.mark.parametrize("loss", ALL_LOSSES) @@ -487,7 +487,7 @@ def test_loss_gradients_are_the_same(loss, sample_weight): assert_allclose(g1, out_g4) assert_allclose(g1, g4) assert_allclose(proba, out_proba) - assert_allclose(np.sum(proba, axis=1), 1) + assert_allclose(np.sum(proba, axis=1), 1, rtol=1e-11) @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @@ -563,7 +563,7 @@ def test_sample_weight_multiplies(loss, sample_weight): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) def test_graceful_squeezing(loss): - """Test that Python and Cython functions return same results.""" + """Test that reshaped raw_prediction gives same results.""" y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, n_samples=20, @@ -962,7 +962,7 @@ def test_predict_proba(loss): if hasattr(loss, "predict_proba"): proba = loss.predict_proba(raw_prediction) assert proba.shape == (n_samples, loss.n_classes) - assert np.sum(proba, axis=1) == approx(1) + assert np.sum(proba, axis=1) == approx(1, rel=1e-11) if hasattr(loss, "gradient_proba"): for grad, proba in ( @@ -979,7 +979,7 @@ def test_predict_proba(loss): proba=proba, ) assert proba.shape == (n_samples, loss.n_classes) - assert np.sum(proba, axis=1) == approx(1) + assert np.sum(proba, axis=1) == approx(1, rel=1e-11) assert_allclose( grad, loss.gradient( From 86d659cea3c6d11bcd9b423c8774f6d5f62b592f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 31 Aug 2021 19:44:49 +0200 Subject: [PATCH 049/143] FEA add ReadonlyWrapper --- sklearn/utils/_readonly_array_wrapper.pyx | 64 ++++++++++++++++++++ sklearn/utils/setup.py | 6 ++ sklearn/utils/tests/test_readonly_wrapper.py | 27 +++++++++ 3 files changed, 97 insertions(+) create mode 100644 sklearn/utils/_readonly_array_wrapper.pyx create mode 100644 sklearn/utils/tests/test_readonly_wrapper.py diff --git a/sklearn/utils/_readonly_array_wrapper.pyx b/sklearn/utils/_readonly_array_wrapper.pyx new file mode 100644 index 0000000000000..df888396176db --- /dev/null +++ b/sklearn/utils/_readonly_array_wrapper.pyx @@ -0,0 +1,64 @@ +# cython: language_level=3 + +# ------------------------------------- +# Readonly array wrapper +# ------------------------------------- +# TODO: Remove with Cython >= 3.0 which supports const memoryviews for fused types. +# +# This class supports the buffer protocol, thus can wrap arrays and memoryviews. +# All it does is LIE about the readonly attribute: tell it's false! +# This way, we can use it on arrays that we don't touch. +# !!! USE CAREFULLY !!! 
+ + +from cpython cimport Py_buffer +from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, PyBUF_WRITABLE + +import numpy as np +cimport numpy as np + + +np.import_array() + + +ctypedef fused NUM_TYPES: + np.npy_float64 + np.npy_float32 + np.npy_int64 + np.npy_int32 + + +cdef class ReadonlyWrapper: + cdef object wraps + + def __init__(self, wraps): + self.wraps = wraps + + def __getbuffer__(self, Py_buffer *buffer, int flags): + request_for_writeable = False + if flags & PyBUF_WRITABLE: + flags ^= PyBUF_WRITABLE + request_for_writeable = True + PyObject_GetBuffer(self.wraps, buffer, flags) + if request_for_writeable: + buffer.readonly = False # This is a lie! + + def __releasebuffer__(self, Py_buffer *buffer): + PyBuffer_Release(buffer) + + +def _test_sum(NUM_TYPES[:] x): + """This function is for testing only. + + As this function does not modify x, we would like to define it as + _test_sum(const NUM_TYPES[:] x) + which is not supported for fused types in Cython<3.0. + """ + cdef: + int i + int n = x.shape[0] + NUM_TYPES sum = 0 + + for i in range(n): + sum += x[i] + return sum diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index b06da9777be09..c75cbe2d86495 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -82,6 +82,12 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) + config.add_extension( + "_readonly_array_wrapper", + sources=["_readonly_array_wrapper.pyx"], + libraries=libraries, + ) + config.add_subpackage("tests") return config diff --git a/sklearn/utils/tests/test_readonly_wrapper.py b/sklearn/utils/tests/test_readonly_wrapper.py new file mode 100644 index 0000000000000..4ef11a0007967 --- /dev/null +++ b/sklearn/utils/tests/test_readonly_wrapper.py @@ -0,0 +1,27 @@ +import numpy as np + +import pytest + +from sklearn.utils._readonly_array_wrapper import ReadonlyWrapper, _test_sum +from sklearn.utils._testing import create_memmap_backed_data + + +@pytest.mark.parametrize("readonly", ["flag", "memmap"]) +@pytest.mark.parametrize("dtype", [np.float32, np.float64, np.int32, np.int64]) +def test_readonly_array_wrapper(readonly, dtype): + """Test that ReadonlyWrapper works as expected.""" + x = np.arange(10).astype(dtype) + sum_origin = _test_sum(x) + + if readonly == "flag": + x_readonly = x.copy() + x_readonly.flags["WRITEABLE"] = False + else: + x_readonly = create_memmap_backed_data(x) + + with pytest.raises(ValueError, match="buffer source array is read-only"): + _test_sum(x_readonly) + + x_readonly = ReadonlyWrapper(x_readonly) + sum_readonly = _test_sum(x_readonly) + assert sum_readonly == pytest.approx(sum_origin, rel=1e-11) From 7d30abbecb1849ba6d2528323fa38edb5bf5bde4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 2 Jan 2021 11:57:32 +0100 Subject: [PATCH 050/143] ENH add common link function submodule --- sklearn/_loss/link.py | 246 +++++++++++++++++++++++++++++++ sklearn/_loss/tests/test_link.py | 107 ++++++++++++++ 2 files changed, 353 insertions(+) create mode 100644 sklearn/_loss/link.py create mode 100644 sklearn/_loss/tests/test_link.py diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py new file mode 100644 index 0000000000000..f5567e6dd7b49 --- /dev/null +++ b/sklearn/_loss/link.py @@ -0,0 +1,246 @@ +""" +Module contains classes for invertible (and differentiable) link functions. 
+""" +# Author: Christian Lorentzen + +from abc import ABC, abstractmethod +from collections import namedtuple + +import numpy as np +from scipy.special import expit, logit +from scipy.stats import gmean +from ..utils.extmath import softmax + + +Interval = namedtuple( + "Interval", ("low", "high", "low_inclusive", "high_inclusive") +) + + +def is_in_interval_range(x, interval): + """Test whether values of x are in interval range from Interval. + + Parameters + ---------- + x : ndarray + Array whose elements are tested to be in interval range. + interval: Interval + An Interval range. + """ + if interval.low_inclusive: + low = np.greater_equal(x, interval.low) + else: + low = np.greater(x, interval.low) + + if not np.all(low): + return False + + if interval.high_inclusive: + high = np.less_equal(x, interval.high) + else: + high = np.less(x, interval.high) + + # Note: np.all returns numpy.bool_ + if np.all(high): + return True + else: + return False + + +def _inclusive_low_high(interval, dtype=float): + """Generate values low and high to be within the interval range.""" + eps = 10 * np.finfo(dtype).eps + if interval.low == -np.inf: + low = -1e10 + elif interval.low < 0: + low = interval.low * (1 - eps) + eps + else: + low = interval.low * (1 + eps) + eps + + if interval.high == np.inf: + high = 1e10 + elif interval.high < 0: + high = interval.high * (1 + eps) - eps + else: + high = interval.high * (1 - eps) - eps + + return low, high + + +class BaseLink(ABC): + """Abstract base class for differentiable, invertible link functions. + + Convention: + - link function g: raw_prediction = g(y_pred) + - inverse link h: y_pred = h(raw_prediction) + + For (generalized) linear models, `raw_prediction = X @ coef` is the so + called linear predictor, and `y_pred = h(raw_prediction)` is the predicted + conditional (on X) expected value of the target `y_true`. + + In case a link function needs parameters, the methods are not implemented + as staticmethods. + """ + + multiclass = False + + # Usually, raw_prediction may be any real number and y_pred is an open + # interval. + interval_raw_prediction = Interval(-np.inf, np.inf, False, False) + interval_y_pred = Interval(-np.inf, np.inf, False, False) + + @abstractmethod + def link(self, y_pred, out=None): + """Compute the link function g(y_pred). + + The link function maps (predicted) target values to raw predictions, + i.e. `g(y_pred) = raw_prediction`. + + Parameters + ---------- + y_pred : array + Predicted target values. + out : array + A location into which the result is stored. If provided, it must + have a shape that the inputs broadcast to. If not provided or None, + a freshly-allocated array is returned. + + Returns + ------- + out : array + Output array, element-wise link function. + """ + + @abstractmethod + def inverse(self, raw_prediction, out=None): + """Compute the inverse link function h(raw_prediction). + + The inverse link function maps raw predictions to predicted target + values, i.e. `h(raw_prediction) = y_pred`. + + Parameters + ---------- + raw_prediction : array + Raw prediction values (in link space). + out : array + A location into which the result is stored. If provided, it must + have a shape that the inputs broadcast to. If not provided or None, + a freshly-allocated array is returned. + + Returns + ------- + out : array + Output array, element-wise inverse link function. 
+ """ + + +class IdentityLink(BaseLink): + """The identity link function g(x)=x.""" + + def link(self, y_pred, out=None): + if out is not None: + np.copyto(out, y_pred) + return out + else: + return y_pred + + inverse = link + + +class LogLink(BaseLink): + """The log link function g(x)=log(x).""" + + interval_y_pred = Interval(0, np.inf, False, False) + + def link(self, y_pred, out=None): + return np.log(y_pred, out=out) + + def inverse(self, raw_prediction, out=None): + return np.exp(raw_prediction, out=out) + + +class LogitLink(BaseLink): + """The logit link function g(x)=logit(x).""" + + interval_y_pred = Interval(0, 1, False, False) + + def link(self, y_pred, out=None): + return logit(y_pred, out=out) + + def inverse(self, raw_prediction, out=None): + return expit(raw_prediction, out=out) + + +class MultinomialLogit(BaseLink): + """The symmetric multinomial logit function. + + Convention: + - y_pred.shape = raw_prediction.shape = (n_samples, n_classes) + + Notes: + - The inverse link h is the softmax function. + - The sum is over the second axis, i.e. axis=1 (n_classes). + + We have to choose additional contraints in order to make + + y_pred_k = exp(raw_pred_k) / sum(exp(raw_pred_k), k=0..n_classes-1) + + for n_classes classes identifiable and invertible. + We choose the symmetric side contraint where the geometric mean response + is set as reference category, see [2]: + + The symmetric multinomial logit link function for a single data point is + then defined as + + raw_prediction[k] = g(y_pred[k]) = log(y_pred[k]/gmean(y_pred)) + = log(y_pred[k]) - mean(log(y_pred)). + + Note that this is equivalent to the definition in [1] and implies mean + centered raw predictions: + + sum(raw_prediction[k], k=0..n_classes-1) = 0. + + For linear models with raw_prediction = X @ coef, this corresponds to + sum(coef[k], k=0..n_classes-1) = 0, i.e. the sum over classes for every + feature is zero. + + Reference + --------- + .. [1] Friedman, Jerome; Hastie, Trevor; Tibshirani, Robert. "Additive + logistic regression: a statistical view of boosting" Ann. Statist. + 28 (2000), no. 2, 337--407. doi:10.1214/aos/1016218223. + https://projecteuclid.org/euclid.aos/1016218223 + + .. [2] Zahid, Faisal Maqbool and Gerhard Tutz. "Ridge estimation for + multinomial logit models with symmetric side constraints." + Computational Statistics 28 (2013): 1017-1034. 
+ http://epub.ub.uni-muenchen.de/11001/1/tr067.pdf + """ + + multiclass = True + interval_y_pred = Interval(0, 1, False, False) + + def symmetrize_raw_prediction(self, raw_prediction): + return raw_prediction - np.mean(raw_prediction, axis=1)[:, np.newaxis] + + def link(self, y_pred, out=None): + # geometric mean as reference category + gm = gmean(y_pred, axis=1) + out = np.log(y_pred / gm[:, np.newaxis], out=out) + return out + + def inverse(self, raw_prediction, out=None): + if out is None: + return softmax(raw_prediction, copy=True) + else: + np.copyto(out, raw_prediction) + softmax(out, copy=False) + return out + + +_LINKS = { + "identity": IdentityLink, + "log": LogLink, + "logit": LogitLink, + "multinomial_logit": MultinomialLogit, +} diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py new file mode 100644 index 0000000000000..a8dbbff511373 --- /dev/null +++ b/sklearn/_loss/tests/test_link.py @@ -0,0 +1,107 @@ +import numpy as np +from numpy.testing import assert_allclose, assert_array_equal +import pytest + +from sklearn._loss.link import ( + _LINKS, + _inclusive_low_high, + MultinomialLogit, + Interval, + is_in_interval_range, +) + + +LINK_FUNCTIONS = list(_LINKS.values()) + + +@pytest.mark.parametrize( + "interval", + [ + Interval(0, 1, False, False), + Interval(0, 1, False, True), + Interval(0, 1, True, False), + Interval(0, 1, True, True), + Interval(-np.inf, np.inf, False, False), + Interval(-np.inf, np.inf, False, True), + Interval(-np.inf, np.inf, True, False), + Interval(-np.inf, np.inf, True, True), + ], +) +def test_is_in_range(interval): + # make sure low and high are always within the interval, used for linspace + low, high = _inclusive_low_high(interval) + + x = np.linspace(low, high, num=10) + assert is_in_interval_range(x, interval) + + # x contains lower bound + assert ( + is_in_interval_range(np.r_[x, interval.low], interval) + == interval.low_inclusive + ) + + # x contains upper bound + assert ( + is_in_interval_range(np.r_[x, interval.high], interval) + == interval.high_inclusive + ) + + # x contains upper and lower bound + assert is_in_interval_range( + np.r_[x, interval.low, interval.high], interval + ) == (interval.low_inclusive and interval.high_inclusive) + + +@pytest.mark.parametrize("link", LINK_FUNCTIONS) +def test_link_inverse_identity(link): + # Test that link of inverse gives idendity. + rng = np.random.RandomState(42) + link = link() + n_samples, n_classes = 100, None + if link.multiclass: + n_classes = 10 + raw_prediction = rng.normal( + loc=0, scale=10, size=(n_samples, n_classes) + ) + if isinstance(link, MultinomialLogit): + raw_prediction = link.symmetrize_raw_prediction(raw_prediction) + else: + # So far, the valid interval of raw_prediction is (-inf, inf) and + # we do not need to distinguish + raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples)) + + assert_allclose(link.link(link.inverse(raw_prediction)), raw_prediction) + y_pred = link.inverse(raw_prediction) + assert_allclose(link.inverse(link.link(y_pred)), y_pred) + + +@pytest.mark.parametrize("link", LINK_FUNCTIONS) +def test_link_out_argument(link): + # Test that out argument gets assigned the result. 
+ rng = np.random.RandomState(42) + link = link() + n_samples, n_classes = 100, None + if link.multiclass: + n_classes = 10 + raw_prediction = rng.normal( + loc=0, scale=10, size=(n_samples, n_classes) + ) + if isinstance(link, MultinomialLogit): + raw_prediction = link.symmetrize_raw_prediction(raw_prediction) + else: + # So far, the valid interval of raw_prediction is (-inf, inf) and + # we do not need to distinguish + raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples)) + + y_pred = link.inverse(raw_prediction, out=None) + out = np.empty_like(raw_prediction) + y_pred_2 = link.inverse(raw_prediction, out=out) + assert_allclose(y_pred, out) + assert_array_equal(out, y_pred_2) + assert np.shares_memory(out, y_pred_2) + + out = np.empty_like(y_pred) + raw_prediction_2 = link.link(y_pred, out=out) + assert_allclose(raw_prediction, out) + assert_array_equal(out, raw_prediction_2) + assert np.shares_memory(out, raw_prediction_2) From 53a4774a780911f616e453600e443f08acc32d20 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 2 Jan 2021 12:58:58 +0100 Subject: [PATCH 051/143] ENH add common loss function submodule --- sklearn/_loss/_loss.pxd | 75 ++ sklearn/_loss/_loss.pyx | 1780 ++++++++++++++++++++++++++++++ sklearn/_loss/loss.py | 910 +++++++++++++++ sklearn/_loss/setup.py | 20 + sklearn/_loss/tests/test_loss.py | 814 ++++++++++++++ 5 files changed, 3599 insertions(+) create mode 100644 sklearn/_loss/_loss.pxd create mode 100644 sklearn/_loss/_loss.pyx create mode 100644 sklearn/_loss/loss.py create mode 100644 sklearn/_loss/setup.py create mode 100644 sklearn/_loss/tests/test_loss.py diff --git a/sklearn/_loss/_loss.pxd b/sklearn/_loss/_loss.pxd new file mode 100644 index 0000000000000..1528ab28741fd --- /dev/null +++ b/sklearn/_loss/_loss.pxd @@ -0,0 +1,75 @@ +# cython: language_level=3 + +import numpy as np +cimport numpy as np + +np.import_array() + + +# Fused types for y_true, y_pred, raw_prediction +ctypedef fused Y_DTYPE_C: + np.npy_float64 + np.npy_float32 + + +# Fused types for gradient and hessian +ctypedef fused G_DTYPE_C: + np.npy_float64 + np.npy_float32 + + +# Struct to return 2 doubles +ctypedef struct double2: + double val1 + double val2 + + +# C base class for loss functions +cdef class cLossFunction: + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + + +cdef class cHalfSquaredError(cLossFunction): + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + + +cdef class cAbsoluteError(cLossFunction): + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + + +cdef class cPinballLoss(cLossFunction): + cdef readonly double quantile # readonly makes it inherited by children + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + + +cdef class cHalfPoissonLoss(cLossFunction): + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double 
raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + + +cdef class cHalfGammaLoss(cLossFunction): + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + + +cdef class cHalfTweedieLoss(cLossFunction): + cdef readonly double power # readonly makes it inherited by children + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + + +cdef class cBinaryCrossEntropy(cLossFunction): + cdef double closs(self, double y_true, double raw_prediction) nogil + cdef double cgradient(self, double y_true, double raw_prediction) nogil + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil diff --git a/sklearn/_loss/_loss.pyx b/sklearn/_loss/_loss.pyx new file mode 100644 index 0000000000000..f94c4118119f9 --- /dev/null +++ b/sklearn/_loss/_loss.pyx @@ -0,0 +1,1780 @@ +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: language_level=3 + +# Design: +# See https://github.com/scikit-learn/scikit-learn/issues/15123 for reasons. +# a) Merge link functions into loss functions for speed and numerical +# stability, i.e. use raw_prediction instead of y_pred in signature. +# b) Pure C functions (nogil) calculate single points (single sample) +# c) Wrap C functions in a loop to get Python functions operating on ndarrays. +# - Write loops manually. +# Reason: There is still some performance overhead when using a wrapper +# function "wrap" that carries out the loop and gets as argument a function +# pointer to one of the C functions from b), e.g. +# wrap(closs_half_poisson, y_true, ...) +# - Pass n_threads as argument to prange and propagate option to all callers. +# d) Provide classes (Cython extension types) per loss in order to have +# semantical structured objects. +# - Member function for single points just call the C function from b). +# These are used e.g. in SGD `_plain_sgd`. +# - Member functions operating on ndarrays looping, see c), over calls to C +# functions from b). +# e) Provide convenience Python classes that inherit from these extension types +# elsewhere (see loss.py) +# - Example: loss.gradient calls extension_type._gradient but does some +# input checking like None -> np.empty(). +# +# Note: We require 1-dim ndarrays to be contiguous. 
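A plain-Python sketch of this layering, with illustrative names only (the real per-sample rules are the inline C functions below, and the allocation logic lives in loss.py):

import numpy as np

# b) per-sample rule, here half squared error as in closs_half_squared_error below
def _loss_single(y_true_i, raw_i):
    return 0.5 * (raw_i - y_true_i) ** 2

# c) loop over samples; the Cython version runs this in a prange loop without the GIL
def _loss_loop(y_true, raw_prediction, sample_weight, loss_out):
    for i in range(y_true.shape[0]):
        loss_out[i] = _loss_single(y_true[i], raw_prediction[i])
        if sample_weight is not None:
            loss_out[i] *= sample_weight[i]
    return loss_out

# e) Python convenience layer: input checking such as None -> np.empty()
def loss(y_true, raw_prediction, sample_weight=None, loss_out=None):
    if loss_out is None:
        loss_out = np.empty_like(y_true)
    return _loss_loop(y_true, raw_prediction, sample_weight, loss_out)

print(loss(np.array([1.0, 2.0]), np.array([3.0, 2.0])))  # [2. 0.]
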
+# TODO: Use const memoryviews with Cython 3.0 where appropriate (# IN) + +cimport cython +from cython.parallel import parallel, prange +import numpy as np +cimport numpy as np + +from libc.math cimport exp, fabs, log, log1p +from libc.stdlib cimport malloc, free + +np.import_array() + + +# ------------------------------------- +# Helper functions +# ------------------------------------- +# Numerically stable version of log(1 + exp(x)) for double precision +# See https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf +cdef inline double log1pexp(double x) nogil: + if x <= -37: + return exp(x) + elif x <= 18: + return log1p(exp(x)) + elif x <= 33.3: + return x + exp(-x) + else: + return x + + +cdef inline void sum_exp_minus_max( + const int i, Y_DTYPE_C[:, :] raw_prediction, # IN + Y_DTYPE_C *p # OUT +) nogil: + # Store p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1 + # p[-2] = max(raw_prediction_i_k, k = 0 to n_classes-1) + # p[-1] = sum(p[k], k = 0 to n_classes-1) = sum of exponentials + # len(p) must be n_classes + 2 + # Notes: + # - Using "by reference" arguments doesn't work well, therefore we use a + # longer p, see https://github.com/cython/cython/issues/1863 + # - i needs to be passed (and stays constant) because otherwise Cython does + # not generate optimal code, see + # https://github.com/scikit-learn/scikit-learn/issues/17299 + # - We do not calculate p[k] = p[k] / sum_exps to save one loop over k. + cdef: + int k + int n_classes = raw_prediction.shape[1] + double max_value = raw_prediction[i, 0] + double sum_exps = 0 + for k in range(1, n_classes): + # Compute max value of array for numerical stability + if max_value < raw_prediction[i, k]: + max_value = raw_prediction[i, k] + + for k in range(n_classes): + p[k] = exp(raw_prediction[i, k] - max_value) + sum_exps += p[k] + + p[n_classes] = max_value # same as p[-2] + p[n_classes + 1] = sum_exps # same as p[-1] + + +# ------------------------------------- +# Single point inline C functions +# ------------------------------------- +# Half Squared Error +cdef inline double closs_half_squared_error(double y_true, double raw_prediction) nogil: + return 0.5 * (raw_prediction - y_true) * (raw_prediction - y_true) + + +cdef inline double cgradient_half_squared_error( + double y_true, double raw_prediction +) nogil: + return raw_prediction - y_true + + +cdef inline double2 cgrad_hess_half_squared_error( + double y_true, double raw_prediction +) nogil: + cdef double2 gh + gh.val1 = raw_prediction - y_true # gradient + gh.val2 = 1. # hessian + return gh + + +# Absolute Error +cdef inline double closs_absolute_error(double y_true, double raw_prediction) nogil: + return fabs(raw_prediction - y_true) + + +cdef inline double cgradient_absolute_error(double y_true, double raw_prediction) nogil: + return 1. if raw_prediction > y_true else -1. + + +cdef inline double2 cgrad_hess_absolute_error( + double y_true, double raw_prediction +) nogil: + cdef double2 gh + # Note that exact hessian = 0 almost everywhere. Optimization routines like + # in HGBT, however, need a hessian > 0. Therefore, we assign 1. + gh.val1 = 1. if raw_prediction > y_true else -1. # gradient + gh.val2 = 1. # hessian + return gh + + +# Quantile Loss / Pinball Loss +cdef inline double closs_pinball_loss( + double y_true, double raw_prediction, double quantile +) nogil: + return (quantile * (y_true - raw_prediction) if y_true >= raw_prediction + else (1. 
- quantile) * (raw_prediction - y_true)) + + +cdef inline double cgradient_pinball_loss( + double y_true, double raw_prediction, double quantile +) nogil: + return -quantile if y_true >=raw_prediction else 1. - quantile + + +cdef inline double2 cgrad_hess_pinball_loss( + double y_true, double raw_prediction, double quantile +) nogil: + cdef double2 gh + # Note that exact hessian = 0 almost everywhere. Optimization routines like + # in HGBT, however, need a hessian > 0. Therefore, we assign 1. + gh.val1 = -quantile if y_true >=raw_prediction else 1. - quantile # gradient + gh.val2 = 1. # hessian + return gh + + +# Half Poisson Deviance with Log-Link, dropping constant terms +cdef inline double closs_half_poisson(double y_true, double raw_prediction) nogil: + return exp(raw_prediction) - y_true * raw_prediction + + +cdef inline double cgradient_half_poisson(double y_true, double raw_prediction) nogil: + # y_pred - y_true + return exp(raw_prediction) - y_true + + +cdef inline double2 closs_grad_half_poisson(double y_true, double raw_prediction) nogil: + cdef double2 lg + lg.val2 = exp(raw_prediction) + lg.val1 = lg.val2 - y_true * raw_prediction # loss + lg.val2 -= y_true # gradient + return lg + + +cdef inline double2 cgrad_hess_half_poisson(double y_true, double raw_prediction) nogil: + cdef double2 gh + gh.val2 = exp(raw_prediction) # hessian + gh.val1 = gh.val2 - y_true # gradient + return gh + + +# Half Gamma Deviance with Log-Link, dropping constant terms +cdef inline double closs_half_gamma(double y_true, double raw_prediction) nogil: + return raw_prediction + y_true * exp(-raw_prediction) + + +cdef inline double cgradient_half_gamma(double y_true, double raw_prediction) nogil: + return 1. - y_true * exp(-raw_prediction) + + +cdef inline double2 closs_grad_half_gamma(double y_true, double raw_prediction) nogil: + cdef double2 lg + lg.val2 = exp(-raw_prediction) + lg.val1 = raw_prediction + y_true * lg.val2 # loss + lg.val2 = 1. - y_true * lg.val2 # gradient + return lg + + +cdef inline double2 cgrad_hess_half_gamma(double y_true, double raw_prediction) nogil: + cdef double2 gh + gh.val2 = exp(-raw_prediction) + gh.val1 = 1. - y_true * gh.val2 # gradient + gh.val2 *= y_true # hessian + return gh + + +# Half Tweedie Deviance with Log-Link, dropping constant terms +# Note that by dropping constants this is no longer smooth in parameter power. +cdef inline double closs_half_tweedie( + double y_true, double raw_prediction, double power +) nogil: + if power == 0.: + return closs_half_squared_error(y_true, exp(raw_prediction)) + elif power == 1.: + return closs_half_poisson(y_true, raw_prediction) + elif power == 2.: + return closs_half_gamma(y_true, raw_prediction) + else: + return (exp((2. - power) * raw_prediction) / (2. - power) + - y_true * exp((1. - power) * raw_prediction) / (1. - power)) + + +cdef inline double cgradient_half_tweedie( + double y_true, double raw_prediction, double power +) nogil: + cdef double exp1 + if power == 0.: + exp1 = exp(raw_prediction) + return exp1 * (exp1 - y_true) + elif power == 1.: + return cgradient_half_poisson(y_true, raw_prediction) + elif power == 2.: + return cgradient_half_gamma(y_true, raw_prediction) + else: + return (exp((2. - power) * raw_prediction) + - y_true * exp((1. 
- power) * raw_prediction)) + + +cdef inline double2 closs_grad_half_tweedie( + double y_true, double raw_prediction, double power +) nogil: + cdef double2 lg + cdef double exp1, exp2 + if power == 0.: + exp1 = exp(raw_prediction) + lg.val1 = closs_half_squared_error(y_true, exp1) # loss + lg.val2 = exp1 * (exp1 - y_true) # gradient + elif power == 1.: + return closs_grad_half_poisson(y_true, raw_prediction) + elif power == 2.: + return closs_grad_half_gamma(y_true, raw_prediction) + else: + exp1 = exp((1. - power) * raw_prediction) + exp2 = exp((2. - power) * raw_prediction) + lg.val1 = exp2 / (2. - power) - y_true * exp1 / (1. - power) # loss + lg.val2 = exp2 - y_true * exp1 # gradient + return lg + + +cdef inline double2 cgrad_hess_half_tweedie( + double y_true, double raw_prediction, double power +) nogil: + cdef double2 gh + cdef double exp1, exp2 + if power == 0.: + exp1 = exp(raw_prediction) + gh.val1 = exp1 * (exp1 - y_true) # gradient + gh.val2 = exp1 * (2 * exp1 - y_true) # hessian + elif power == 1.: + return cgrad_hess_half_poisson(y_true, raw_prediction) + elif power == 2.: + return cgrad_hess_half_gamma(y_true, raw_prediction) + else: + exp1 = exp((1. - power) * raw_prediction) + exp2 = exp((2. - power) * raw_prediction) + gh.val1 = exp2 - y_true * exp1 # gradient + gh.val2 = (2. - power) * exp2 - (1. - power) * y_true * exp1 # hessian + return gh + + +# Binary cross entropy aka log-loss +cdef inline double closs_binary_crossentropy( + double y_true, double raw_prediction +) nogil: + # log1p(exp(raw_prediction)) - y_true * raw_prediction + return log1pexp(raw_prediction) - y_true * raw_prediction + + +cdef inline double cgradient_binary_crossentropy( + double y_true, double raw_prediction +) nogil: + # y_pred - y_true = expit(raw_prediction) - y_true + # Numerically more stable, see + # http://fa.bianp.net/blog/2019/evaluate_logistic/ + # if raw_prediction < 0: + # exp_tmp = exp(raw_prediction) + # return ((1 - y_true) * exp_tmp - y_true) / (1 + exp_tmp) + # else: + # exp_tmp = exp(-raw_prediction) + # return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) + # Note that optimal speed would be achieved, at the cost of precision, by + # return expit(raw_prediction) - y_true + # i.e. no if else, and an own inline implemention of expit instead of + # from scipy.special.cython_special cimport expit + # The case distinction raw_prediction < 0 in the stable implementation + # does not provide significant better precision. Therefore we go without + # it. 
+ cdef double exp_tmp + exp_tmp = exp(-raw_prediction) + return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) + + +cdef inline double2 closs_grad_binary_crossentropy( + double y_true, double raw_prediction +) nogil: + cdef double2 lg + if raw_prediction <= 0: + lg.val2 = exp(raw_prediction) + if raw_prediction <= -37: + lg.val1 = lg.val2 - y_true * raw_prediction # loss + else: + lg.val1 = log1p(lg.val2) - y_true * raw_prediction # loss + lg.val2 = ((1 - y_true) * lg.val2 - y_true) / (1 + lg.val2) # gradient + else: + lg.val2 = exp(-raw_prediction) + if raw_prediction <= 18: + # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x)) + lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction # loss + else: + lg.val1 = lg.val2 + (1 - y_true) * raw_prediction # loss + lg.val2 = ((1 - y_true) - y_true * lg.val2) / (1 + lg.val2) # gradient + return lg + + +cdef inline double2 cgrad_hess_binary_crossentropy( + double y_true, double raw_prediction +) nogil: + # with y_pred = expit(raw) + # hessian = y_pred * (1 - y_pred) = exp(raw) / (1 + exp(raw))**2 + # = exp(-raw) / (1 + exp(-raw))**2 + cdef double2 gh + gh.val2 = exp(-raw_prediction) + gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2) # gradient + gh.val2 = gh.val2 / (1 + gh.val2)**2 # hessian + return gh + + +# --------------------------------------------------- +# Extension Types for Loss Functions of 1-dim targets +# --------------------------------------------------- +cdef class cLossFunction: + """Base class for convex loss functions.""" + + cdef double closs(self, double y_true, double raw_prediction) nogil: + """Compute the loss for a single sample. + + Parameters + ---------- + y_true : double + Observed, true target value. + raw_prediction : double + Raw prediction value (in link space). + + Returns + ------- + double + The loss evaluated at `y_true` and `raw_prediction`. + """ + pass + + cdef double cgradient(self, double y_true, double raw_prediction) nogil: + """Compute gradient of loss w.r.t. raw_prediction for a single sample. + + Parameters + ---------- + y_true : double + Observed, true target value. + raw_prediction : double + Raw prediction value (in link space). + + Returns + ------- + double + The derivative of the loss function w.r.t. `raw_prediction`. + """ + pass + + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + """Compute gradient and hessian. + + Gradient and hessian of loss w.r.t. raw_prediction for a single sample. + + This is usually diagonal in raw_prediction_i and raw_prediction_j. + Therefore, we return the diagonal element i=j. + + For a loss with a non-canonical link, this might implement the diagonal + of the Fisher matrix (=expected hessian) instead of the hessian. + + Parameters + ---------- + y_true : double + Observed, true target value. + raw_prediction : double + Raw prediction value (in link space). + + Returns + ------- + grad_hess_pair + Gradient and hessian of the loss function w.r.t. `raw_prediction`. + """ + pass + + # Note: With Cython 3.0, fused types can be used together with const: + # const Y_DTYPE_C double[::1] y_true + # See release notes 3.0.0 alpha1 + # https://cython.readthedocs.io/en/latest/src/changes.html#alpha-1-2020-04-12 + def _loss( + self, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] loss, # OUT + int n_threads=1 + ): + """Compute the pointwise loss value for each input. 
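+
+ This base-class implementation is a no-op; concrete subclasses override
+ it with the loss-specific computation.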
+ + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + loss : array of shape (n_samples,) + A location into which the result is stored. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + loss : array of shape (n_samples,) + Element-wise loss function. + """ + pass + + def _gradient( + self, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] gradient, # OUT + int n_threads=1 + ): + """Compute gradient of loss w.r.t raw_prediction for each input. + + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + gradient : array of shape (n_samples,) + A location into which the result is stored. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + gradient : array of shape (n_samples,) + Element-wise gradients. + """ + pass + + def _loss_gradient( + self, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] loss, # OUT + G_DTYPE_C[::1] gradient, # OUT + int n_threads=1 + ): + """Compute loss and gradient of loss w.r.t raw_prediction. + + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + loss : array of shape (n_samples,) or None + A location into which the element-wise loss is stored. + gradient : array of shape (n_samples,) + A location into which the gradient is stored. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + loss : array of shape (n_samples,) + Element-wise loss function. + + gradient : array of shape (n_samples,) + Element-wise gradients. + """ + self._loss(y_true, raw_prediction, sample_weight, loss, + n_threads) + self._gradient(y_true, raw_prediction, sample_weight, gradient, + n_threads) + return np.asarray(loss), np.asarray(gradient) + + def _gradient_hessian( + self, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] gradient, # OUT + G_DTYPE_C[::1] hessian, # OUT + int n_threads=1 + ): + """Compute gradient and hessian of loss w.r.t raw_prediction. + + Parameters + ---------- + y_true : array of shape (n_samples,) + Observed, true target values. + raw_prediction : array of shape (n_samples,) + Raw prediction values (in link space). + sample_weight : array of shape (n_samples,) or None + Sample weights. + gradient : array of shape (n_samples,) + A location into which the gradient is stored. + hessian : array of shape (n_samples,) + A location into which the hessian is stored. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + gradient : array of shape (n_samples,) + Element-wise gradients. + + hessian : array of shape (n_samples,) + Element-wise hessians. + """ + pass + + +cdef class cHalfSquaredError(cLossFunction): + """Half Squared Error with identity link. 
+ + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + """ + + cdef double closs(self, double y_true, double raw_prediction) nogil: + return closs_half_squared_error(y_true, raw_prediction) + + cdef double cgradient(self, double y_true, double raw_prediction) nogil: + return cgradient_half_squared_error(y_true, raw_prediction) + + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + return cgrad_hess_half_squared_error(y_true, raw_prediction) + + def _loss( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = closs_half_squared_error(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = ( + sample_weight[i] + * closs_half_squared_error(y_true[i], raw_prediction[i]) + ) + + return np.asarray(loss) + + def _gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = cgradient_half_squared_error(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = ( + sample_weight[i] + * cgradient_half_squared_error(y_true[i], raw_prediction[i]) + ) + + return np.asarray(gradient) + + + def _gradient_hessian( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + G_DTYPE_C[::1] hessian, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double2 dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_half_squared_error(y_true[i], raw_prediction[i]) + gradient[i] = dbl2.val1 + hessian[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_half_squared_error(y_true[i], raw_prediction[i]) + gradient[i] = sample_weight[i] * dbl2.val1 + hessian[i] = sample_weight[i] * dbl2.val2 + + return np.asarray(gradient), np.asarray(hessian) + + +cdef class cAbsoluteError(cLossFunction): + """Absolute Error with identity link. 
+ + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + """ + + cdef double closs(self, double y_true, double raw_prediction) nogil: + return closs_absolute_error(y_true, raw_prediction) + + cdef double cgradient(self, double y_true, double raw_prediction) nogil: + return cgradient_absolute_error(y_true, raw_prediction) + + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + return cgrad_hess_absolute_error(y_true, raw_prediction) + + def _loss( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = closs_absolute_error(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = (sample_weight[i] + * closs_absolute_error(y_true[i], raw_prediction[i])) + + return np.asarray(loss) + + def _gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = cgradient_absolute_error(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = ( + sample_weight[i] + * cgradient_absolute_error(y_true[i], raw_prediction[i]) + ) + + return np.asarray(gradient) + + def _gradient_hessian( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + G_DTYPE_C[::1] hessian, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double2 dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_absolute_error(y_true[i], raw_prediction[i]) + gradient[i] = dbl2.val1 + hessian[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_absolute_error(y_true[i], raw_prediction[i]) + gradient[i] = sample_weight[i] * dbl2.val1 + hessian[i] = sample_weight[i] * dbl2.val2 + + return np.asarray(gradient), np.asarray(hessian) + + +cdef class cPinballLoss(cLossFunction): + """Quantile Loss aka Pinball Loss with identity link. 
+ + Domain: + y_true and y_pred all real numbers + quantile in (0, 1) + + Link: + y_pred = raw_prediction + + Note: 2 * cPinballLoss(quantile=0.5) equals cAbsoluteError() + """ + + def __init__(self, quantile): + self.quantile = quantile + + cdef double closs(self, double y_true, double raw_prediction) nogil: + return closs_pinball_loss(y_true, raw_prediction, self.quantile) + + cdef double cgradient(self, double y_true, double raw_prediction) nogil: + return cgradient_pinball_loss(y_true, raw_prediction, self.quantile) + + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + return cgrad_hess_pinball_loss(y_true, raw_prediction, self.quantile) + + def _loss( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = closs_pinball_loss(y_true[i], raw_prediction[i], self.quantile) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = ( + sample_weight[i] + * closs_pinball_loss(y_true[i], raw_prediction[i], self.quantile) + ) + + return np.asarray(loss) + + def _gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = cgradient_pinball_loss( + y_true[i], raw_prediction[i], self.quantile + ) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = ( + sample_weight[i] + * cgradient_pinball_loss(y_true[i], raw_prediction[i], self.quantile) + ) + + return np.asarray(gradient) + + def _gradient_hessian( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + G_DTYPE_C[::1] hessian, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double2 dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_pinball_loss( + y_true[i], raw_prediction[i], self.quantile + ) + gradient[i] = dbl2.val1 + hessian[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_pinball_loss( + y_true[i], raw_prediction[i], self.quantile + ) + gradient[i] = sample_weight[i] * dbl2.val1 + hessian[i] = sample_weight[i] * dbl2.val2 + + return np.asarray(gradient), np.asarray(hessian) + + +cdef class cHalfPoissonLoss(cLossFunction): + """Half Poisson deviance loss with log-link. 
+ + Domain: + y_true in non-negative real numbers + y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Poisson deviance with log-link is + y_true * log(y_true/y_pred) + y_pred - y_true + = y_true * log(y_true) - y_true * raw_prediction + + exp(raw_prediction) - y_true + + Dropping constant terms, this gives: + exp(raw_prediction) - y_true * raw_prediction + """ + + cdef double closs(self, double y_true, double raw_prediction) nogil: + return closs_half_poisson(y_true, raw_prediction) + + cdef double cgradient(self, double y_true, double raw_prediction) nogil: + return cgradient_half_poisson(y_true, raw_prediction) + + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + return cgrad_hess_half_poisson(y_true, raw_prediction) + + def _loss( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = closs_half_poisson(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = ( + sample_weight[i] + * closs_half_poisson(y_true[i], raw_prediction[i]) + ) + + return np.asarray(loss) + + def _loss_gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + G_DTYPE_C[::1] gradient, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double2 dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = closs_grad_half_poisson(y_true[i], raw_prediction[i]) + loss[i] = dbl2.val1 + gradient[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = closs_grad_half_poisson(y_true[i], raw_prediction[i]) + loss[i] = sample_weight[i] * dbl2.val1 + gradient[i] = sample_weight[i] * dbl2.val2 + + return np.asarray(loss), np.asarray(gradient) + + def _gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = cgradient_half_poisson(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = ( + sample_weight[i] + * cgradient_half_poisson(y_true[i], raw_prediction[i]) + ) + + return np.asarray(gradient) + + def _gradient_hessian( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + G_DTYPE_C[::1] hessian, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double2 dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_half_poisson(y_true[i], raw_prediction[i]) + gradient[i] = dbl2.val1 + hessian[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_half_poisson(y_true[i], raw_prediction[i]) + gradient[i] = sample_weight[i] * dbl2.val1 + hessian[i] = sample_weight[i] 
* dbl2.val2 + + return np.asarray(gradient), np.asarray(hessian) + + +cdef class cHalfGammaLoss(cLossFunction): + """Half Gamma deviance loss with log-link. + + Domain: + y_true and y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Gamma deviance with log-link is + log(y_pred/y_true) + y_true/y_pred - 1 + = raw_prediction - log(y_true) + y_true * exp(-raw_prediction) - 1 + + Dropping constant terms, this gives: + raw_prediction + y_true * exp(-raw_prediction) + """ + + cdef double closs(self, double y_true, double raw_prediction) nogil: + return closs_half_gamma(y_true, raw_prediction) + + cdef double cgradient(self, double y_true, double raw_prediction) nogil: + return cgradient_half_gamma(y_true, raw_prediction) + + cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + return cgrad_hess_half_gamma(y_true, raw_prediction) + + def _loss( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = closs_half_gamma(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + loss[i] = ( + sample_weight[i] + * closs_half_gamma(y_true[i], raw_prediction[i]) + ) + + return np.asarray(loss) + + def _loss_gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + G_DTYPE_C[::1] gradient, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double2 dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = closs_grad_half_gamma(y_true[i], raw_prediction[i]) + loss[i] = dbl2.val1 + gradient[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = closs_grad_half_gamma(y_true[i], raw_prediction[i]) + loss[i] = sample_weight[i] * dbl2.val1 + gradient[i] = sample_weight[i] * dbl2.val2 + + return np.asarray(loss), np.asarray(gradient) + + def _gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = cgradient_half_gamma(y_true[i], raw_prediction[i]) + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + gradient[i] = ( + sample_weight[i] + * cgradient_half_gamma(y_true[i], raw_prediction[i]) + ) + + return np.asarray(gradient) + + def _gradient_hessian( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[::1] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] gradient, + G_DTYPE_C[::1] hessian, + int n_threads=1 + ): + cdef: + int i + int n_samples = y_true.shape[0] + double2 dbl2 + + if sample_weight is None: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_half_gamma(y_true[i], raw_prediction[i]) + gradient[i] = dbl2.val1 + hessian[i] = dbl2.val2 + else: + for i in prange( + n_samples, schedule='static', nogil=True, num_threads=n_threads + ): + dbl2 = cgrad_hess_half_gamma(y_true[i], 
raw_prediction[i]) + gradient[i] = sample_weight[i] * dbl2.val1 + hessian[i] = sample_weight[i] * dbl2.val2 + + return np.asarray(gradient), np.asarray(hessian) + + +cdef class cHalfTweedieLoss(cLossFunction): + """Half Tweedie deviance loss with log-link. + + Domain: + y_true in real numbers if p <= 0 + y_true in non-negative real numbers if 0 < p < 2 + y_true in positive real numbers if p >= 2 + y_pred and power in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Tweedie deviance with log-link and p=power is + max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * y_pred**(1-p) / (1-p) + + y_pred**(2-p) / (2-p) + = max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * exp((1-p) * raw_prediction) / (1-p) + + exp((2-p) * raw_prediction) / (2-p) + + Dropping constant terms, this gives: + exp((2-p) * raw_prediction) / (2-p) + - y_true * exp((1-p) * raw_prediction) / (1-p) + + Notes: + - Poisson with p=1 and and Gamma with p=2 have different terms dropped such + that cHalfTweedieLoss is not continuous in p=power at p=1 and p=2. + - While the Tweedie distribution only exists for p<=0 or p>=1, the range + 0 n_classes. In this case having the inner loop + # over n_classes is a good default. + # TODO: If every memoryview is contiguous and raw_preduction is + # f-contiguous, can we write a better algo (loops) to improve + # performance? + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + max_value = p[n_classes] # p[-2] + sum_exps = p[n_classes + 1] # p[-1] + loss[i] = log(sum_exps) + max_value + + for k in range(n_classes): + # label decode y_true + if y_true[i] == k: + loss[i] -= raw_prediction[i, k] + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + max_value = raw_prediction[i, 0] + max_value = p[n_classes] # p[-2] + sum_exps = p[n_classes + 1] # p[-1] + loss[i] = log(sum_exps) + max_value + + for k in range(n_classes): + # label decode y_true + if y_true[i] == k: + loss[i] -= raw_prediction[i, k] + + loss[i] *= sample_weight[i] + + free(p) + + return np.asarray(loss) + + def _loss_gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[:, :] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[::1] loss, + G_DTYPE_C[:, :] gradient, + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + Y_DTYPE_C max_value, sum_exps + Y_DTYPE_C* p # temporary buffer + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. 
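+ # From its use below, the buffer filled by sum_exp_minus_max is assumed
+ # to hold exp(raw_prediction[i, k] - max_value) in p[0..n_classes-1],
+ # the row-wise maximum in p[n_classes] and the sum of the exponentials
+ # in p[n_classes + 1].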
+ p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + max_value = p[n_classes] # p[-2] + sum_exps = p[n_classes + 1] # p[-1] + loss[i] = log(sum_exps) + max_value + + for k in range(n_classes): + # label decode y_true + if y_true [i] == k: + loss[i] -= raw_prediction[i, k] + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = p_k - (y_true == k) + gradient[i, k] = p[k] - (y_true[i] == k) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + max_value = p[n_classes] # p[-2] + sum_exps = p[n_classes + 1] # p[-1] + loss[i] = log(sum_exps) + max_value + + for k in range(n_classes): + # label decode y_true + if y_true [i] == k: + loss[i] -= raw_prediction[i, k] + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + gradient[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + + loss[i] *= sample_weight[i] + + free(p) + + return np.asarray(loss), np.asarray(gradient) + + def _gradient( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[:, :] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[:, :] gradient, + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + Y_DTYPE_C sum_exps + Y_DTYPE_C* p # temporary buffer + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + sum_exps = p[n_classes + 1] # p[-1] + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = y_pred_k - (y_true == k) + gradient[i, k] = p[k] - (y_true[i] == k) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + sum_exps = p[n_classes + 1] # p[-1] + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + gradient[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + + free(p) + + return np.asarray(gradient) + + def _gradient_hessian( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[:, :] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[:, :] gradient, + G_DTYPE_C[:, :] hessian, + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + Y_DTYPE_C sum_exps + Y_DTYPE_C* p # temporary buffer + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + sum_exps = p[n_classes + 1] # p[-1] + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # hessian_k = p_k * (1 - p_k) + # gradient_k = p_k - (y_true == k) + gradient[i, k] = p[k] - (y_true[i] == k) + hessian[i, k] = p[k] * (1. 
- p[k]) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + sum_exps = p[n_classes + 1] # p[-1] + + for k in range(n_classes): + p[k] /= sum_exps # p_k = y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + # hessian_k = p_k * (1 - p_k) * sw + gradient[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + hessian[i, k] = (p[k] * (1. - p[k])) * sample_weight[i] + + free(p) + + return np.asarray(gradient), np.asarray(hessian) + + + # This method simplifies the implementation of hessp in linear models, + # i.e. the matrix-vector product of the full hessian, not only of the + # diagonal (in the classes) approximation as implemented above. + def _gradient_proba( + self, + Y_DTYPE_C[::1] y_true, + Y_DTYPE_C[:, :] raw_prediction, + Y_DTYPE_C[::1] sample_weight, + G_DTYPE_C[:, :] gradient, + G_DTYPE_C[:, :] proba, + int n_threads=1 + ): + cdef: + int i, k + int n_samples = y_true.shape[0] + int n_classes = raw_prediction.shape[1] + Y_DTYPE_C sum_exps + Y_DTYPE_C* p # temporary buffer + + if sample_weight is None: + # inner loop over n_classes + with nogil, parallel(num_threads=n_threads): + # Define private buffer variables as each thread might use its + # own. + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + sum_exps = p[n_classes + 1] # p[-1] + + for k in range(n_classes): + proba[i, k] = p[k] / sum_exps # y_pred_k = prob of class k + # gradient_k = y_pred_k - (y_true == k) + gradient[i, k] = proba[i, k] - (y_true[i] == k) + + free(p) + else: + with nogil, parallel(num_threads=n_threads): + p = malloc(sizeof(Y_DTYPE_C) * (n_classes + 2)) + + for i in prange(n_samples, schedule='static'): + sum_exp_minus_max(i, raw_prediction, p) + sum_exps = p[n_classes + 1] # p[-1] + + for k in range(n_classes): + proba[i, k] = p[k] / sum_exps # y_pred_k = prob of class k + # gradient_k = (p_k - (y_true == k)) * sw + gradient[i, k] = (proba[i, k] - (y_true[i] == k)) * sample_weight[i] + + free(p) + + return np.asarray(gradient), np.asarray(proba) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py new file mode 100644 index 0000000000000..49d968b6bd2af --- /dev/null +++ b/sklearn/_loss/loss.py @@ -0,0 +1,910 @@ +""" +This module contains loss classes suitable for fitting. + +It is not part of the public API. +Specific losses are used for regression, binary classification or multiclass +classification. +""" +# Goals: +# - Provide a common private module for loss functions/classes. +# - Replace losses for: +# - LogisticRegression +# - PoissonRegressor, GammaRegressor, TweedieRegressor +# - HistGradientBoostingRegressor, HistGradientBoostingClassifier +# - GradientBoostingRegressor, GradientBoostingClassifier +# - SGDRegressor, SGDClassifier +# - Replace link module of GLMs. 
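+#
+# A minimal usage sketch (illustrative only; it assumes the compiled Cython
+# extension sklearn._loss._loss has been built so that the imports in this
+# module succeed):
+#
+#   import numpy as np
+#   from sklearn._loss.loss import HalfPoissonLoss
+#
+#   loss = HalfPoissonLoss()
+#   y_true = np.array([0.0, 1.0, 3.0])
+#   raw_prediction = np.array([0.5, 0.0, 1.0])  # log of y_pred
+#   mean_loss = loss(y_true=y_true, raw_prediction=raw_prediction)
+#   grad = loss.gradient(y_true=y_true, raw_prediction=raw_prediction)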
+ +import numpy as np +from scipy.special import xlogy +from ._loss import ( + cLossFunction, + cHalfSquaredError, + cAbsoluteError, + cPinballLoss, + cHalfPoissonLoss, + cHalfGammaLoss, + cHalfTweedieLoss, + cBinaryCrossEntropy, + cCategoricalCrossEntropy, +) +from .link import ( + Interval, + is_in_interval_range, + BaseLink, + IdentityLink, + LogLink, + LogitLink, + MultinomialLogit, +) +from ..utils.stats import _weighted_percentile + + +# Note: The shape of raw_prediction for multiclass classifications are +# - GradientBoostingClassifier: (n_samples, n_classes) +# - HistGradientBoostingClassifier: (n_classes, n_samples) +class BaseLoss(BaseLink, cLossFunction): + """Base class for a loss function of 1-dimensional targets. + + Conventions: + + - y_true.shape = sample_weight.shape = (n_samples,) + - y_pred.shape = raw_prediction.shape = (n_samples,) + - If n_classes >= 3 (multiclass classification), then + y_pred.shape = raw_prediction.shape = (n_samples, n_classes) + Note that this corresponds to the return value of decision_function. + + y_true, y_pred, sample_weight and raw_prediction must either be all float64 + or all float32. + gradient and hessian must be either both float64 or both float32. + + Note that y_pred = link.inverse(raw_prediction). + + Specific loss classes can inherit specific link classes to satisfy + BaseLink's abstractmethods. + + Parameters + ---------- + sample_weight : {None, ndarray} + If sample_weight is None, the hessian might be constant. + n_classes : {None, int} + The number of classes for classification, else None. + + Attributes + ---------- + interval_y_true: Interval + Valid interval for y_true + interval_y_pred: Interval + Valid Interval for y_pred + differentiable: bool + Indicates whether or not loss function is differentiable in + raw_prediction everywhere. + need_update_leaves_values: bool + Indicates whether decision trees in gradient boosting need to uptade + leave values after having been fit to the (negative) gradients. + approx_hessian : bool + Indicates whether the hessian is approximated or exact. If, + approximated, it should be larger or equal to the exact one. + constant_hessian : bool + Indicates whether the hessian is one for this loss. + """ + + # Inherited methods from BaseLink: + # - link + # - inverse + # + # Inherited methods from cLossFunction: + # - _loss, _loss_gradient, _gradient, _gradient_hessian + + # For decision trees: + # This variable indicates whether the loss requires the leaves values to + # be updated once the tree has been trained. The trees are trained to + # predict a Newton-Raphson step (see grower._finalize_leaf()). But for + # some losses (e.g. least absolute deviation) we need to adjust the tree + # values to account for the "line search" of the gradient descent + # procedure. See the original paper Greedy Function Approximation: A + # Gradient Boosting Machine by Friedman + # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. + need_update_leaves_values = False + differentiable = True + + def __init__(self, n_classes=1): + self.approx_hessian = False + self.constant_hessian = False + self.n_classes = n_classes + self.interval_y_true = Interval(-np.inf, np.inf, False, False) + self.interval_y_pred = Interval(-np.inf, np.inf, False, False) + + def in_y_true_range(self, y): + """Return True if y is in the valid range of y_true. 
+ + Parameters + ---------- + y : ndarray + """ + return is_in_interval_range(y, self.interval_y_true) + + def in_y_pred_range(self, y): + """Return True if y is in the valid range of y_pred. + + Parameters + ---------- + y : ndarray + """ + return is_in_interval_range(y, self.interval_y_pred) + + def loss( + self, + y_true, + raw_prediction, + sample_weight=None, + loss=None, + n_threads=1, + ): + """Compute the pointwise loss value for each input. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + loss : None or C-contiguous array of shape (n_samples,) + A location into which the result is stored. If None, a new array + might be created. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + loss : array of shape (n_samples,) + Element-wise loss function. + """ + if loss is None: + loss = np.empty_like(y_true) + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + return self._loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=loss, + n_threads=n_threads, + ) + + def loss_gradient( + self, + y_true, + raw_prediction, + sample_weight=None, + loss=None, + gradient=None, + n_threads=1, + ): + """Compute loss and gradient w.r.t. raw_prediction for each input. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + loss : None or C-contiguous array of shape (n_samples,) + A location into which the loss is stored. If None, a new array + might be created. + gradient : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the gradient is stored. If None, a new array + might be created. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + loss : array of shape (n_samples,) + Element-wise loss function. + + gradient : array of shape (n_samples,) or (n_samples, n_classes) + Element-wise gradients. + """ + if loss is None: + if gradient is None: + loss = np.empty_like(y_true) + gradient = np.empty_like(raw_prediction) + else: + loss = np.empty_like(y_true, dtype=gradient.dtype) + elif gradient is None: + gradient = np.empty_like(raw_prediction, dtype=loss.dtype) + + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + if gradient.ndim == 2 and gradient.shape[1] == 1: + gradient = gradient.squeeze(1) + + return self._loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=loss, + gradient=gradient, + n_threads=n_threads, + ) + + def gradient( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient=None, + n_threads=1, + ): + """Compute gradient of loss w.r.t raw_prediction for each input. 
+ + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + gradient : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the result is stored. If None, a new array + might be created. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + gradient : array of shape (n_samples,) or (n_samples, n_classes) + Element-wise gradients. + """ + if gradient is None: + gradient = np.empty_like(raw_prediction) + + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + if gradient.ndim == 2 and gradient.shape[1] == 1: + gradient = gradient.squeeze(1) + + return self._gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=gradient, + n_threads=n_threads, + ) + + def gradient_hessian( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient=None, + hessian=None, + n_threads=1, + ): + """Compute gradient and hessian of loss w.r.t raw_prediction. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). + sample_weight : None or C-contiguous array of shape (n_samples,) + Sample weights. + gradient : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the gradient is stored. If None, a new array + might be created. + hessian : None or C-contiguous array of shape (n_samples,) or array \ + of shape (n_samples, n_classes) + A location into which the hessian is stored. If None, a new array + might be created. + n_threads : int + Might use openmp thread parallelism. + + Returns + ------- + gradient : arrays of shape (n_samples,) or (n_samples, n_classes) + Element-wise gradients. + + hessian : arrays of shape (n_samples,) or (n_samples, n_classes) + Element-wise hessians. + """ + if gradient is None: + if hessian is None: + gradient = np.empty_like(raw_prediction) + hessian = np.empty_like(raw_prediction) + else: + gradient = np.empty_like(hessian) + elif hessian is None: + hessian = np.empty_like(gradient) + + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + if gradient.ndim == 2 and gradient.shape[1] == 1: + gradient = gradient.squeeze(1) + if hessian.ndim == 2 and hessian.shape[1] == 1: + hessian = hessian.squeeze(1) + + return self._gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=gradient, + hessian=hessian, + n_threads=n_threads, + ) + + def __call__( + self, y_true, raw_prediction, sample_weight=None, n_threads=1 + ): + """Compute the weighted average loss. + + Parameters + ---------- + y_true : C-contiguous array of shape (n_samples,) + Observed, true target values. + raw_prediction : C-contiguous array of shape (n_samples,) or array of \ + shape (n_samples, n_classes) + Raw prediction values (in link space). 
+ sample_weight : None or C-contiguous array of shape (n_samples,)
+ Sample weights.
+ n_threads : int
+ Might use openmp thread parallelism.
+
+ Returns
+ -------
+ loss : float
+ Mean or averaged loss function.
+ """
+ return np.average(
+ self.loss(
+ y_true=y_true,
+ raw_prediction=raw_prediction,
+ sample_weight=None,
+ loss=None,
+ n_threads=n_threads,
+ ),
+ weights=sample_weight,
+ )
+
+ def fit_intercept_only(self, y_true, sample_weight=None):
+ """Compute raw_prediction of an intercept-only model.
+
+ This can be used as an initial estimate of the predictions, i.e. before
+ the first iteration in fit.
+
+ Parameters
+ ----------
+ y_true : array-like of shape (n_samples,)
+ Observed, true target values.
+ sample_weight : None or array of shape (n_samples,)
+ Sample weights.
+
+ Returns
+ -------
+ raw_prediction : float or (n_classes,)
+ Raw predictions of an intercept-only model.
+ """
+ # By default, take the weighted average of the target over the samples
+ # axis=0 and then transform into link-scale (raw_prediction).
+ y_pred = np.average(y_true, weights=sample_weight, axis=0)
+ eps = 10 * np.finfo(y_pred.dtype).eps
+
+ if self.interval_y_pred.low == -np.inf:
+ a_min = None
+ elif self.interval_y_pred.low_inclusive:
+ a_min = self.interval_y_pred.low
+ else:
+ a_min = self.interval_y_pred.low + eps
+
+ if self.interval_y_pred.high == np.inf:
+ a_max = None
+ elif self.interval_y_pred.high_inclusive:
+ a_max = self.interval_y_pred.high
+ else:
+ a_max = self.interval_y_pred.high - eps
+
+ if a_min is None and a_max is None:
+ return self.link(y_pred)
+ else:
+ return self.link(np.clip(y_pred, a_min, a_max))
+
+ def constant_to_optimal_zero(self, y_true, sample_weight=None):
+ """Calculate term dropped in loss.
+
+ With this term added, the loss of perfect predictions is zero.
+ """
+ return np.zeros_like(y_true)
+
+
+class HalfSquaredError(IdentityLink, BaseLoss, cHalfSquaredError):
+ """Half Squared Error with identity link, for regression.
+
+ Domain:
+ y_true and y_pred all real numbers
+
+ Link:
+ y_pred = raw_prediction
+
+ For a given sample x_i, half squared error is defined as::
+
+ loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2
+
+ The factor of 0.5 simplifies the computation of gradients and results in a
+ unit hessian (and is consistent with what is done in LightGBM).
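+
+ For example (illustrative numbers), y_true_i = 1 and raw_prediction_i = 0
+ give loss(x_i) = 0.5, gradient = raw_prediction_i - y_true_i = -1 and
+ hessian = 1.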
+ """ + + def __init__(self, sample_weight=None): + super().__init__() + if sample_weight is None: + self.constant_hessian = True + else: + self.constant_hessian = False + + def gradient( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient=None, + n_threads=1, + ): + # easier in numpy + # gradient = raw_prediction - y_true is easier in numpy + + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + if ( + gradient is not None + and gradient.ndim == 2 + and gradient.shape[1] == 1 + ): + gradient = gradient.squeeze(1) + + gradient = np.subtract(raw_prediction, y_true, out=gradient) + if sample_weight is None: + return gradient + else: + return np.multiply(sample_weight, gradient, out=gradient) + + def gradient_hessian( + self, + y_true, + raw_prediction, + sample_weight=None, + gradient=None, + hessian=None, + n_threads=1, + ): + # easier in numpy + gradient = self.gradient( + y_true, raw_prediction, sample_weight, gradient, hessian + ) + if hessian is None: + hessian = np.empty_like(gradient) + elif hessian.ndim == 2 and hessian.shape[1] == 1: + # Be graceful to shape (n_samples, 1) -> (n_samples,) + hessian = hessian.squeeze(1) + if sample_weight is None: + hessian.fill(1) + else: + np.copyto(hessian, sample_weight) + return gradient, hessian + + +class AbsoluteError(IdentityLink, BaseLoss, cAbsoluteError): + """Least absolute error, for regression. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + + For a given sample x_i, the absolute error is defined as:: + + loss(x_i) = |y_true_i - raw_prediction_i| + """ + + differentiable = False + need_update_leaves_values = True + + def __init__(self, sample_weight=None): + super().__init__() + self.approx_hessian = True + if sample_weight is None: + self.constant_hessian = True + else: + self.constant_hessian = False + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. + + This is the weighted median of the target, i.e. over the samples + axis=0. + """ + if sample_weight is None: + return np.median(y_true, axis=0) + else: + return _weighted_percentile(y_true, sample_weight, 50) + + +class PinballLoss(IdentityLink, BaseLoss, cPinballLoss): + """Quantile Loss aka Pinball Loss, for regression. + + Domain: + y_true and y_pred all real numbers + quantile in (0, 1) + + Link: + y_pred = raw_prediction + + For a given sample x_i, the pinball loss loss is defined as:: + + loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i) + + rho_{quantile}(u) = u * (quantile - 1_{u<0}) + = -u (1 - quantile) if u < 0 + u * quantile if u >= 0 + + Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError(). + + Additional Attributes + --------------------- + quantile : float + The quantile to be estimated. Must be in range (0, 1). + """ + + differentiable = False + need_update_leaves_values = True + + def __init__(self, sample_weight=None, quantile=0.5): + BaseLoss.__init__(self) + cPinballLoss.__init__(self, quantile=float(quantile)) + self.approx_hessian = True + if sample_weight is None: + self.constant_hessian = True + else: + self.constant_hessian = False + if quantile <= 0 or quantile >= 1: + raise ValueError( + f"PinballLoss aka quantile loss only accepts " + f"0 < quantile < 1; {quantile} was given." + ) + + def fit_intercept_only(self, y_true, sample_weight=None): + """Compute raw_prediction of an intercept-only model. 
+ + This is the weighted median of the target, i.e. over the samples + axis=0. + """ + if sample_weight is None: + return np.percentile(y_true, 100 * self.quantile, axis=0) + else: + return _weighted_percentile( + y_true, sample_weight, 100 * self.quantile + ) + + +class HalfPoissonLoss(LogLink, BaseLoss, cHalfPoissonLoss): + """Poisson deviance loss with log-link, for regression. + + Domain: + y_true in non-negative real numbers + y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + For a given sample x_i, half the Poisson deviance is defined as:: + + loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i)) + - y_true_i + exp(raw_prediction_i) + + Half the Poisson deviance is actually the negative log likelihood up to + constant terms (not involving raw_prediction) and simplifies the + computation of the gradients. + We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`. + """ + + def __init__(self, sample_weight=None): + super().__init__() + self.interval_y_true = Interval(0, np.inf, True, False) + self.interval_y_pred = Interval(0, np.inf, False, False) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + term = xlogy(y_true, y_true) - y_true + if sample_weight is not None: + term *= sample_weight + return term + + +class HalfGammaLoss(LogLink, BaseLoss, cHalfGammaLoss): + """Gamma deviance loss with log-link, for regression. + + Domain: + y_true and y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + For a given sample x_i, half Gamma deviance loss is defined as:: + + loss(x_i) = log(exp(raw_prediction_i)/y_true_i) + + y_true/exp(raw_prediction_i) - 1 + + Half the Gamma deviance is actually proportional the negative log + likelihood up constant terms (not involving raw_prediction) and simplifies + the computation of the gradients. + We also skip the constant term `-log(y_true_i) - 1`. + """ + + def __init__(self, sample_weight=None): + super().__init__() + self.interval_y_true = Interval(0, np.inf, False, False) + self.interval_y_pred = Interval(0, np.inf, False, False) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + term = -np.log(y_true) - 1 + if sample_weight is not None: + term *= sample_weight + return term + + +class HalfTweedieLoss(LogLink, BaseLoss, cHalfTweedieLoss): + """Tweedie deviance loss with log-link, for regression. + + Domain: + y_true in real numbers for power <= 0 + y_true in non-negative real numbers for 0 < power < 2 + y_true in positive real numbers for 2 <= power + y_pred in positive real numbers + power in real numbers + + Link: + y_pred = exp(raw_prediction) + + For a given sample x_i, half Tweedie deviance loss with p=power is defined + as:: + + loss(x_i) = max(y_true_i, 0)**(2-p) / (1-p) / (2-p) + - y_true_i * exp(raw_prediction_i)**(1-p) / (1-p) + + exp(raw_prediction_i)**(2-p) / (2-p) + + Taking the limits for p=0, 1, 2 gives HalfSquaredError with a log link, + HalfPoissonLoss and HalfGammaLoss. + + We also skip constant terms, but those are different for p=0, 1, 2. + Therefore, the loss is not continuous in `power`. + + Note furthermore that although no Tweedie distribution exists for + 0 < power < 1, it still gives a strictly consistent scoring function for + the expectation. 
+ """ + + def __init__(self, sample_weight=None, power=1.5): + BaseLoss.__init__(self) + cHalfTweedieLoss.__init__(self, power=power) + self.interval_y_pred = Interval(0, np.inf, False, False) + if self.power <= 0: + self.interval_y_true = Interval(-np.inf, np.inf, False, False) + elif self.power < 2: + self.interval_y_true = Interval(0, np.inf, True, False) + else: + self.interval_y_true = Interval(0, np.inf, False, False) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + if self.power == 0: + return HalfSquaredError().constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + elif self.power == 1: + return HalfPoissonLoss().constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + elif self.power == 2: + return HalfGammaLoss().constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + else: + p = self.power + term = np.power(np.maximum(y_true, 0), 2 - p) / (1 - p) / (2 - p) + if sample_weight is not None: + term *= sample_weight + return term + + +class BinaryCrossEntropy(LogitLink, BaseLoss, cBinaryCrossEntropy): + """Binary cross entropy loss for binary classification. + + Domain: + y_true in [0, 1] + y_pred in (0, 1), i.e. boundaries excluded + + Link: + y_pred = expit(raw_prediction) + + For a given sample x_i, the binary cross-entropy, aka log loss, is defined + as the negative log-likelihood of the Bernoulli distributions and can be + expressed as:: + + loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i + + See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman, + section 4.4.1 (about logistic regression). + """ + + def __init__(self, sample_weight=None): + super().__init__(n_classes=2) + self.interval_y_true = Interval(0, 1, True, True) + self.interval_y_pred = Interval(0, 1, False, False) + + def constant_to_optimal_zero(self, y_true, sample_weight=None): + # This is non-zero only if y_true is neither 0 nor 1. + term = xlogy(y_true, y_true) + xlogy(1 - y_true, 1 - y_true) + if sample_weight is not None: + term *= sample_weight + return term + + def predict_proba(self, raw_prediction): + # Be graceful to shape (n_samples, 1) -> (n_samples,) + if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: + raw_prediction = raw_prediction.squeeze(1) + proba = np.empty( + (raw_prediction.shape[0], 2), dtype=raw_prediction.dtype + ) + proba[:, 1] = self.inverse(raw_prediction) + proba[:, 0] = 1 - proba[:, 1] + return proba + + +class CategoricalCrossEntropy( + MultinomialLogit, BaseLoss, cCategoricalCrossEntropy +): + """Categorical cross-entropy loss for multiclass classification. + + Domain: + y_true in {0, 1, 2, 3, .., n_classes - 1} + y_pred a n_classes array, each element in (0, 1) + + Link: + y_pred = softmax(raw_prediction) + + Note: We assume y_true to be already label encoded. + + For a given sample x_i, the categorical cross-entropy loss is defined as + the negative log-likelihood of the multinomial distribution, it generalizes + the binary cross-entropy to more than 2 classes:: + + loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1)) + - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1) + + See [1]. + + Note that for the hessian, we calculate only the diagonal part in the + classes: If the full hessian for classes k and l and sample i is H_i_k_l, + we calculate H_i_k_k, i.e. k=l. + + Reference + --------- + .. [1] Simon, Noah, J. Friedman and T. Hastie. + "A Blockwise Descent Algorithm for Group-penalized Multiresponse and + Multinomial Regression." 
+ https://arxiv.org/pdf/1311.6529.pdf
+ """
+
+ def __init__(self, sample_weight=None, n_classes=3):
+ super().__init__(n_classes=n_classes)
+ self.interval_y_true = Interval(0, np.inf, True, False)
+ self.interval_y_pred = Interval(0, 1, False, False)
+
+ def in_y_true_range(self, y):
+ """Return True if y is in the valid range of y_true.
+
+ Parameters
+ ----------
+ y : ndarray
+ """
+ return is_in_interval_range(y, self.interval_y_true) and np.all(
+ y.astype(int) == y
+ )
+
+ def fit_intercept_only(self, y_true, sample_weight=None):
+ """Compute raw_prediction of an intercept-only model.
+
+ This is the symmetric multinomial logit, i.e. the link function, applied
+ to the weighted class proportions of the target over the samples axis=0.
+ """
+ out = np.zeros(self.n_classes, dtype=y_true.dtype)
+ eps = np.finfo(y_true.dtype).eps
+ for k in range(self.n_classes):
+ out[k] = np.average(y_true == k, weights=sample_weight, axis=0)
+ out[k] = np.clip(out[k], eps, 1 - eps)
+ return self.link(out[None, :]).reshape(-1)
+
+ def predict_proba(self, raw_prediction):
+ return self.inverse(raw_prediction)
+
+ def gradient_proba(
+ self,
+ y_true,
+ raw_prediction,
+ sample_weight=None,
+ gradient=None,
+ proba=None,
+ n_threads=1,
+ ):
+ """Compute gradient of the loss w.r.t raw_prediction and class probabilities.
+
+ Parameters
+ ----------
+ y_true : C-contiguous array of shape (n_samples,)
+ Observed, true target values.
+ raw_prediction : array of shape (n_samples, n_classes)
+ Raw prediction values (in link space).
+ sample_weight : None or C-contiguous array of shape (n_samples,)
+ Sample weights.
+ gradient : None or array of shape (n_samples, n_classes)
+ A location into which the gradient is stored. If None, a new array
+ might be created.
+ proba : None or array of shape (n_samples, n_classes)
+ A location into which the class probabilities are stored. If None,
+ a new array might be created.
+ n_threads : int
+ Might use openmp thread parallelism.
+
+ Returns
+ -------
+ gradient : array of shape (n_samples, n_classes)
+ Element-wise gradients.
+
+ proba : array of shape (n_samples, n_classes)
+ Element-wise class probabilities.
+ """ + if gradient is None: + if proba is None: + gradient = np.empty_like(raw_prediction) + proba = np.empty_like(raw_prediction) + else: + gradient = np.empty_like(proba) + elif proba is None: + proba = np.empty_like(gradient) + + return self._gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=gradient, + proba=proba, + n_threads=n_threads, + ) + + +_LOSSES = { + "squared_error": HalfSquaredError, + "absolute_error": AbsoluteError, + "pinball_loss": PinballLoss, + "poisson_loss": HalfPoissonLoss, + "gamma_loss": HalfGammaLoss, + "tweedie_loss": HalfTweedieLoss, + "binary_crossentropy": BinaryCrossEntropy, + "categorical_crossentropy": CategoricalCrossEntropy, +} diff --git a/sklearn/_loss/setup.py b/sklearn/_loss/setup.py new file mode 100644 index 0000000000000..23d35439885ba --- /dev/null +++ b/sklearn/_loss/setup.py @@ -0,0 +1,20 @@ +import numpy +from numpy.distutils.misc_util import Configuration + + +def configuration(parent_package="", top_path=None): + config = Configuration("_loss", parent_package, top_path) + + config.add_extension( + "_loss", sources=["_loss.pyx"], include_dirs=[numpy.get_include()] + ) + + # config.add_subpackage("tests") + + return config + + +if __name__ == "__main__": + from numpy.distutils.core import setup + + setup(**configuration().todict()) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py new file mode 100644 index 0000000000000..5f7e001f2d6de --- /dev/null +++ b/sklearn/_loss/tests/test_loss.py @@ -0,0 +1,814 @@ +import numpy as np +from numpy.testing import assert_allclose, assert_array_equal +import pytest +from pytest import approx +from scipy.optimize import ( + minimize, + minimize_scalar, + newton, +) +from scipy.special import logit + +from sklearn._loss.link import _inclusive_low_high +from sklearn._loss.loss import ( + _LOSSES, + AbsoluteError, + BinaryCrossEntropy, + CategoricalCrossEntropy, + HalfGammaLoss, + HalfPoissonLoss, + HalfSquaredError, + HalfTweedieLoss, + PinballLoss, +) +from sklearn.utils import assert_all_finite +from sklearn.utils._testing import skip_if_32bit +from sklearn.utils.fixes import sp_version, parse_version + + +ALL_LOSSES = list(_LOSSES.values()) + +LOSS_INSTANCES = [loss() for loss in ALL_LOSSES] +# HalfTweedieLoss(power=1.5) is already there as default +LOSS_INSTANCES += [ + PinballLoss(quantile=0.25), + HalfTweedieLoss(power=-1.5), + HalfTweedieLoss(power=0), + HalfTweedieLoss(power=1), + HalfTweedieLoss(power=2), + HalfTweedieLoss(power=3.0), +] + + +def loss_instance_name(loss): + name = loss.__class__.__name__ + if hasattr(loss, "quantile"): + name += f"(quantile={loss.quantile})" + elif hasattr(loss, "power"): + name += f"(power={loss.power})" + return name + + +def random_y_true_raw_prediction( + loss, n_samples, y_bound=(-100, 100), raw_bound=(-5, 5), seed=42 +): + """Random generate y_true and raw_prediction in valid range.""" + rng = np.random.RandomState(seed) + if loss.n_classes <= 2: + raw_prediction = rng.uniform( + low=raw_bound[0], high=raw_bound[0], size=n_samples + ) + # generate a y_true in valid range + low, high = _inclusive_low_high(loss.interval_y_true) + low = max(low, y_bound[0]) + high = min(high, y_bound[1]) + y_true = rng.uniform(low, high, size=n_samples) + # set some values at special boundaries + if ( + loss.interval_y_true.low == 0 + and loss.interval_y_true.low_inclusive + ): + y_true[:: (n_samples // 3)] = 0 + if ( + loss.interval_y_true.high == 1 + and loss.interval_y_true.high_inclusive + 
): + y_true[1:: (n_samples // 3)] = 1 + else: + raw_prediction = np.empty((n_samples, loss.n_classes)) + raw_prediction.flat[:] = rng.uniform( + low=raw_bound[0], + high=raw_bound[1], + size=n_samples * loss.n_classes, + ) + y_true = np.arange(n_samples).astype(float) % loss.n_classes + + return y_true, raw_prediction + + +def numerical_derivative(func, x, eps): + """Helper function for numerical (first) derivatives. + + # For numerical derivatives, see + # https://en.wikipedia.org/wiki/Numerical_differentiation + # https://en.wikipedia.org/wiki/Finite_difference_coefficient + # We use central finite differences of accuracy 4. + """ + h = np.full_like(x, fill_value=eps) + f_minus_2h = func(x - 2 * h) + f_minus_1h = func(x - h) + f_plus_1h = func(x + h) + f_plus_2h = func(x + 2 * h) + return (-f_plus_2h + 8 * f_plus_1h - 8 * f_minus_1h + f_minus_2h) / ( + 12.0 * eps + ) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +def test_loss_boundary(loss): + # make sure low and high are always within the interval, used for linspace + if loss.n_classes is None or loss.n_classes <= 2: + low, high = _inclusive_low_high(loss.interval_y_true) + y_true = np.linspace(low, high, num=10) + else: + y_true = np.linspace(0, 9, num=10) + + # add boundaries if they are included + if loss.interval_y_true.low_inclusive: + y_true = np.r_[y_true, loss.interval_y_true.low] + if loss.interval_y_true.high_inclusive: + y_true = np.r_[y_true, loss.interval_y_true.high] + + assert loss.in_y_true_range(y_true) + + low, high = _inclusive_low_high(loss.interval_y_pred) + if loss.n_classes is None or loss.n_classes <= 2: + y_pred = np.linspace(low, high, num=10) + else: + y_pred = np.empty((10, 3)) + y_pred[:, 0] = np.linspace(low, high, num=10) + y_pred[:, 1] = 0.5 * (1 - y_pred[:, 0]) + y_pred[:, 2] = 0.5 * (1 - y_pred[:, 0]) + + assert loss.in_y_pred_range(y_pred) + + # calculating losses should not fail + raw_prediction = loss.link(y_pred) + loss.loss(y_true=y_true, raw_prediction=raw_prediction) + + +@pytest.mark.parametrize( + "loss, y_true_success, y_true_fail", + [ + (HalfSquaredError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (AbsoluteError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (PinballLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (HalfPoissonLoss(), [0, 0.1, 100], [-np.inf, -3, -0.1, np.inf]), + (HalfGammaLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLoss(power=-3), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLoss(power=0), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + ( + HalfTweedieLoss(power=1.5), + [0, 0.1, 100], + [-np.inf, -3, -0.1, np.inf], + ), + (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (BinaryCrossEntropy(), [0, 0.5, 1], [-np.inf, -1, 2, np.inf]), + (CategoricalCrossEntropy(), [0.0, 1.0, 2], [-np.inf, -1, 1.1, np.inf]), + ], +) +def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): + # Test boundaries of y_true for loss functions. 
+ for y in y_true_success: + assert loss.in_y_true_range(np.array([y])) + for y in y_true_fail: + assert not loss.in_y_true_range(np.array([y])) + + +@pytest.mark.parametrize( + "loss, y_pred_success, y_pred_fail", + [ + (HalfSquaredError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (AbsoluteError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (PinballLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (HalfPoissonLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfGammaLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + ( + HalfTweedieLoss(power=-3), + [0.1, 100], + [-np.inf, -3, -0.1, 0, np.inf], + ), + (HalfTweedieLoss(power=0), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + ( + HalfTweedieLoss(power=1.5), + [0.1, 100], + [-np.inf, -3, -0.1, 0, np.inf], + ), + (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (BinaryCrossEntropy(), [0.1, 0.5], [-np.inf, 0, 1, np.inf]), + (CategoricalCrossEntropy(), [0.1, 0.5], [-np.inf, 0, 1, np.inf]), + ], +) +def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): + # Test boundaries of y_pred for loss functions. + for y in y_pred_success: + assert loss.in_y_pred_range(np.array([y])) + for y in y_pred_fail: + assert not loss.in_y_pred_range(np.array([y])) + + +@pytest.mark.parametrize("loss", ALL_LOSSES) +@pytest.mark.parametrize("dtype_in", [np.float32, np.float64]) +@pytest.mark.parametrize("dtype_out", [np.float32, np.float64]) +@pytest.mark.parametrize("sample_weight", [None, 1]) +@pytest.mark.parametrize("out1", [None, 1]) +@pytest.mark.parametrize("out2", [None, 1]) +@pytest.mark.parametrize("n_threads", [1, 2]) +def test_loss_dtype( + loss, dtype_in, dtype_out, sample_weight, out1, out2, n_threads +): + # Test that loss accepts if all input arrays are either all float32 or all + # float64, and all output arrays are either all float32 or all float64. 
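
# A minimal standalone sketch of the dtype convention exercised in this test:
# the input arrays share one float dtype while a preallocated output buffer
# may use another float dtype. Illustrated here with the squared-error
# gradient raw_prediction - y_true, plain NumPy only (no scikit-learn code);
# variable names are illustrative.
import numpy as np

y_true_f32 = np.array([0.7], dtype=np.float32)
raw_prediction_f32 = np.zeros_like(y_true_f32)          # same dtype as the inputs
out_f64 = np.empty_like(y_true_f32, dtype=np.float64)   # output dtype may differ
np.subtract(raw_prediction_f32, y_true_f32, out=out_f64)  # safe cast float32 -> float64
assert out_f64.dtype == np.float64
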
+ loss = loss() + if loss.n_classes <= 2: + # generate a y_true in valid range + low, high = _inclusive_low_high(loss.interval_y_true, dtype=dtype_in) + y_true = np.array([0.5 * (high - low)], dtype=dtype_in) + raw_prediction = np.array([0.0], dtype=dtype_in) + else: + y_true = np.array([0], dtype=dtype_in) + raw_prediction = np.full( + shape=(1, loss.n_classes), fill_value=0.0, dtype=dtype_in + ) + + if sample_weight is not None: + sample_weight = np.array([2.0], dtype=dtype_in) + if out1 is not None: + out1 = np.empty_like(y_true, dtype=dtype_out) + if out2 is not None: + out2 = np.empty_like(raw_prediction, dtype=dtype_out) + + loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out1, + n_threads=n_threads, + ) + loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out2, + n_threads=n_threads, + ) + loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out1, + gradient=out2, + n_threads=n_threads, + ) + if out1 is not None and loss.n_classes >= 3: + out1 = np.empty_like(raw_prediction, dtype=dtype_out) + loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out1, + hessian=out2, + n_threads=n_threads, + ) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_same_as_C_functions(loss, sample_weight): + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=20, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=42, + ) + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + out_l1 = np.empty_like(y_true) + out_l2 = np.empty_like(y_true) + out_g1 = np.empty_like(raw_prediction) + out_g2 = np.empty_like(raw_prediction) + out_h1 = np.empty_like(raw_prediction) + out_h2 = np.empty_like(raw_prediction) + assert_allclose( + loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out_l1, + ), + loss._loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out_l2, + ), + ) + assert_allclose( + loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out_g1, + ), + loss._gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out_g2, + ), + ) + loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out_l1, + gradient=out_g1, + ) + loss._loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out_l2, + gradient=out_g2, + ) + assert_allclose(out_l1, out_l2) + assert_allclose(out_g1, out_g2) + loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out_g1, + hessian=out_h1, + ) + loss._gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out_g2, + hessian=out_h2, + ) + assert_allclose(out_g1, out_g2) + assert_allclose(out_h1, out_h2) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_gradients_are_the_same(loss, sample_weight): + # Test that loss and gradient are the same accross different functions + # Also test that output 
arguments contain correct result. + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=20, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=42, + ) + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + out_l1 = np.empty_like(y_true) + out_l2 = np.empty_like(y_true) + out_g1 = np.empty_like(raw_prediction) + out_g2 = np.empty_like(raw_prediction) + out_g3 = np.empty_like(raw_prediction) + out_h3 = np.empty_like(raw_prediction) + + l1 = loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out_l1, + ) + g1 = loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out_g1, + ) + l2, g2 = loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + loss=out_l2, + gradient=out_g2, + ) + g3, h3 = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out_g3, + hessian=out_h3, + ) + assert_allclose(l1, l2) + assert_array_equal(l1, out_l1) + assert np.shares_memory(l1, out_l1) + assert_array_equal(l2, out_l2) + assert np.shares_memory(l2, out_l2) + assert_allclose(g1, g2) + assert_allclose(g1, g3) + assert_array_equal(g1, out_g1) + assert np.shares_memory(g1, out_g1) + assert_array_equal(g2, out_g2) + assert np.shares_memory(g2, out_g2) + assert_array_equal(g3, out_g3) + assert np.shares_memory(g3, out_g3) + + if hasattr(loss, "gradient_proba"): + assert loss.n_classes >= 3 # only for CategoricalCrossEntropy + out_g4 = np.empty_like(raw_prediction) + out_proba = np.empty_like(raw_prediction) + g4, proba = loss.gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out_g4, + proba=out_proba, + ) + assert_allclose(g1, out_g4) + assert_allclose(g1, g4) + assert_allclose(proba, out_proba) + assert_allclose(np.sum(proba, axis=1), 1) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", ["ones", "random"]) +def test_sample_weight_multiplies_gradients(loss, sample_weight): + # Make sure that passing sample weights to the gradient and hessians + # computation methods is equivalent to multiplying by the weights. 
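
# A plain NumPy sketch of the property being tested, using the half squared
# error as a concrete case: its per-sample gradient is raw_prediction - y_true,
# so the gradient of the sample-weighted loss equals the unweighted gradient
# multiplied elementwise by the weights. The helper name weighted_loss is only
# for illustration.
import numpy as np

rng = np.random.RandomState(0)
y_true = rng.normal(size=5)
raw_prediction = rng.normal(size=5)
sample_weight = rng.uniform(1, 2, size=5)

grad = raw_prediction - y_true            # unweighted per-sample gradient

def weighted_loss(raw):
    # sample-weighted half squared error, kept per sample
    return sample_weight * 0.5 * (raw - y_true) ** 2

eps = 1e-6
grad_weighted = (
    weighted_loss(raw_prediction + eps) - weighted_loss(raw_prediction - eps)
) / (2 * eps)
np.testing.assert_allclose(grad_weighted, sample_weight * grad, rtol=1e-6, atol=1e-8)
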
+ + n_samples = 100 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=42, + ) + + if sample_weight == "ones": + sample_weight = np.ones(shape=n_samples, dtype=np.float64) + else: + rng = np.random.RandomState(42) + sample_weight = rng.normal(size=n_samples).astype(np.float64) + + baseline_prediction = loss.fit_intercept_only( + y_true=y_true, sample_weight=None + ) + + if loss.n_classes <= 2: + raw_prediction = np.zeros( + shape=(n_samples,), dtype=baseline_prediction.dtype + ) + else: + raw_prediction = np.zeros( + shape=(n_samples, loss.n_classes), dtype=baseline_prediction.dtype + ) + raw_prediction += baseline_prediction + + gradient, hessian = loss.gradient_hessian( + y_true=y_true, raw_prediction=raw_prediction, sample_weight=None + ) + + gradient_sw, hessian_sw = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + + if loss.n_classes <= 2: + assert_allclose(gradient * sample_weight, gradient_sw) + assert_allclose(hessian * sample_weight, hessian_sw) + else: + assert_allclose(gradient * sample_weight[:, None], gradient_sw) + assert_allclose(hessian * sample_weight[:, None], hessian_sw) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_of_perfect_prediction(loss, sample_weight): + # Test that loss of y_true = y_pred plus constant_to_optimal_zero sums up + # to zero. + if loss.n_classes <= 2: + # Use small values such that exp(value) is not nan. + raw_prediction = np.array([-10, -0.1, 0, 0.1, 3, 10]) + y_true = loss.inverse(raw_prediction) + else: + # CategoricalCrossEntropy + y_true = np.arange(loss.n_classes).astype(float) + # raw_prediction with entries -exp(10), but +exp(10) on the diagonal + # this is close enough to np.inf which would produce nan + raw_prediction = np.full( + shape=(loss.n_classes, loss.n_classes), + fill_value=-np.exp(10), + dtype=float, + ) + raw_prediction.flat[:: loss.n_classes + 1] = np.exp(10) + + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + loss_value = loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + constant_term = loss.constant_to_optimal_zero( + y_true=y_true, sample_weight=sample_weight + ) + # Comparing loss_value + constant_term to zero would result in large + # round-off errors. + assert_allclose(loss_value, -constant_term, atol=1e-14, rtol=1e-15) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_gradients_hessians_numerically(loss, sample_weight): + # Test that gradients are computed correctly by comparing to numerical + # derivatives of loss functions. + # Test that hessians are correct by numerical derivative of gradients. 
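
# A small self-contained check of the accuracy-4 central difference rule used
# by the numerical_derivative helper above, applied to f(x) = exp(x), whose
# exact derivative is exp(x). The helper name central_difference is only for
# illustration.
import numpy as np

def central_difference(func, x, eps):
    # (-f(x + 2h) + 8 f(x + h) - 8 f(x - h) + f(x - 2h)) / (12 h)
    return (
        -func(x + 2 * eps) + 8 * func(x + eps)
        - 8 * func(x - eps) + func(x - 2 * eps)
    ) / (12 * eps)

x = np.array([0.0, 0.5, 1.0])
np.testing.assert_allclose(
    central_difference(np.exp, x, eps=1e-4), np.exp(x), rtol=1e-9
)
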
+ n_samples = 20 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=42, + ) + + if sample_weight == "range": + sample_weight = np.linspace(1, y_true.shape[0], num=y_true.shape[0]) + + g, h = loss.gradient_hessian( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + + assert g.shape == raw_prediction.shape + assert h.shape == raw_prediction.shape + + if loss.n_classes <= 2: + + def loss_func(x): + return loss.loss( + y_true=y_true, raw_prediction=x, sample_weight=sample_weight, + ) + + g_numeric = numerical_derivative(loss_func, raw_prediction, eps=1e-6) + assert_allclose(g, g_numeric, rtol=5e-6, atol=1e-10) + + def grad_func(x): + return loss.gradient( + y_true=y_true, raw_prediction=x, sample_weight=sample_weight, + ) + + h_numeric = numerical_derivative(grad_func, raw_prediction, eps=1e-6) + if loss.approx_hessian: + assert np.all(h >= h_numeric) + else: + assert_allclose(h, h_numeric, rtol=5e-6, atol=1e-10) + else: + # For multiclass loss, we should only change the predictions of the + # class for which the derivative is taken for, e.g. offset[:, k] = eps + # for class k. + # As a softmax is computed, offsetting the whole array by a constant + # would have no effect on the probabilities, and thus on the loss. + for k in range(loss.n_classes): + + def loss_func(x): + raw = raw_prediction.copy() + raw[:, k] = x + return loss.loss( + y_true=y_true, + raw_prediction=raw, + sample_weight=sample_weight, + ) + + g_numeric = numerical_derivative( + loss_func, raw_prediction[:, k], eps=1e-5 + ) + assert_allclose(g[:, k], g_numeric, rtol=5e-6, atol=1e-10) + + def grad_func(x): + raw = raw_prediction.copy() + raw[:, k] = x + return loss.gradient( + y_true=y_true, + raw_prediction=raw, + sample_weight=sample_weight, + )[:, k] + + h_numeric = numerical_derivative( + grad_func, raw_prediction[:, k], eps=1e-6 + ) + if loss.approx_hessian: + assert np.all(h >= h_numeric) + else: + assert_allclose(h[:, k], h_numeric, rtol=5e-6, atol=1e-10) + + +@pytest.mark.parametrize( + "loss, x0, y_true", + [ + ("squared_error", -2.0, 42), + ("squared_error", 117.0, 1.05), + ("squared_error", 0.0, 0.0), + # The argmin of binary_crossentropy for y_true=0 and y_true=1 is resp. + # -inf and +inf due to logit, cf. "complete separation". Therefore, we + # use 0 < y_true < 1. + ("binary_crossentropy", 0.3, 0.1), + ("binary_crossentropy", -12, 0.2), + ("binary_crossentropy", 30, 0.9), + ("poisson_loss", 12.0, 1.0), + ("poisson_loss", 0.0, 2.0), + ("poisson_loss", -22.0, 10.0), + ], +) +@pytest.mark.skipif( + sp_version == parse_version("1.2.0"), + reason="bug in scipy 1.2.0, see scipy issue #9608", +) +@skip_if_32bit +def test_derivatives(loss, x0, y_true): + # Check that gradients are zero when the loss is minimized on a single + # value/sample using Halley's method with the first and second order + # derivatives computed by the Loss instance. + # Note that methods of Loss instances operate on arrays while the newton + # root finder expects a scalar or a one-element array for this purpose. + + loss = _LOSSES[loss](sample_weight=None) + y_true = np.array([y_true], dtype=np.float64) + x0 = np.array([x0], dtype=np.float64) + + def func(x: np.ndarray) -> np.ndarray: + # Add constant term such that loss has its minimum at zero, which is + # required by the newton method. 
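
# A standalone sketch of why the constant shift matters: scipy.optimize.newton
# looks for a zero of `func`, so the loss is shifted so that its minimum value
# is exactly zero and that zero coincides with the argmin. For the half squared
# error with y_true = 2 no shift is needed, and Halley's method (first and
# second derivatives supplied) recovers the argmin directly.
import numpy as np
from scipy.optimize import newton

y = 2.0
argmin = newton(
    lambda x: 0.5 * (x - y) ** 2,   # loss, minimum value already 0
    x0=-4.0,
    fprime=lambda x: x - y,         # first derivative (gradient)
    fprime2=lambda x: 1.0,          # second derivative (hessian)
    maxiter=100,
)
assert np.isclose(argmin, y)
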
+ return loss.loss( + y_true=y_true, raw_prediction=x + ) + loss.constant_to_optimal_zero(y_true=y_true) + + def fprime(x: np.ndarray) -> np.ndarray: + return loss.gradient(y_true=y_true, raw_prediction=x) + + def fprime2(x: np.ndarray) -> np.ndarray: + return loss.gradient_hessian(y_true=y_true, raw_prediction=x)[1] + + optimum = newton( + func, + x0=x0, + fprime=fprime, + fprime2=fprime2, + maxiter=100, + tol=5e-8, + ) + + # Need to ravel arrays because assert_allclose requires matching dimensions + y_true = y_true.ravel() + optimum = optimum.ravel() + assert_allclose(loss.inverse(optimum), y_true) + assert_allclose(func(optimum), 0, atol=1e-14) + assert_allclose( + loss.gradient(y_true=y_true, raw_prediction=optimum), 0, atol=5e-7 + ) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +def test_loss_intercept_only(loss, sample_weight): + # Test that fit_intercept_only returns the argmin of the loss and that the + # gradient is zero. + n_samples = 50 + if loss.n_classes <= 2: + y_true = loss.inverse(np.linspace(-4, 4, num=n_samples)) + else: + y_true = np.arange(n_samples).astype(float) % loss.n_classes + y_true[::5] = 0 # exceedance of class 0 + + if sample_weight == "range": + sample_weight = np.linspace(0.1, 2, num=n_samples) + + a = loss.fit_intercept_only(y_true=y_true, sample_weight=sample_weight) + + # find minimum by optimization + def fun(x): + if loss.n_classes <= 2: + raw_prediction = np.full(shape=(n_samples), fill_value=x) + else: + raw_prediction = np.ascontiguousarray( + np.broadcast_to(x, shape=(n_samples, loss.n_classes)) + ) + return loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + + if loss.n_classes <= 2: + opt = minimize_scalar(fun, tol=1e-7, options={"maxiter": 100}) + grad = loss.gradient( + y_true=y_true, + raw_prediction=np.full_like(y_true, a), + sample_weight=sample_weight, + ) + assert a.shape == tuple() # scalar + assert a.dtype == y_true.dtype + assert_all_finite(a) + a == approx(opt.x, rel=1e-7) + grad.sum() == approx(0, abs=1e-12) + else: + # constraint corresponds to sum(raw_prediction) = 0 + # without the constraint, we would need to apply + # loss.symmetrize_raw_prediction to opt.x before comparing + # TODO: With scipy 1.1.0, one could use + # LinearConstraint(np.ones((1, loss.n_classes)), 0, 0) + opt = minimize( + fun, + np.empty((loss.n_classes)), + tol=1e-13, + options={"maxiter": 100}, + method="SLSQP", + constraints={ + "type": "eq", + "fun": lambda x: np.ones((1, loss.n_classes)) @ x + }, + ) + grad = loss.gradient( + y_true=y_true, + raw_prediction=np.tile(a, (n_samples, 1)), + sample_weight=sample_weight, + ) + assert a.dtype == y_true.dtype + assert_all_finite(a) + assert_allclose(a, opt.x, rtol=5e-6, atol=1e-12) + assert_allclose(grad.sum(axis=0), 0, atol=1e-12) + + +@pytest.mark.parametrize( + "loss, func, link, low, high, random_dist", + [ + (HalfSquaredError, np.mean, "identity", None, None, "normal"), + (AbsoluteError, np.median, "identity", None, None, "normal"), + (HalfPoissonLoss, np.mean, np.log, 0, None, "poisson"), + (BinaryCrossEntropy, np.mean, logit, 0, 1, "binomial"), + ], +) +def test_specific_fit_intercept_only(loss, func, link, low, high, random_dist): + rng = np.random.RandomState(0) + loss = loss() + if random_dist == "binomial": + y_train = rng.binomial(1, 0.5, size=100) + else: + y_train = getattr(rng, random_dist)(size=100) + baseline_prediction = loss.fit_intercept_only(y_true=y_train) + # 
Make sure baseline prediction is the expected one, i.e. func, e.g. + # mean or median. + assert_all_finite(baseline_prediction) + if link == "identity": + assert baseline_prediction == approx(func(y_train)) + assert_allclose(loss.inverse(baseline_prediction), baseline_prediction) + else: + assert baseline_prediction == approx(link(func(y_train))) + + # Test baseline at boundary + if low is not None: + y_train.fill(low) + baseline_prediction = loss.fit_intercept_only(y_true=y_train) + assert_all_finite(baseline_prediction) + if high is not None: + y_train.fill(high) + baseline_prediction = loss.fit_intercept_only(y_true=y_train) + assert_all_finite(baseline_prediction) + + +def test_categorical_crossentropy_fit_intercept_only(): + rng = np.random.RandomState(0) + n_classes = 4 + loss = CategoricalCrossEntropy(n_classes=n_classes) + # Same logic as test_single_fit_intercept_only. Here inverse link function + # = softmax and link function = log - symmetry term + y_train = rng.randint(0, n_classes + 1, size=100).astype(np.float64) + baseline_prediction = loss.fit_intercept_only(y_true=y_train) + assert baseline_prediction.shape == (n_classes,) + p = np.zeros(n_classes, dtype=y_train.dtype) + for k in range(n_classes): + p[k] = (y_train == k).mean() + assert_allclose(baseline_prediction, np.log(p) - np.mean(np.log(p))) + assert_allclose(baseline_prediction[None, :], loss.link(p[None, :])) + + for y_train in (np.zeros(shape=10), np.ones(shape=10)): + y_train = y_train.astype(np.float64) + baseline_prediction = loss.fit_intercept_only(y_true=y_train) + assert baseline_prediction.dtype == y_train.dtype + assert_all_finite(baseline_prediction) + + +def test_binary_and_categorical_crossentropy(): + # Test that CategoricalCrossEntropy with n_classes = 2 is the same as + # BinaryCrossEntropy + rng = np.random.RandomState(0) + n_samples = 20 + bce = BinaryCrossEntropy() + cce = CategoricalCrossEntropy(n_classes=2) + y_train = rng.randint(0, 2, size=n_samples).astype(np.float64) + raw_prediction = rng.normal(size=n_samples) + raw_cce = np.empty((n_samples, 2)) + raw_cce[:, 0] = -0.5 * raw_prediction + raw_cce[:, 1] = 0.5 * raw_prediction + assert_allclose( + bce.loss(y_true=y_train, raw_prediction=raw_prediction), + cce.loss(y_true=y_train, raw_prediction=raw_cce) + ) From 25012bc76fb585202aa72ffa847e5467e3ec8631 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 2 Apr 2021 10:57:52 +0200 Subject: [PATCH 052/143] CLN replace deprecated np.int by int --- sklearn/_loss/loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 49d968b6bd2af..2ac09628fed79 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -824,7 +824,7 @@ def in_y_true_range(self, y): y : ndarray """ return is_in_interval_range(y, self.interval_y_true) and np.all( - y.astype(np.int) == y + y.astype(int) == y ) def fit_intercept_only(self, y_true, sample_weight=None): From d6c9307c7300df1b6aca17c57b43ff82217cd386 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 2 Apr 2021 11:05:37 +0200 Subject: [PATCH 053/143] DOC document default=1 for n_threads --- sklearn/_loss/loss.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 2ac09628fed79..37a04d88c5740 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -155,7 +155,7 @@ def loss( loss : None or C-contiguous array of shape (n_samples,) A location into which the result is stored. 
If None, a new array might be created. - n_threads : int + n_threads : int, default=1 Might use openmp thread parallelism. Returns @@ -203,7 +203,7 @@ def loss_gradient( of shape (n_samples, n_classes) A location into which the gradient is stored. If None, a new array might be created. - n_threads : int + n_threads : int, default=1 Might use openmp thread parallelism. Returns @@ -261,7 +261,7 @@ def gradient( of shape (n_samples, n_classes) A location into which the result is stored. If None, a new array might be created. - n_threads : int + n_threads : int, default=1 Might use openmp thread parallelism. Returns @@ -314,7 +314,7 @@ def gradient_hessian( of shape (n_samples, n_classes) A location into which the hessian is stored. If None, a new array might be created. - n_threads : int + n_threads : int, default=1 Might use openmp thread parallelism. Returns @@ -365,7 +365,7 @@ def __call__( Raw prediction values (in link space). sample_weight : None or C-contiguous array of shape (n_samples,) Sample weights. - n_threads : int + n_threads : int, default=1 Might use openmp thread parallelism. Returns @@ -868,7 +868,7 @@ def gradient_proba( proba : None or array of shape (n_samples, n_classes) A location into which the class probabilities are stored. If None, a new array might be created. - n_threads : int + n_threads : int, default=1 Might use openmp thread parallelism. Returns From 5682a7282485a3d57933603d9e83cb016743eac1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 14 Apr 2021 19:05:04 +0200 Subject: [PATCH 054/143] CLN comments and line wrapping --- sklearn/_loss/_loss.pyx | 127 +++++++++++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 35 deletions(-) diff --git a/sklearn/_loss/_loss.pyx b/sklearn/_loss/_loss.pyx index f94c4118119f9..2e10b1c6ec721 100644 --- a/sklearn/_loss/_loss.pyx +++ b/sklearn/_loss/_loss.pyx @@ -27,7 +27,8 @@ # input checking like None -> np.empty(). # # Note: We require 1-dim ndarrays to be contiguous. -# TODO: Use const memoryviews with Cython 3.0 where appropriate (# IN) +# TODO: Use const memoryviews with fused types with Cython 3.0 where +# appropriate (arguments marked by "# IN") cimport cython from cython.parallel import parallel, prange @@ -57,8 +58,9 @@ cdef inline double log1pexp(double x) nogil: cdef inline void sum_exp_minus_max( - const int i, Y_DTYPE_C[:, :] raw_prediction, # IN - Y_DTYPE_C *p # OUT + const int i, + Y_DTYPE_C[:, :] raw_prediction, # IN + Y_DTYPE_C *p # OUT ) nogil: # Store p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1 # p[-2] = max(raw_prediction_i_k, k = 0 to n_classes-1) @@ -70,7 +72,8 @@ cdef inline void sum_exp_minus_max( # - i needs to be passed (and stays constant) because otherwise Cython does # not generate optimal code, see # https://github.com/scikit-learn/scikit-learn/issues/17299 - # - We do not calculate p[k] = p[k] / sum_exps to save one loop over k. + # - We do not normalize p by calculating p[k] = p[k] / sum_exps. + # This helps to save one loop over k. 
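
# A plain NumPy sketch of the max-subtraction trick described in the comments
# above: the exponentials are taken of raw_prediction - max so they cannot
# overflow, and the division by their sum (which sum_exp_minus_max skips) can
# be applied later where it is actually needed.
import numpy as np

raw_prediction_row = np.array([1000.0, 1001.0, 1002.0])  # naive exp() would overflow
max_value = raw_prediction_row.max()
p = np.exp(raw_prediction_row - max_value)
sum_exps = p.sum()
softmax = p / sum_exps            # deferred normalization step
np.testing.assert_allclose(softmax.sum(), 1.0)
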
cdef: int k int n_classes = raw_prediction.shape[1] @@ -93,18 +96,23 @@ cdef inline void sum_exp_minus_max( # Single point inline C functions # ------------------------------------- # Half Squared Error -cdef inline double closs_half_squared_error(double y_true, double raw_prediction) nogil: +cdef inline double closs_half_squared_error( + double y_true, + double raw_prediction +) nogil: return 0.5 * (raw_prediction - y_true) * (raw_prediction - y_true) cdef inline double cgradient_half_squared_error( - double y_true, double raw_prediction + double y_true, + double raw_prediction ) nogil: return raw_prediction - y_true cdef inline double2 cgrad_hess_half_squared_error( - double y_true, double raw_prediction + double y_true, + double raw_prediction ) nogil: cdef double2 gh gh.val1 = raw_prediction - y_true # gradient @@ -113,16 +121,23 @@ cdef inline double2 cgrad_hess_half_squared_error( # Absolute Error -cdef inline double closs_absolute_error(double y_true, double raw_prediction) nogil: +cdef inline double closs_absolute_error( + double y_true, + double raw_prediction +) nogil: return fabs(raw_prediction - y_true) -cdef inline double cgradient_absolute_error(double y_true, double raw_prediction) nogil: +cdef inline double cgradient_absolute_error( + double y_true, + double raw_prediction +) nogil: return 1. if raw_prediction > y_true else -1. cdef inline double2 cgrad_hess_absolute_error( - double y_true, double raw_prediction + double y_true, + double raw_prediction ) nogil: cdef double2 gh # Note that exact hessian = 0 almost everywhere. Optimization routines like @@ -134,20 +149,26 @@ cdef inline double2 cgrad_hess_absolute_error( # Quantile Loss / Pinball Loss cdef inline double closs_pinball_loss( - double y_true, double raw_prediction, double quantile + double y_true, + double raw_prediction, + double quantile ) nogil: return (quantile * (y_true - raw_prediction) if y_true >= raw_prediction else (1. - quantile) * (raw_prediction - y_true)) cdef inline double cgradient_pinball_loss( - double y_true, double raw_prediction, double quantile + double y_true, + double raw_prediction, + double quantile ) nogil: return -quantile if y_true >=raw_prediction else 1. - quantile cdef inline double2 cgrad_hess_pinball_loss( - double y_true, double raw_prediction, double quantile + double y_true, + double raw_prediction, + double quantile ) nogil: cdef double2 gh # Note that exact hessian = 0 almost everywhere. 
Optimization routines like @@ -158,24 +179,36 @@ cdef inline double2 cgrad_hess_pinball_loss( # Half Poisson Deviance with Log-Link, dropping constant terms -cdef inline double closs_half_poisson(double y_true, double raw_prediction) nogil: +cdef inline double closs_half_poisson( + double y_true, + double raw_prediction +) nogil: return exp(raw_prediction) - y_true * raw_prediction -cdef inline double cgradient_half_poisson(double y_true, double raw_prediction) nogil: +cdef inline double cgradient_half_poisson( + double y_true, + double raw_prediction +) nogil: # y_pred - y_true return exp(raw_prediction) - y_true -cdef inline double2 closs_grad_half_poisson(double y_true, double raw_prediction) nogil: +cdef inline double2 closs_grad_half_poisson( + double y_true, + double raw_prediction +) nogil: cdef double2 lg - lg.val2 = exp(raw_prediction) + lg.val2 = exp(raw_prediction) # used as temporary lg.val1 = lg.val2 - y_true * raw_prediction # loss lg.val2 -= y_true # gradient return lg -cdef inline double2 cgrad_hess_half_poisson(double y_true, double raw_prediction) nogil: +cdef inline double2 cgrad_hess_half_poisson( + double y_true, + double raw_prediction +) nogil: cdef double2 gh gh.val2 = exp(raw_prediction) # hessian gh.val1 = gh.val2 - y_true # gradient @@ -183,25 +216,37 @@ cdef inline double2 cgrad_hess_half_poisson(double y_true, double raw_prediction # Half Gamma Deviance with Log-Link, dropping constant terms -cdef inline double closs_half_gamma(double y_true, double raw_prediction) nogil: +cdef inline double closs_half_gamma( + double y_true, + double raw_prediction +) nogil: return raw_prediction + y_true * exp(-raw_prediction) -cdef inline double cgradient_half_gamma(double y_true, double raw_prediction) nogil: +cdef inline double cgradient_half_gamma( + double y_true, + double raw_prediction +) nogil: return 1. - y_true * exp(-raw_prediction) -cdef inline double2 closs_grad_half_gamma(double y_true, double raw_prediction) nogil: +cdef inline double2 closs_grad_half_gamma( + double y_true, + double raw_prediction +) nogil: cdef double2 lg - lg.val2 = exp(-raw_prediction) + lg.val2 = exp(-raw_prediction) # used as temporary lg.val1 = raw_prediction + y_true * lg.val2 # loss lg.val2 = 1. - y_true * lg.val2 # gradient return lg -cdef inline double2 cgrad_hess_half_gamma(double y_true, double raw_prediction) nogil: +cdef inline double2 cgrad_hess_half_gamma( + double y_true, + double raw_prediction +) nogil: cdef double2 gh - gh.val2 = exp(-raw_prediction) + gh.val2 = exp(-raw_prediction) # used as temporary gh.val1 = 1. - y_true * gh.val2 # gradient gh.val2 *= y_true # hessian return gh @@ -210,7 +255,9 @@ cdef inline double2 cgrad_hess_half_gamma(double y_true, double raw_prediction) # Half Tweedie Deviance with Log-Link, dropping constant terms # Note that by dropping constants this is no longer smooth in parameter power. 
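
# A hedged NumPy sketch of the power == 0 branch visible just below: with a
# log link, the half Tweedie deviance at power 0 is the half squared error
# evaluated at y_pred = exp(raw_prediction), so by the chain rule its gradient
# with respect to raw_prediction is (exp(raw) - y_true) * exp(raw). The helper
# name half_tweedie_power_zero is only for illustration.
import numpy as np

def half_tweedie_power_zero(y_true, raw_prediction):
    return 0.5 * (np.exp(raw_prediction) - y_true) ** 2

y, raw, eps = 3.0, 0.7, 1e-6
grad_numeric = (
    half_tweedie_power_zero(y, raw + eps) - half_tweedie_power_zero(y, raw - eps)
) / (2 * eps)
assert np.isclose(grad_numeric, (np.exp(raw) - y) * np.exp(raw))
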
cdef inline double closs_half_tweedie( - double y_true, double raw_prediction, double power + double y_true, + double raw_prediction, + double power ) nogil: if power == 0.: return closs_half_squared_error(y_true, exp(raw_prediction)) @@ -224,7 +271,9 @@ cdef inline double closs_half_tweedie( cdef inline double cgradient_half_tweedie( - double y_true, double raw_prediction, double power + double y_true, + double raw_prediction, + double power ) nogil: cdef double exp1 if power == 0.: @@ -240,7 +289,9 @@ cdef inline double cgradient_half_tweedie( cdef inline double2 closs_grad_half_tweedie( - double y_true, double raw_prediction, double power + double y_true, + double raw_prediction, + double power ) nogil: cdef double2 lg cdef double exp1, exp2 @@ -261,7 +312,9 @@ cdef inline double2 closs_grad_half_tweedie( cdef inline double2 cgrad_hess_half_tweedie( - double y_true, double raw_prediction, double power + double y_true, + double raw_prediction, + double power ) nogil: cdef double2 gh cdef double exp1, exp2 @@ -283,14 +336,16 @@ cdef inline double2 cgrad_hess_half_tweedie( # Binary cross entropy aka log-loss cdef inline double closs_binary_crossentropy( - double y_true, double raw_prediction + double y_true, + double raw_prediction ) nogil: # log1p(exp(raw_prediction)) - y_true * raw_prediction return log1pexp(raw_prediction) - y_true * raw_prediction cdef inline double cgradient_binary_crossentropy( - double y_true, double raw_prediction + double y_true, + double raw_prediction ) nogil: # y_pred - y_true = expit(raw_prediction) - y_true # Numerically more stable, see @@ -314,18 +369,19 @@ cdef inline double cgradient_binary_crossentropy( cdef inline double2 closs_grad_binary_crossentropy( - double y_true, double raw_prediction + double y_true, + double raw_prediction ) nogil: cdef double2 lg if raw_prediction <= 0: - lg.val2 = exp(raw_prediction) + lg.val2 = exp(raw_prediction) # used as temporary if raw_prediction <= -37: lg.val1 = lg.val2 - y_true * raw_prediction # loss else: lg.val1 = log1p(lg.val2) - y_true * raw_prediction # loss lg.val2 = ((1 - y_true) * lg.val2 - y_true) / (1 + lg.val2) # gradient else: - lg.val2 = exp(-raw_prediction) + lg.val2 = exp(-raw_prediction) # used as temporary if raw_prediction <= 18: # log1p(exp(x)) = log(1 + exp(x)) = x + log1p(exp(-x)) lg.val1 = log1p(lg.val2) + (1 - y_true) * raw_prediction # loss @@ -336,13 +392,14 @@ cdef inline double2 closs_grad_binary_crossentropy( cdef inline double2 cgrad_hess_binary_crossentropy( - double y_true, double raw_prediction + double y_true, + double raw_prediction ) nogil: # with y_pred = expit(raw) # hessian = y_pred * (1 - y_pred) = exp(raw) / (1 + exp(raw))**2 # = exp(-raw) / (1 + exp(-raw))**2 cdef double2 gh - gh.val2 = exp(-raw_prediction) + gh.val2 = exp(-raw_prediction) # used as temporary gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2) # gradient gh.val2 = gh.val2 / (1 + gh.val2)**2 # hessian return gh From 6ace46210a12bc273662bb2724feb8cd8ff5b915 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 14 Apr 2021 19:16:18 +0200 Subject: [PATCH 055/143] CLN comments and doc --- sklearn/_loss/_loss.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/_loss/_loss.pyx b/sklearn/_loss/_loss.pyx index 2e10b1c6ec721..c57965cb5f59c 100644 --- a/sklearn/_loss/_loss.pyx +++ b/sklearn/_loss/_loss.pyx @@ -358,7 +358,7 @@ cdef inline double cgradient_binary_crossentropy( # return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) # Note that optimal speed would 
be achieved, at the cost of precision, by # return expit(raw_prediction) - y_true - # i.e. no if else, and an own inline implemention of expit instead of + # i.e. no "if else" and an own inline implemention of expit instead of # from scipy.special.cython_special cimport expit # The case distinction raw_prediction < 0 in the stable implementation # does not provide significant better precision. Therefore we go without @@ -465,7 +465,7 @@ cdef class cLossFunction: Returns ------- - grad_hess_pair + double2 Gradient and hessian of the loss function w.r.t. `raw_prediction`. """ pass @@ -495,7 +495,7 @@ cdef class cLossFunction: loss : array of shape (n_samples,) A location into which the result is stored. n_threads : int - Might use openmp thread parallelism. + Number of threads used by OpenMP (if any). Returns ------- @@ -525,7 +525,7 @@ cdef class cLossFunction: gradient : array of shape (n_samples,) A location into which the result is stored. n_threads : int - Might use openmp thread parallelism. + Number of threads used by OpenMP (if any). Returns ------- @@ -558,7 +558,7 @@ cdef class cLossFunction: gradient : array of shape (n_samples,) A location into which the gradient is stored. n_threads : int - Might use openmp thread parallelism. + Number of threads used by OpenMP (if any). Returns ------- @@ -598,7 +598,7 @@ cdef class cLossFunction: hessian : array of shape (n_samples,) A location into which the hessian is stored. n_threads : int - Might use openmp thread parallelism. + Number of threads used by OpenMP (if any). Returns ------- From 1a5ae1c260d7e4b726699150c4eb3610b58adcaa Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 14 Apr 2021 19:17:49 +0200 Subject: [PATCH 056/143] BUG remove useless line of code --- sklearn/_loss/_loss.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/_loss/_loss.pyx b/sklearn/_loss/_loss.pyx index c57965cb5f59c..59a46dcab522b 100644 --- a/sklearn/_loss/_loss.pyx +++ b/sklearn/_loss/_loss.pyx @@ -1601,7 +1601,6 @@ cdef class cCategoricalCrossEntropy(cLossFunction): for i in prange(n_samples, schedule='static'): sum_exp_minus_max(i, raw_prediction, p) - max_value = raw_prediction[i, 0] max_value = p[n_classes] # p[-2] sum_exps = p[n_classes + 1] # p[-1] loss[i] = log(sum_exps) + max_value From af93e7ba77eb2a46d8ab8862ea22db86c8642a51 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 14 Apr 2021 19:21:52 +0200 Subject: [PATCH 057/143] CLN remove line that was commented out --- sklearn/_loss/setup.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/_loss/setup.py b/sklearn/_loss/setup.py index 23d35439885ba..227eee5e47a64 100644 --- a/sklearn/_loss/setup.py +++ b/sklearn/_loss/setup.py @@ -4,13 +4,9 @@ def configuration(parent_package="", top_path=None): config = Configuration("_loss", parent_package, top_path) - config.add_extension( "_loss", sources=["_loss.pyx"], include_dirs=[numpy.get_include()] ) - - # config.add_subpackage("tests") - return config From 9ed2096e3643935f3f7e9e35fe2201cc7b2c155d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 14 Apr 2021 19:41:16 +0200 Subject: [PATCH 058/143] CLN nitpicks in comments and docstrings --- sklearn/_loss/loss.py | 4 +--- sklearn/_loss/tests/test_link.py | 6 +++--- sklearn/_loss/tests/test_loss.py | 6 ++---- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 37a04d88c5740..4a3e0dbdcde5e 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -466,9 +466,6 @@ def gradient( 
gradient=None, n_threads=1, ): - # easier in numpy - # gradient = raw_prediction - y_true is easier in numpy - # Be graceful to shape (n_samples, 1) -> (n_samples,) if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) @@ -479,6 +476,7 @@ def gradient( ): gradient = gradient.squeeze(1) + # gradient = raw_prediction - y_true is easier in numpy gradient = np.subtract(raw_prediction, y_true, out=gradient) if sample_weight is None: return gradient diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index a8dbbff511373..b049f5ac637d6 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -54,7 +54,7 @@ def test_is_in_range(interval): @pytest.mark.parametrize("link", LINK_FUNCTIONS) def test_link_inverse_identity(link): - # Test that link of inverse gives idendity. + # Test that link of inverse gives identity. rng = np.random.RandomState(42) link = link() n_samples, n_classes = 100, None @@ -67,7 +67,7 @@ def test_link_inverse_identity(link): raw_prediction = link.symmetrize_raw_prediction(raw_prediction) else: # So far, the valid interval of raw_prediction is (-inf, inf) and - # we do not need to distinguish + # we do not need to distinguish. raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples)) assert_allclose(link.link(link.inverse(raw_prediction)), raw_prediction) @@ -90,7 +90,7 @@ def test_link_out_argument(link): raw_prediction = link.symmetrize_raw_prediction(raw_prediction) else: # So far, the valid interval of raw_prediction is (-inf, inf) and - # we do not need to distinguish + # we do not need to distinguish. raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples)) y_pred = link.inverse(raw_prediction, out=None) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 5f7e001f2d6de..68832a3cdc273 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -87,13 +87,11 @@ def random_y_true_raw_prediction( def numerical_derivative(func, x, eps): - """Helper function for numerical (first) derivatives. - + """Helper function for numerical (first) derivatives.""" # For numerical derivatives, see # https://en.wikipedia.org/wiki/Numerical_differentiation # https://en.wikipedia.org/wiki/Finite_difference_coefficient # We use central finite differences of accuracy 4. - """ h = np.full_like(x, fill_value=eps) f_minus_2h = func(x - 2 * h) f_minus_1h = func(x - h) @@ -348,7 +346,7 @@ def test_loss_same_as_C_functions(loss, sample_weight): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_loss_gradients_are_the_same(loss, sample_weight): - # Test that loss and gradient are the same accross different functions + # Test that loss and gradient are the same across different functions. # Also test that output arguments contain correct result. 
y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, From b5e822414c1e1600a160ca28a7b89a032ac892cc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 14 Apr 2021 19:50:23 +0200 Subject: [PATCH 059/143] ENH set NPY_NO_DEPRECATED_API --- sklearn/_loss/setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/_loss/setup.py b/sklearn/_loss/setup.py index 227eee5e47a64..ad380fc8e429b 100644 --- a/sklearn/_loss/setup.py +++ b/sklearn/_loss/setup.py @@ -5,7 +5,10 @@ def configuration(parent_package="", top_path=None): config = Configuration("_loss", parent_package, top_path) config.add_extension( - "_loss", sources=["_loss.pyx"], include_dirs=[numpy.get_include()] + "_loss", + sources=["_loss.pyx"], + include_dirs=[numpy.get_include()], + define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_13_API_VERSION")], ) return config From c5c0d55b17244070aecc52bae031f234a4d8fc57 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 15 Apr 2021 08:36:43 +0200 Subject: [PATCH 060/143] MNT change NPY_1_13_API_VERSION to NPY_1_7_API_VERSION --- sklearn/_loss/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/setup.py b/sklearn/_loss/setup.py index ad380fc8e429b..63546bd29c90b 100644 --- a/sklearn/_loss/setup.py +++ b/sklearn/_loss/setup.py @@ -8,7 +8,7 @@ def configuration(parent_package="", top_path=None): "_loss", sources=["_loss.pyx"], include_dirs=[numpy.get_include()], - define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_13_API_VERSION")], + define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], ) return config From d7b105e7ded9950b6113a46f44010b26e9ef2825 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 15 Apr 2021 08:58:10 +0200 Subject: [PATCH 061/143] MNT comment out NPY_NO_DEPRECATED_API --- sklearn/_loss/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/setup.py b/sklearn/_loss/setup.py index 63546bd29c90b..c7f11afe9e30a 100644 --- a/sklearn/_loss/setup.py +++ b/sklearn/_loss/setup.py @@ -8,7 +8,7 @@ def configuration(parent_package="", top_path=None): "_loss", sources=["_loss.pyx"], include_dirs=[numpy.get_include()], - define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], + # define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], ) return config From 4b4829440e1fd0e3696b591fe3296088ac991441 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 8 May 2021 15:21:35 +0200 Subject: [PATCH 062/143] TST restructure domain test cases --- sklearn/_loss/tests/test_loss.py | 82 ++++++++++++++++---------------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 68832a3cdc273..8ca75d9a966e2 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -135,26 +135,46 @@ def test_loss_boundary(loss): loss.loss(y_true=y_true, raw_prediction=raw_prediction) +# Fixture to test valid value ranges. 
+Y_COMMON_PARAMS = [ + # (loss, [y success], [y fail]) + (HalfSquaredError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (AbsoluteError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (PinballLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), + (HalfPoissonLoss(), [0.1, 100], [-np.inf, -3, -0.1, np.inf]), + (HalfGammaLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLoss(power=-3), [0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLoss(power=0), [0.1, 100], [-np.inf, np.inf]), + (HalfTweedieLoss(power=1.5), [0.1, 100], [-np.inf, -3, -0.1, np.inf]), + (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), + (BinaryCrossEntropy(), [0.1, 0.5, 0.9], [-np.inf, -1, 2, np.inf]), + (CategoricalCrossEntropy(), [], [-np.inf, -1, 1.1, np.inf]), +] +# y_pred and y_true do not always have the same domain (valid value range). +# Hence, we define extra sets of parameters for each of them. +Y_TRUE_PARAMS = [ + # (loss, [y success], [y fail]) + (HalfPoissonLoss(), [0], []), + (HalfTweedieLoss(power=-3), [-100, -0.1, 0], []), + (HalfTweedieLoss(power=0), [-100, 0], []), + (HalfTweedieLoss(power=1.5), [0], []), + (BinaryCrossEntropy(), [0, 1], []), + (CategoricalCrossEntropy(), [0.0, 1.0, 2], []), +] +Y_PRED_PARAMS = [ + # (loss, [y success], [y fail]) + (HalfPoissonLoss(), [], [0]), + (HalfTweedieLoss(power=-3), [], [-3, -0.1, 0]), + (HalfTweedieLoss(power=0), [], [-3, -0.1, 0]), + (HalfTweedieLoss(power=1.5), [], [0]), + (BinaryCrossEntropy(), [], [0, 1]), + (CategoricalCrossEntropy(), [0.1, 0.5], [0, 1]), +] + + @pytest.mark.parametrize( - "loss, y_true_success, y_true_fail", - [ - (HalfSquaredError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - (AbsoluteError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - (PinballLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - (HalfPoissonLoss(), [0, 0.1, 100], [-np.inf, -3, -0.1, np.inf]), - (HalfGammaLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - (HalfTweedieLoss(power=-3), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - (HalfTweedieLoss(power=0), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - ( - HalfTweedieLoss(power=1.5), - [0, 0.1, 100], - [-np.inf, -3, -0.1, np.inf], - ), - (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - (BinaryCrossEntropy(), [0, 0.5, 1], [-np.inf, -1, 2, np.inf]), - (CategoricalCrossEntropy(), [0.0, 1.0, 2], [-np.inf, -1, 1.1, np.inf]), - ], + "loss, y_true_success, y_true_fail", Y_COMMON_PARAMS + Y_TRUE_PARAMS ) def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): # Test boundaries of y_true for loss functions. 
@@ -165,29 +185,7 @@ def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): @pytest.mark.parametrize( - "loss, y_pred_success, y_pred_fail", - [ - (HalfSquaredError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - (AbsoluteError(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - (PinballLoss(), [-100, 0, 0.1, 100], [-np.inf, np.inf]), - (HalfPoissonLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - (HalfGammaLoss(), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - ( - HalfTweedieLoss(power=-3), - [0.1, 100], - [-np.inf, -3, -0.1, 0, np.inf], - ), - (HalfTweedieLoss(power=0), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - ( - HalfTweedieLoss(power=1.5), - [0.1, 100], - [-np.inf, -3, -0.1, 0, np.inf], - ), - (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - (BinaryCrossEntropy(), [0.1, 0.5], [-np.inf, 0, 1, np.inf]), - (CategoricalCrossEntropy(), [0.1, 0.5], [-np.inf, 0, 1, np.inf]), - ], + "loss, y_pred_success, y_pred_fail", Y_COMMON_PARAMS + Y_PRED_PARAMS ) def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): # Test boundaries of y_pred for loss functions. From a4572a4f27c05b586da2288a0a7a04d2ff305bd4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 9 May 2021 18:36:03 +0200 Subject: [PATCH 063/143] DOC add losses to API reference --- doc/modules/classes.rst | 24 ++++++++++++++++++++++++ sklearn/_loss/__init__.py | 4 ++++ 2 files changed, 28 insertions(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 3848a189c35d4..1560af6827553 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1646,3 +1646,27 @@ Recently deprecated To be removed in 1.0 (renaming of 0.25) --------------------------------------- + +.. _loss_function_ref: + +:mod:`sklearn._loss`: Non-public Loss Function Classes +=========================================================== + +.. automodule:: sklearn._loss + :no-members: + :no-inherited-members: + +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + _loss.HalfSquaredError + _loss.AbsoluteError + _loss.PinballLoss + _loss.HalfPoissonLoss + _loss.HalfGammaLoss + _loss.HalfTweedieLoss + _loss.BinaryCrossEntropy + _loss.CategoricalCrossEntropy diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py index e69de29bb2d1d..bb71abe0ad48a 100644 --- a/sklearn/_loss/__init__.py +++ b/sklearn/_loss/__init__.py @@ -0,0 +1,4 @@ +""" +The :mod:`sklearn._loss` module includes loss function classes suitable for +fitting classification and regression tasks. +""" From 64964d686acaee8d3b219ecc0edc5a49e3adb4a4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 9 May 2021 20:22:43 +0200 Subject: [PATCH 064/143] MNT add classes to __init__ --- sklearn/_loss/__init__.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py index bb71abe0ad48a..282a3df9bdb93 100644 --- a/sklearn/_loss/__init__.py +++ b/sklearn/_loss/__init__.py @@ -2,3 +2,26 @@ The :mod:`sklearn._loss` module includes loss function classes suitable for fitting classification and regression tasks. 
""" + +from ._loss import ( + HalfSquaredError, + AbsoluteError, + PinballLoss, + HalfPoissonLoss, + HalfGammaLoss, + HalfTweedieLoss, + BinaryCrossEntropy, + CategoricalCrossEntropy, +) + + +__all__ = [ + "HalfSquaredError", + "AbsoluteError", + "PinballLoss", + "HalfPoissonLoss", + "HalfGammaLoss", + "HalfTweedieLoss", + "BinaryCrossEntropy", + "CategoricalCrossEntropy", +] From 43b62693333dde83d2ffc95077b2f2486cbbac7a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 9 May 2021 20:54:28 +0200 Subject: [PATCH 065/143] CLN fix import --- sklearn/_loss/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py index 282a3df9bdb93..ae7bac5f1a8d8 100644 --- a/sklearn/_loss/__init__.py +++ b/sklearn/_loss/__init__.py @@ -3,7 +3,7 @@ fitting classification and regression tasks. """ -from ._loss import ( +from .loss import ( HalfSquaredError, AbsoluteError, PinballLoss, From f90049bb1dd744a73011e304b285ac19fb7e2f15 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 9 May 2021 22:05:16 +0200 Subject: [PATCH 066/143] DOC minor docstring changes --- sklearn/_loss/loss.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 4a3e0dbdcde5e..a636d9fa29c6a 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -560,8 +560,8 @@ class PinballLoss(IdentityLink, BaseLoss, cPinballLoss): loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i) rho_{quantile}(u) = u * (quantile - 1_{u<0}) - = -u (1 - quantile) if u < 0 - u * quantile if u >= 0 + = -u *(1 - quantile) if u < 0 + u * quantile if u >= 0 Note: 2 * PinballLoss(quantile=0.5) equals AbsoluteError(). @@ -649,9 +649,9 @@ class HalfGammaLoss(LogLink, BaseLoss, cHalfGammaLoss): loss(x_i) = log(exp(raw_prediction_i)/y_true_i) + y_true/exp(raw_prediction_i) - 1 - Half the Gamma deviance is actually proportional the negative log - likelihood up constant terms (not involving raw_prediction) and simplifies - the computation of the gradients. + Half the Gamma deviance is actually proportional to the negative log + likelihood up to constant terms (not involving raw_prediction) and + simplifies the computation of the gradients. We also skip the constant term `-log(y_true_i) - 1`. 
""" From 47691ffc76e5a520c7ddbc4d8f36e466075ad7fa Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 10 May 2021 17:25:58 +0200 Subject: [PATCH 067/143] TST prefer docstring over comment --- sklearn/_loss/tests/test_loss.py | 117 +++++++++++++++++++------------ 1 file changed, 71 insertions(+), 46 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 8ca75d9a966e2..26c936e428d97 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -7,9 +7,8 @@ minimize_scalar, newton, ) -from scipy.special import logit -from sklearn._loss.link import _inclusive_low_high +from sklearn._loss.link import _inclusive_low_high, IdentityLink from sklearn._loss.loss import ( _LOSSES, AbsoluteError, @@ -104,6 +103,7 @@ def numerical_derivative(func, x, eps): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) def test_loss_boundary(loss): + """Test interval ranges of y_true and y_pred in losses.""" # make sure low and high are always within the interval, used for linspace if loss.n_classes is None or loss.n_classes <= 2: low, high = _inclusive_low_high(loss.interval_y_true) @@ -177,7 +177,7 @@ def test_loss_boundary(loss): "loss, y_true_success, y_true_fail", Y_COMMON_PARAMS + Y_TRUE_PARAMS ) def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): - # Test boundaries of y_true for loss functions. + """Test boundaries of y_true for loss functions.""" for y in y_true_success: assert loss.in_y_true_range(np.array([y])) for y in y_true_fail: @@ -188,7 +188,7 @@ def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): "loss, y_pred_success, y_pred_fail", Y_COMMON_PARAMS + Y_PRED_PARAMS ) def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): - # Test boundaries of y_pred for loss functions. + """Test boundaries of y_pred for loss functions.""" for y in y_pred_success: assert loss.in_y_pred_range(np.array([y])) for y in y_pred_fail: @@ -205,8 +205,11 @@ def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): def test_loss_dtype( loss, dtype_in, dtype_out, sample_weight, out1, out2, n_threads ): - # Test that loss accepts if all input arrays are either all float32 or all - # float64, and all output arrays are either all float32 or all float64. + """Test acceptance of dtypes in loss functions. + + Check that loss accepts if all input arrays are either all float32 or all + float64, and all output arrays are either all float32 or all float64. + """ loss = loss() if loss.n_classes <= 2: # generate a y_true in valid range @@ -263,6 +266,7 @@ def test_loss_dtype( @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_loss_same_as_C_functions(loss, sample_weight): + """Test that Python and Cython functions return same results.""" y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, n_samples=20, @@ -344,8 +348,10 @@ def test_loss_same_as_C_functions(loss, sample_weight): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_loss_gradients_are_the_same(loss, sample_weight): - # Test that loss and gradient are the same across different functions. - # Also test that output arguments contain correct result. + """Test that loss and gradient are the same across different functions. + + Also test that output arguments contain correct result. 
+ """ y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, n_samples=20, @@ -423,9 +429,11 @@ def test_loss_gradients_are_the_same(loss, sample_weight): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", ["ones", "random"]) def test_sample_weight_multiplies_gradients(loss, sample_weight): - # Make sure that passing sample weights to the gradient and hessians - # computation methods is equivalent to multiplying by the weights. + """Test sample weights in gradients and hessians. + Make sure that passing sample weights to the gradient and hessians + computation methods is equivalent to multiplying by the weights. + """ n_samples = 100 y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, @@ -476,8 +484,11 @@ def test_sample_weight_multiplies_gradients(loss, sample_weight): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_loss_of_perfect_prediction(loss, sample_weight): - # Test that loss of y_true = y_pred plus constant_to_optimal_zero sums up - # to zero. + """Test value of perfect predictions. + + Loss of y_pred = y_true plus constant_to_optimal_zero should sums up to + zero. + """ if loss.n_classes <= 2: # Use small values such that exp(value) is not nan. raw_prediction = np.array([-10, -0.1, 0, 0.1, 3, 10]) @@ -513,9 +524,11 @@ def test_loss_of_perfect_prediction(loss, sample_weight): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_gradients_hessians_numerically(loss, sample_weight): - # Test that gradients are computed correctly by comparing to numerical - # derivatives of loss functions. - # Test that hessians are correct by numerical derivative of gradients. + """Test gradients and hessians with numerical derivatives. + + Gradient should equal the numerical derivatives of the loss function. + Hessians should equal the numerical derivatives of gradients. + """ n_samples = 20 y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, @@ -620,19 +633,23 @@ def grad_func(x): ) @skip_if_32bit def test_derivatives(loss, x0, y_true): - # Check that gradients are zero when the loss is minimized on a single - # value/sample using Halley's method with the first and second order - # derivatives computed by the Loss instance. - # Note that methods of Loss instances operate on arrays while the newton - # root finder expects a scalar or a one-element array for this purpose. + """Test that gradients are zero at the minimum of the loss. + We check this on a single value/sample using Halley's method with the + first and second order derivatives computed by the Loss instance. + Note that methods of Loss instances operate on arrays while the newton + root finder expects a scalar or a one-element array for this purpose. + """ loss = _LOSSES[loss](sample_weight=None) y_true = np.array([y_true], dtype=np.float64) x0 = np.array([x0], dtype=np.float64) def func(x: np.ndarray) -> np.ndarray: - # Add constant term such that loss has its minimum at zero, which is - # required by the newton method. + """Compute loss plus constant term. + + The constant term is such that the minimum function value is zero, + which is required by the Newton method. 
+ """ return loss.loss( y_true=y_true, raw_prediction=x ) + loss.constant_to_optimal_zero(y_true=y_true) @@ -652,7 +669,8 @@ def fprime2(x: np.ndarray) -> np.ndarray: tol=5e-8, ) - # Need to ravel arrays because assert_allclose requires matching dimensions + # Need to ravel arrays because assert_allclose requires matching + # dimensions. y_true = y_true.ravel() optimum = optimum.ravel() assert_allclose(loss.inverse(optimum), y_true) @@ -665,8 +683,10 @@ def fprime2(x: np.ndarray) -> np.ndarray: @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_loss_intercept_only(loss, sample_weight): - # Test that fit_intercept_only returns the argmin of the loss and that the - # gradient is zero. + """Test that fit_intercept_only returns the argmin of the loss. + + Also test that the gradient is zero at the minimum. + """ n_samples = 50 if loss.n_classes <= 2: y_true = loss.inverse(np.linspace(-4, 4, num=n_samples)) @@ -734,15 +754,20 @@ def fun(x): @pytest.mark.parametrize( - "loss, func, link, low, high, random_dist", + "loss, func, random_dist", [ - (HalfSquaredError, np.mean, "identity", None, None, "normal"), - (AbsoluteError, np.median, "identity", None, None, "normal"), - (HalfPoissonLoss, np.mean, np.log, 0, None, "poisson"), - (BinaryCrossEntropy, np.mean, logit, 0, 1, "binomial"), + (HalfSquaredError, np.mean, "normal"), + (AbsoluteError, np.median, "normal"), + (HalfPoissonLoss, np.mean, "poisson"), + (BinaryCrossEntropy, np.mean, "binomial"), ], ) -def test_specific_fit_intercept_only(loss, func, link, low, high, random_dist): +def test_specific_fit_intercept_only(loss, func, random_dist): + """Test that fit_intercept_only returns the correct functional. + + We test the functional for specific, meaningful distributions, e.g. + squared error estimates the expectation of a probability distribution. + """ rng = np.random.RandomState(0) loss = loss() if random_dist == "binomial": @@ -750,32 +775,33 @@ def test_specific_fit_intercept_only(loss, func, link, low, high, random_dist): else: y_train = getattr(rng, random_dist)(size=100) baseline_prediction = loss.fit_intercept_only(y_true=y_train) - # Make sure baseline prediction is the expected one, i.e. func, e.g. - # mean or median. + # Make sure baseline prediction is the expected functional=func, e.g. mean + # or median. 
assert_all_finite(baseline_prediction) - if link == "identity": - assert baseline_prediction == approx(func(y_train)) - assert_allclose(loss.inverse(baseline_prediction), baseline_prediction) - else: - assert baseline_prediction == approx(link(func(y_train))) + assert baseline_prediction == approx(loss.link(func(y_train))) + if isinstance(loss, IdentityLink): + assert_allclose( + loss.inverse(baseline_prediction), baseline_prediction + ) # Test baseline at boundary - if low is not None: - y_train.fill(low) + if loss.interval_y_true.low_inclusive: + y_train.fill(loss.interval_y_true.low) baseline_prediction = loss.fit_intercept_only(y_true=y_train) assert_all_finite(baseline_prediction) - if high is not None: - y_train.fill(high) + if loss.interval_y_true.high_inclusive: + y_train.fill(loss.interval_y_true.high) baseline_prediction = loss.fit_intercept_only(y_true=y_train) assert_all_finite(baseline_prediction) def test_categorical_crossentropy_fit_intercept_only(): + """Test that fit_intercept_only returns the mean functional for CCE.""" rng = np.random.RandomState(0) n_classes = 4 loss = CategoricalCrossEntropy(n_classes=n_classes) - # Same logic as test_single_fit_intercept_only. Here inverse link function - # = softmax and link function = log - symmetry term + # Same logic as test_specific_fit_intercept_only. Here inverse link + # function = softmax and link function = log - symmetry term. y_train = rng.randint(0, n_classes + 1, size=100).astype(np.float64) baseline_prediction = loss.fit_intercept_only(y_true=y_train) assert baseline_prediction.shape == (n_classes,) @@ -793,8 +819,7 @@ def test_categorical_crossentropy_fit_intercept_only(): def test_binary_and_categorical_crossentropy(): - # Test that CategoricalCrossEntropy with n_classes = 2 is the same as - # BinaryCrossEntropy + """Test that CCE with n_classes = 2 is the same as BinaryCrossEntropy.""" rng = np.random.RandomState(0) n_samples = 20 bce = BinaryCrossEntropy() From b2e0856bebf66098195bfe9f3b676d71c58a6882 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 10 May 2021 17:58:15 +0200 Subject: [PATCH 068/143] ENH define loss.is_multiclass --- sklearn/_loss/loss.py | 5 +++ sklearn/_loss/tests/test_loss.py | 53 ++++++++++++++++---------------- 2 files changed, 32 insertions(+), 26 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index a636d9fa29c6a..bb7aa2f29c69e 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -87,6 +87,8 @@ class BaseLoss(BaseLink, cLossFunction): approximated, it should be larger or equal to the exact one. constant_hessian : bool Indicates whether the hessian is one for this loss. + is_multiclass : bool + Indicates whether n_classes > 2 is allowed. """ # Inherited methods from BaseLink: @@ -107,6 +109,7 @@ class BaseLoss(BaseLink, cLossFunction): # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. 
need_update_leaves_values = False differentiable = True + is_multiclass = False def __init__(self, n_classes=1): self.approx_hessian = False @@ -809,6 +812,8 @@ class CategoricalCrossEntropy( https://arxiv.org/pdf/1311.6529.pdf """ + is_multiclass = True + def __init__(self, sample_weight=None, n_classes=3): super().__init__(n_classes=n_classes) self.interval_y_true = Interval(0, np.inf, True, False) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 26c936e428d97..2b2c9ea22aaca 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -53,7 +53,15 @@ def random_y_true_raw_prediction( ): """Random generate y_true and raw_prediction in valid range.""" rng = np.random.RandomState(seed) - if loss.n_classes <= 2: + if loss.is_multiclass: + raw_prediction = np.empty((n_samples, loss.n_classes)) + raw_prediction.flat[:] = rng.uniform( + low=raw_bound[0], + high=raw_bound[1], + size=n_samples * loss.n_classes, + ) + y_true = np.arange(n_samples).astype(float) % loss.n_classes + else: raw_prediction = rng.uniform( low=raw_bound[0], high=raw_bound[0], size=n_samples ) @@ -73,14 +81,6 @@ def random_y_true_raw_prediction( and loss.interval_y_true.high_inclusive ): y_true[1:: (n_samples // 3)] = 1 - else: - raw_prediction = np.empty((n_samples, loss.n_classes)) - raw_prediction.flat[:] = rng.uniform( - low=raw_bound[0], - high=raw_bound[1], - size=n_samples * loss.n_classes, - ) - y_true = np.arange(n_samples).astype(float) % loss.n_classes return y_true, raw_prediction @@ -105,11 +105,11 @@ def numerical_derivative(func, x, eps): def test_loss_boundary(loss): """Test interval ranges of y_true and y_pred in losses.""" # make sure low and high are always within the interval, used for linspace - if loss.n_classes is None or loss.n_classes <= 2: + if loss.is_multiclass: + y_true = np.linspace(0, 9, num=10) + else: low, high = _inclusive_low_high(loss.interval_y_true) y_true = np.linspace(low, high, num=10) - else: - y_true = np.linspace(0, 9, num=10) # add boundaries if they are included if loss.interval_y_true.low_inclusive: @@ -120,13 +120,13 @@ def test_loss_boundary(loss): assert loss.in_y_true_range(y_true) low, high = _inclusive_low_high(loss.interval_y_pred) - if loss.n_classes is None or loss.n_classes <= 2: - y_pred = np.linspace(low, high, num=10) - else: + if loss.is_multiclass: y_pred = np.empty((10, 3)) y_pred[:, 0] = np.linspace(low, high, num=10) y_pred[:, 1] = 0.5 * (1 - y_pred[:, 0]) y_pred[:, 2] = 0.5 * (1 - y_pred[:, 0]) + else: + y_pred = np.linspace(low, high, num=10) assert loss.in_y_pred_range(y_pred) @@ -153,7 +153,7 @@ def test_loss_boundary(loss): ] # y_pred and y_true do not always have the same domain (valid value range). # Hence, we define extra sets of parameters for each of them. -Y_TRUE_PARAMS = [ +Y_TRUE_PARAMS = [ # type: ignore # (loss, [y success], [y fail]) (HalfPoissonLoss(), [0], []), (HalfTweedieLoss(power=-3), [-100, -0.1, 0], []), @@ -185,7 +185,8 @@ def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): @pytest.mark.parametrize( - "loss, y_pred_success, y_pred_fail", Y_COMMON_PARAMS + Y_PRED_PARAMS + "loss, y_pred_success, y_pred_fail", + Y_COMMON_PARAMS + Y_PRED_PARAMS # type: ignore ) def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): """Test boundaries of y_pred for loss functions.""" @@ -211,16 +212,16 @@ def test_loss_dtype( float64, and all output arrays are either all float32 or all float64. 
""" loss = loss() - if loss.n_classes <= 2: - # generate a y_true in valid range - low, high = _inclusive_low_high(loss.interval_y_true, dtype=dtype_in) - y_true = np.array([0.5 * (high - low)], dtype=dtype_in) - raw_prediction = np.array([0.0], dtype=dtype_in) - else: + # generate a y_true and raw_prediction in valid range + if loss.is_multiclass: y_true = np.array([0], dtype=dtype_in) raw_prediction = np.full( shape=(1, loss.n_classes), fill_value=0.0, dtype=dtype_in ) + else: + low, high = _inclusive_low_high(loss.interval_y_true, dtype=dtype_in) + y_true = np.array([0.5 * (high - low)], dtype=dtype_in) + raw_prediction = np.array([0.0], dtype=dtype_in) if sample_weight is not None: sample_weight = np.array([2.0], dtype=dtype_in) @@ -251,7 +252,7 @@ def test_loss_dtype( gradient=out2, n_threads=n_threads, ) - if out1 is not None and loss.n_classes >= 3: + if out1 is not None and loss.is_multiclass: out1 = np.empty_like(raw_prediction, dtype=dtype_out) loss.gradient_hessian( y_true=y_true, @@ -350,7 +351,7 @@ def test_loss_same_as_C_functions(loss, sample_weight): def test_loss_gradients_are_the_same(loss, sample_weight): """Test that loss and gradient are the same across different functions. - Also test that output arguments contain correct result. + Also test that output arguments contain correct results. """ y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, @@ -410,7 +411,7 @@ def test_loss_gradients_are_the_same(loss, sample_weight): assert np.shares_memory(g3, out_g3) if hasattr(loss, "gradient_proba"): - assert loss.n_classes >= 3 # only for CategoricalCrossEntropy + assert loss.is_multiclass # only for CategoricalCrossEntropy out_g4 = np.empty_like(raw_prediction) out_proba = np.empty_like(raw_prediction) g4, proba = loss.gradient_proba( From 35ce8d2c6d77c757b1a88a07ad7076676d28207e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 10 May 2021 22:02:08 +0200 Subject: [PATCH 069/143] DOC fix typos --- sklearn/_loss/loss.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index bb7aa2f29c69e..14210b2a6202b 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -438,7 +438,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): class HalfSquaredError(IdentityLink, BaseLoss, cHalfSquaredError): - """Half Squared Error with identity link, for regression. + """Half squared error with identity link, for regression. Domain: y_true and y_pred all real numbers @@ -446,7 +446,7 @@ class HalfSquaredError(IdentityLink, BaseLoss, cHalfSquaredError): Link: y_pred = raw_prediction - For a given sample x_i, half squares error is defined as:: + For a given sample x_i, half squared error is defined as:: loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2 @@ -549,7 +549,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): class PinballLoss(IdentityLink, BaseLoss, cPinballLoss): - """Quantile Loss aka Pinball Loss, for regression. + """Quantile loss aka pinball loss, for regression. 
Domain: y_true and y_pred all real numbers @@ -558,7 +558,7 @@ class PinballLoss(IdentityLink, BaseLoss, cPinballLoss): Link: y_pred = raw_prediction - For a given sample x_i, the pinball loss loss is defined as:: + For a given sample x_i, the pinball loss is defined as:: loss(x_i) = rho_{quantile}(y_true_i - raw_prediction_i) @@ -620,7 +620,7 @@ class HalfPoissonLoss(LogLink, BaseLoss, cHalfPoissonLoss): loss(x_i) = y_true_i * log(y_true_i/exp(raw_prediction_i)) - y_true_i + exp(raw_prediction_i) - Half the Poisson deviance is actually the negative log likelihood up to + Half the Poisson deviance is actually the negative log-likelihood up to constant terms (not involving raw_prediction) and simplifies the computation of the gradients. We also skip the constant term `y_true_i * log(y_true_i) - y_true_i`. @@ -652,7 +652,7 @@ class HalfGammaLoss(LogLink, BaseLoss, cHalfGammaLoss): loss(x_i) = log(exp(raw_prediction_i)/y_true_i) + y_true/exp(raw_prediction_i) - 1 - Half the Gamma deviance is actually proportional to the negative log + Half the Gamma deviance is actually proportional to the negative log- likelihood up to constant terms (not involving raw_prediction) and simplifies the computation of the gradients. We also skip the constant term `-log(y_true_i) - 1`. @@ -744,7 +744,7 @@ class BinaryCrossEntropy(LogitLink, BaseLoss, cBinaryCrossEntropy): y_pred = expit(raw_prediction) For a given sample x_i, the binary cross-entropy, aka log loss, is defined - as the negative log-likelihood of the Bernoulli distributions and can be + as the negative log-likelihood of the Bernoulli distribution and can be expressed as:: loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i @@ -784,7 +784,7 @@ class CategoricalCrossEntropy( Domain: y_true in {0, 1, 2, 3, .., n_classes - 1} - y_pred a n_classes array, each element in (0, 1) + y_pred has n_classes elements, each element in (0, 1) Link: y_pred = softmax(raw_prediction) @@ -792,8 +792,8 @@ class CategoricalCrossEntropy( Note: We assume y_true to be already label encoded. For a given sample x_i, the categorical cross-entropy loss is defined as - the negative log-likelihood of the multinomial distribution, it generalizes - the binary cross-entropy to more than 2 classes:: + the negative log-likelihood of the multinomial distribution, it + generalizes the binary cross-entropy to more than 2 classes:: loss_i = log(sum(exp(raw_pred_{i, k}), k=0..n_classes-1)) - sum(y_true_{i, k} * raw_pred_{i, k}, k=0..n_classes-1) @@ -855,7 +855,7 @@ def gradient_proba( proba=None, n_threads=1, ): - """Compute gradient and probabilities of loss w.r.t raw_prediction. + """Compute gradient and probabilities fow raw_prediction. Parameters ---------- From 5904c8b6367c375b74ee4a92c08997ec8b76faff Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 18 May 2021 15:25:40 +0200 Subject: [PATCH 070/143] CLN address review comments --- doc/modules/classes.rst | 2 +- sklearn/_loss/_loss.pxd | 18 ++++---- sklearn/_loss/_loss.pyx | 94 +++++++++++++++++++++-------------------- 3 files changed, 58 insertions(+), 56 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 1560af6827553..c64f9a4ddd34a 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1649,7 +1649,7 @@ To be removed in 1.0 (renaming of 0.25) .. _loss_function_ref: -:mod:`sklearn._loss`: Non-public Loss Function Classes +:mod:`sklearn._loss`: Private Loss Function Classes =========================================================== .. 
automodule:: sklearn._loss diff --git a/sklearn/_loss/_loss.pxd b/sklearn/_loss/_loss.pxd index 1528ab28741fd..8ad45f3bed389 100644 --- a/sklearn/_loss/_loss.pxd +++ b/sklearn/_loss/_loss.pxd @@ -19,7 +19,7 @@ ctypedef fused G_DTYPE_C: # Struct to return 2 doubles -ctypedef struct double2: +ctypedef struct double_pair: double val1 double val2 @@ -28,48 +28,48 @@ ctypedef struct double2: cdef class cLossFunction: cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil cdef class cHalfSquaredError(cLossFunction): cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil cdef class cAbsoluteError(cLossFunction): cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil cdef class cPinballLoss(cLossFunction): cdef readonly double quantile # readonly makes it inherited by children cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil cdef class cHalfPoissonLoss(cLossFunction): cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil cdef class cHalfGammaLoss(cLossFunction): cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil cdef class cHalfTweedieLoss(cLossFunction): cdef readonly double power # readonly makes it inherited by children cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil cdef class cBinaryCrossEntropy(cLossFunction): cdef double closs(self, double y_true, double raw_prediction) nogil cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil diff --git a/sklearn/_loss/_loss.pyx b/sklearn/_loss/_loss.pyx index 59a46dcab522b..df1c7ec8e8e79 100644 --- a/sklearn/_loss/_loss.pyx +++ b/sklearn/_loss/_loss.pyx @@ -28,7 +28,7 @@ # # Note: We require 1-dim ndarrays to be contiguous. 
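The contiguity note above can be illustrated with plain NumPy (a sketch, unrelated to the Cython code itself): a strided view is not C-contiguous and would first need a contiguous copy::

    import numpy as np

    x = np.arange(10.0)[::2]          # a strided view, not C-contiguous
    assert not x.flags["C_CONTIGUOUS"]

    x = np.ascontiguousarray(x)       # copies into a contiguous 1-dim buffer
    assert x.flags["C_CONTIGUOUS"]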
# TODO: Use const memoryviews with fused types with Cython 3.0 where -# appropriate (arguments marked by "# IN") +# appropriate (arguments marked by "# IN"). cimport cython from cython.parallel import parallel, prange @@ -62,9 +62,11 @@ cdef inline void sum_exp_minus_max( Y_DTYPE_C[:, :] raw_prediction, # IN Y_DTYPE_C *p # OUT ) nogil: - # Store p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1 - # p[-2] = max(raw_prediction_i_k, k = 0 to n_classes-1) - # p[-1] = sum(p[k], k = 0 to n_classes-1) = sum of exponentials + # Thread local buffers are used to stores results of this function via p. + # The results are stored as follows: + # p[k] = exp(raw_prediction_i_k - max_value) for k = 0 to n_classes-1 + # p[-2] = max(raw_prediction_i_k, k = 0 to n_classes-1) + # p[-1] = sum(p[k], k = 0 to n_classes-1) = sum of exponentials # len(p) must be n_classes + 2 # Notes: # - Using "by reference" arguments doesn't work well, therefore we use a @@ -110,11 +112,11 @@ cdef inline double cgradient_half_squared_error( return raw_prediction - y_true -cdef inline double2 cgrad_hess_half_squared_error( +cdef inline double_pair cgrad_hess_half_squared_error( double y_true, double raw_prediction ) nogil: - cdef double2 gh + cdef double_pair gh gh.val1 = raw_prediction - y_true # gradient gh.val2 = 1. # hessian return gh @@ -135,11 +137,11 @@ cdef inline double cgradient_absolute_error( return 1. if raw_prediction > y_true else -1. -cdef inline double2 cgrad_hess_absolute_error( +cdef inline double_pair cgrad_hess_absolute_error( double y_true, double raw_prediction ) nogil: - cdef double2 gh + cdef double_pair gh # Note that exact hessian = 0 almost everywhere. Optimization routines like # in HGBT, however, need a hessian > 0. Therefore, we assign 1. gh.val1 = 1. if raw_prediction > y_true else -1. # gradient @@ -165,12 +167,12 @@ cdef inline double cgradient_pinball_loss( return -quantile if y_true >=raw_prediction else 1. - quantile -cdef inline double2 cgrad_hess_pinball_loss( +cdef inline double_pair cgrad_hess_pinball_loss( double y_true, double raw_prediction, double quantile ) nogil: - cdef double2 gh + cdef double_pair gh # Note that exact hessian = 0 almost everywhere. Optimization routines like # in HGBT, however, need a hessian > 0. Therefore, we assign 1. gh.val1 = -quantile if y_true >=raw_prediction else 1. - quantile # gradient @@ -194,22 +196,22 @@ cdef inline double cgradient_half_poisson( return exp(raw_prediction) - y_true -cdef inline double2 closs_grad_half_poisson( +cdef inline double_pair closs_grad_half_poisson( double y_true, double raw_prediction ) nogil: - cdef double2 lg + cdef double_pair lg lg.val2 = exp(raw_prediction) # used as temporary lg.val1 = lg.val2 - y_true * raw_prediction # loss lg.val2 -= y_true # gradient return lg -cdef inline double2 cgrad_hess_half_poisson( +cdef inline double_pair cgrad_hess_half_poisson( double y_true, double raw_prediction ) nogil: - cdef double2 gh + cdef double_pair gh gh.val2 = exp(raw_prediction) # hessian gh.val1 = gh.val2 - y_true # gradient return gh @@ -230,22 +232,22 @@ cdef inline double cgradient_half_gamma( return 1. - y_true * exp(-raw_prediction) -cdef inline double2 closs_grad_half_gamma( +cdef inline double_pair closs_grad_half_gamma( double y_true, double raw_prediction ) nogil: - cdef double2 lg + cdef double_pair lg lg.val2 = exp(-raw_prediction) # used as temporary lg.val1 = raw_prediction + y_true * lg.val2 # loss lg.val2 = 1. 
- y_true * lg.val2 # gradient return lg -cdef inline double2 cgrad_hess_half_gamma( +cdef inline double_pair cgrad_hess_half_gamma( double y_true, double raw_prediction ) nogil: - cdef double2 gh + cdef double_pair gh gh.val2 = exp(-raw_prediction) # used as temporary gh.val1 = 1. - y_true * gh.val2 # gradient gh.val2 *= y_true # hessian @@ -288,12 +290,12 @@ cdef inline double cgradient_half_tweedie( - y_true * exp((1. - power) * raw_prediction)) -cdef inline double2 closs_grad_half_tweedie( +cdef inline double_pair closs_grad_half_tweedie( double y_true, double raw_prediction, double power ) nogil: - cdef double2 lg + cdef double_pair lg cdef double exp1, exp2 if power == 0.: exp1 = exp(raw_prediction) @@ -311,12 +313,12 @@ cdef inline double2 closs_grad_half_tweedie( return lg -cdef inline double2 cgrad_hess_half_tweedie( +cdef inline double_pair cgrad_hess_half_tweedie( double y_true, double raw_prediction, double power ) nogil: - cdef double2 gh + cdef double_pair gh cdef double exp1, exp2 if power == 0.: exp1 = exp(raw_prediction) @@ -368,11 +370,11 @@ cdef inline double cgradient_binary_crossentropy( return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) -cdef inline double2 closs_grad_binary_crossentropy( +cdef inline double_pair closs_grad_binary_crossentropy( double y_true, double raw_prediction ) nogil: - cdef double2 lg + cdef double_pair lg if raw_prediction <= 0: lg.val2 = exp(raw_prediction) # used as temporary if raw_prediction <= -37: @@ -391,14 +393,14 @@ cdef inline double2 closs_grad_binary_crossentropy( return lg -cdef inline double2 cgrad_hess_binary_crossentropy( +cdef inline double_pair cgrad_hess_binary_crossentropy( double y_true, double raw_prediction ) nogil: # with y_pred = expit(raw) # hessian = y_pred * (1 - y_pred) = exp(raw) / (1 + exp(raw))**2 # = exp(-raw) / (1 + exp(-raw))**2 - cdef double2 gh + cdef double_pair gh gh.val2 = exp(-raw_prediction) # used as temporary gh.val1 = ((1 - y_true) - y_true * gh.val2) / (1 + gh.val2) # gradient gh.val2 = gh.val2 / (1 + gh.val2)**2 # hessian @@ -445,7 +447,7 @@ cdef class cLossFunction: """ pass - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: """Compute gradient and hessian. Gradient and hessian of loss w.r.t. raw_prediction for a single sample. @@ -465,7 +467,7 @@ cdef class cLossFunction: Returns ------- - double2 + double_pair Gradient and hessian of the loss function w.r.t. `raw_prediction`. 
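Relating to the binary cross-entropy gradient/hessian expressions above, a quick NumPy check of the hessian identity y_pred * (1 - y_pred) = exp(-raw) / (1 + exp(-raw))**2 with y_pred = expit(raw)::

    import numpy as np
    from scipy.special import expit

    raw = np.linspace(-5.0, 5.0, num=11)
    y_pred = expit(raw)

    hessian_a = y_pred * (1.0 - y_pred)
    hessian_b = np.exp(-raw) / (1.0 + np.exp(-raw)) ** 2

    assert np.allclose(hessian_a, hessian_b)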
""" pass @@ -627,7 +629,7 @@ cdef class cHalfSquaredError(cLossFunction): cdef double cgradient(self, double y_true, double raw_prediction) nogil: return cgradient_half_squared_error(y_true, raw_prediction) - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: return cgrad_hess_half_squared_error(y_true, raw_prediction) def _loss( @@ -699,7 +701,7 @@ cdef class cHalfSquaredError(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -735,7 +737,7 @@ cdef class cAbsoluteError(cLossFunction): cdef double cgradient(self, double y_true, double raw_prediction) nogil: return cgradient_absolute_error(y_true, raw_prediction) - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: return cgrad_hess_absolute_error(y_true, raw_prediction) def _loss( @@ -804,7 +806,7 @@ cdef class cAbsoluteError(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -846,7 +848,7 @@ cdef class cPinballLoss(cLossFunction): cdef double cgradient(self, double y_true, double raw_prediction) nogil: return cgradient_pinball_loss(y_true, raw_prediction, self.quantile) - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: return cgrad_hess_pinball_loss(y_true, raw_prediction, self.quantile) def _loss( @@ -919,7 +921,7 @@ cdef class cPinballLoss(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -968,7 +970,7 @@ cdef class cHalfPoissonLoss(cLossFunction): cdef double cgradient(self, double y_true, double raw_prediction) nogil: return cgradient_half_poisson(y_true, raw_prediction) - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: return cgrad_hess_half_poisson(y_true, raw_prediction) def _loss( @@ -1011,7 +1013,7 @@ cdef class cHalfPoissonLoss(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -1070,7 +1072,7 @@ cdef class cHalfPoissonLoss(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -1113,7 +1115,7 @@ cdef class cHalfGammaLoss(cLossFunction): cdef double cgradient(self, double y_true, double raw_prediction) nogil: return cgradient_half_gamma(y_true, raw_prediction) - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: return cgrad_hess_half_gamma(y_true, raw_prediction) def _loss( @@ -1156,7 +1158,7 @@ cdef class cHalfGammaLoss(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -1215,7 +1217,7 @@ cdef class cHalfGammaLoss(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -1276,7 +1278,7 @@ cdef class cHalfTweedieLoss(cLossFunction): cdef double cgradient(self, double y_true, double raw_prediction) nogil: 
return cgradient_half_tweedie(y_true, raw_prediction, self.power) - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: return cgrad_hess_half_tweedie(y_true, raw_prediction, self.power) def _loss( @@ -1319,7 +1321,7 @@ cdef class cHalfTweedieLoss(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -1380,7 +1382,7 @@ cdef class cHalfTweedieLoss(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -1417,7 +1419,7 @@ cdef class cBinaryCrossEntropy(cLossFunction): cdef double cgradient(self, double y_true, double raw_prediction) nogil: return cgradient_binary_crossentropy(y_true, raw_prediction) - cdef double2 cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: return cgrad_hess_binary_crossentropy(y_true, raw_prediction) def _loss( @@ -1460,7 +1462,7 @@ cdef class cBinaryCrossEntropy(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( @@ -1519,7 +1521,7 @@ cdef class cBinaryCrossEntropy(cLossFunction): cdef: int i int n_samples = y_true.shape[0] - double2 dbl2 + double_pair dbl2 if sample_weight is None: for i in prange( From 3a7122b80c43b0517ce15cc254ea66f04e1ae67c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 15 Jun 2021 17:34:20 +0200 Subject: [PATCH 071/143] DOC small docstring improvements --- sklearn/_loss/loss.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 14210b2a6202b..321d000636fe5 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -451,7 +451,8 @@ class HalfSquaredError(IdentityLink, BaseLoss, cHalfSquaredError): loss(x_i) = 0.5 * (y_true_i - raw_prediction_i)**2 The factor of 0.5 simplifies the computation of gradients and results in a - unit hessian (and be consistent with what is done in LightGBM). + unit hessian (and is consistent with what is done in LightGBM). It is also + half the Normal distribution deviance. """ def __init__(self, sample_weight=None): @@ -512,7 +513,7 @@ def gradient_hessian( class AbsoluteError(IdentityLink, BaseLoss, cAbsoluteError): - """Least absolute error, for regression. + """Absolute error with identity link, for regression. Domain: y_true and y_pred all real numbers @@ -734,7 +735,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): class BinaryCrossEntropy(LogitLink, BaseLoss, cBinaryCrossEntropy): - """Binary cross entropy loss for binary classification. + """Binary cross entropy loss with logit link, for binary classification. 
Domain: y_true in [0, 1] @@ -743,14 +744,20 @@ class BinaryCrossEntropy(LogitLink, BaseLoss, cBinaryCrossEntropy): Link: y_pred = expit(raw_prediction) - For a given sample x_i, the binary cross-entropy, aka log loss, is defined - as the negative log-likelihood of the Bernoulli distribution and can be - expressed as:: + For a given sample x_i, the binary cross-entropy, is defined as the + negative log-likelihood of the Bernoulli distribution and can be expressed + as:: loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman, section 4.4.1 (about logistic regression). + + This loss is also known as log loss or logistic loss. + Note that the formulation works for classification, y = {0, 1}, as well as + logistic regression, y = [0, 1]. + If you add `constant_to_optimal_zero` to the loss, you get half the + Bernoulli/binomial deviance. """ def __init__(self, sample_weight=None): @@ -780,7 +787,7 @@ def predict_proba(self, raw_prediction): class CategoricalCrossEntropy( MultinomialLogit, BaseLoss, cCategoricalCrossEntropy ): - """Categorical cross-entropy loss for multiclass classification. + """Categorical cross-entropy loss, for multiclass classification. Domain: y_true in {0, 1, 2, 3, .., n_classes - 1} @@ -789,7 +796,9 @@ class CategoricalCrossEntropy( Link: y_pred = softmax(raw_prediction) - Note: We assume y_true to be already label encoded. + Note: We assume y_true to be already label encoded. The inverse link is + softmax. But the full link function is the symmetric multinomial logit + function. For a given sample x_i, the categorical cross-entropy loss is defined as the negative log-likelihood of the multinomial distribution, it From c3b7658c791ca94da13dd8dd176f01b2dc143977 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 15 Jun 2021 20:15:51 +0200 Subject: [PATCH 072/143] TST test more losses in test_specific_fit_intercept_only --- sklearn/_loss/tests/test_loss.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 2b2c9ea22aaca..f7228b722dbb1 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -757,10 +757,13 @@ def fun(x): @pytest.mark.parametrize( "loss, func, random_dist", [ - (HalfSquaredError, np.mean, "normal"), - (AbsoluteError, np.median, "normal"), - (HalfPoissonLoss, np.mean, "poisson"), - (BinaryCrossEntropy, np.mean, "binomial"), + (HalfSquaredError(), np.mean, "normal"), + (AbsoluteError(), np.median, "normal"), + (PinballLoss(quantile=0.25), lambda x: np.quantile(x, q=0.25), "normal"), + (HalfPoissonLoss(), np.mean, "poisson"), + (HalfGammaLoss(), np.mean, "exponential"), + (HalfTweedieLoss(), np.mean, "exponential"), + (BinaryCrossEntropy(), np.mean, "binomial"), ], ) def test_specific_fit_intercept_only(loss, func, random_dist): @@ -770,7 +773,6 @@ def test_specific_fit_intercept_only(loss, func, random_dist): squared error estimates the expectation of a probability distribution. """ rng = np.random.RandomState(0) - loss = loss() if random_dist == "binomial": y_train = rng.binomial(1, 0.5, size=100) else: @@ -780,6 +782,7 @@ def test_specific_fit_intercept_only(loss, func, random_dist): # or median. 
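A standalone numerical check of the binary cross-entropy expression quoted in the BinaryCrossEntropy docstring above, i.e. that log(1 + exp(raw)) - y_true * raw equals the negative Bernoulli log-likelihood with y_pred = expit(raw)::

    import numpy as np
    from scipy.special import expit

    rng = np.random.RandomState(0)
    raw = rng.normal(size=10)
    y_true = rng.randint(0, 2, size=10).astype(float)

    y_pred = expit(raw)
    nll_bernoulli = -(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    loss_formula = np.log1p(np.exp(raw)) - y_true * raw

    assert np.allclose(nll_bernoulli, loss_formula)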
assert_all_finite(baseline_prediction) assert baseline_prediction == approx(loss.link(func(y_train))) + assert loss.inverse(baseline_prediction) == approx(func(y_train)) if isinstance(loss, IdentityLink): assert_allclose( loss.inverse(baseline_prediction), baseline_prediction From eae2defd3fcccb5f8ee5281d4820f95e4ff5f8ab Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Jul 2021 22:14:59 +0200 Subject: [PATCH 073/143] FIX test_loss_boundary --- sklearn/_loss/tests/test_loss.py | 66 ++++++++++++-------------------- 1 file changed, 24 insertions(+), 42 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index f7228b722dbb1..5e674ccc00942 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -71,16 +71,10 @@ def random_y_true_raw_prediction( high = min(high, y_bound[1]) y_true = rng.uniform(low, high, size=n_samples) # set some values at special boundaries - if ( - loss.interval_y_true.low == 0 - and loss.interval_y_true.low_inclusive - ): + if loss.interval_y_true.low == 0 and loss.interval_y_true.low_inclusive: y_true[:: (n_samples // 3)] = 0 - if ( - loss.interval_y_true.high == 1 - and loss.interval_y_true.high_inclusive - ): - y_true[1:: (n_samples // 3)] = 1 + if loss.interval_y_true.high == 1 and loss.interval_y_true.high_inclusive: + y_true[1 :: (n_samples // 3)] = 1 return y_true, raw_prediction @@ -96,9 +90,7 @@ def numerical_derivative(func, x, eps): f_minus_1h = func(x - h) f_plus_1h = func(x + h) f_plus_2h = func(x + 2 * h) - return (-f_plus_2h + 8 * f_plus_1h - 8 * f_minus_1h + f_minus_2h) / ( - 12.0 * eps - ) + return (-f_plus_2h + 8 * f_plus_1h - 8 * f_minus_1h + f_minus_2h) / (12.0 * eps) @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @@ -119,14 +111,15 @@ def test_loss_boundary(loss): assert loss.in_y_true_range(y_true) + n = y_true.shape[0] low, high = _inclusive_low_high(loss.interval_y_pred) if loss.is_multiclass: - y_pred = np.empty((10, 3)) - y_pred[:, 0] = np.linspace(low, high, num=10) + y_pred = np.empty((n, 3)) + y_pred[:, 0] = np.linspace(low, high, num=n) y_pred[:, 1] = 0.5 * (1 - y_pred[:, 0]) y_pred[:, 2] = 0.5 * (1 - y_pred[:, 0]) else: - y_pred = np.linspace(low, high, num=10) + y_pred = np.linspace(low, high, num=n) assert loss.in_y_pred_range(y_pred) @@ -185,8 +178,7 @@ def test_loss_boundary_y_true(loss, y_true_success, y_true_fail): @pytest.mark.parametrize( - "loss, y_pred_success, y_pred_fail", - Y_COMMON_PARAMS + Y_PRED_PARAMS # type: ignore + "loss, y_pred_success, y_pred_fail", Y_COMMON_PARAMS + Y_PRED_PARAMS # type: ignore ) def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): """Test boundaries of y_pred for loss functions.""" @@ -203,9 +195,7 @@ def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): @pytest.mark.parametrize("out1", [None, 1]) @pytest.mark.parametrize("out2", [None, 1]) @pytest.mark.parametrize("n_threads", [1, 2]) -def test_loss_dtype( - loss, dtype_in, dtype_out, sample_weight, out1, out2, n_threads -): +def test_loss_dtype(loss, dtype_in, dtype_out, sample_weight, out1, out2, n_threads): """Test acceptance of dtypes in loss functions. 
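The numerical_derivative helper touched above implements a fourth-order central difference; a standalone check of its accuracy on a known derivative (the body below restates the helper so the sketch is self-contained)::

    import numpy as np

    def numerical_derivative(func, x, eps):
        # Central finite difference of accuracy order 4.
        h = np.full_like(x, fill_value=eps)
        f_minus_2h = func(x - 2 * h)
        f_minus_1h = func(x - h)
        f_plus_1h = func(x + h)
        f_plus_2h = func(x + 2 * h)
        return (-f_plus_2h + 8 * f_plus_1h - 8 * f_minus_1h + f_minus_2h) / (12.0 * eps)

    x = np.array([0.0, 0.5, 1.0])
    assert np.allclose(numerical_derivative(np.sin, x, eps=1e-4), np.cos(x))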
Check that loss accepts if all input arrays are either all float32 or all @@ -450,14 +440,10 @@ def test_sample_weight_multiplies_gradients(loss, sample_weight): rng = np.random.RandomState(42) sample_weight = rng.normal(size=n_samples).astype(np.float64) - baseline_prediction = loss.fit_intercept_only( - y_true=y_true, sample_weight=None - ) + baseline_prediction = loss.fit_intercept_only(y_true=y_true, sample_weight=None) if loss.n_classes <= 2: - raw_prediction = np.zeros( - shape=(n_samples,), dtype=baseline_prediction.dtype - ) + raw_prediction = np.zeros(shape=(n_samples,), dtype=baseline_prediction.dtype) else: raw_prediction = np.zeros( shape=(n_samples, loss.n_classes), dtype=baseline_prediction.dtype @@ -555,7 +541,9 @@ def test_gradients_hessians_numerically(loss, sample_weight): def loss_func(x): return loss.loss( - y_true=y_true, raw_prediction=x, sample_weight=sample_weight, + y_true=y_true, + raw_prediction=x, + sample_weight=sample_weight, ) g_numeric = numerical_derivative(loss_func, raw_prediction, eps=1e-6) @@ -563,7 +551,9 @@ def loss_func(x): def grad_func(x): return loss.gradient( - y_true=y_true, raw_prediction=x, sample_weight=sample_weight, + y_true=y_true, + raw_prediction=x, + sample_weight=sample_weight, ) h_numeric = numerical_derivative(grad_func, raw_prediction, eps=1e-6) @@ -588,9 +578,7 @@ def loss_func(x): sample_weight=sample_weight, ) - g_numeric = numerical_derivative( - loss_func, raw_prediction[:, k], eps=1e-5 - ) + g_numeric = numerical_derivative(loss_func, raw_prediction[:, k], eps=1e-5) assert_allclose(g[:, k], g_numeric, rtol=5e-6, atol=1e-10) def grad_func(x): @@ -602,9 +590,7 @@ def grad_func(x): sample_weight=sample_weight, )[:, k] - h_numeric = numerical_derivative( - grad_func, raw_prediction[:, k], eps=1e-6 - ) + h_numeric = numerical_derivative(grad_func, raw_prediction[:, k], eps=1e-6) if loss.approx_hessian: assert np.all(h >= h_numeric) else: @@ -676,9 +662,7 @@ def fprime2(x: np.ndarray) -> np.ndarray: optimum = optimum.ravel() assert_allclose(loss.inverse(optimum), y_true) assert_allclose(func(optimum), 0, atol=1e-14) - assert_allclose( - loss.gradient(y_true=y_true, raw_prediction=optimum), 0, atol=5e-7 - ) + assert_allclose(loss.gradient(y_true=y_true, raw_prediction=optimum), 0, atol=5e-7) @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @@ -740,7 +724,7 @@ def fun(x): method="SLSQP", constraints={ "type": "eq", - "fun": lambda x: np.ones((1, loss.n_classes)) @ x + "fun": lambda x: np.ones((1, loss.n_classes)) @ x, }, ) grad = loss.gradient( @@ -784,9 +768,7 @@ def test_specific_fit_intercept_only(loss, func, random_dist): assert baseline_prediction == approx(loss.link(func(y_train))) assert loss.inverse(baseline_prediction) == approx(func(y_train)) if isinstance(loss, IdentityLink): - assert_allclose( - loss.inverse(baseline_prediction), baseline_prediction - ) + assert_allclose(loss.inverse(baseline_prediction), baseline_prediction) # Test baseline at boundary if loss.interval_y_true.low_inclusive: @@ -835,5 +817,5 @@ def test_binary_and_categorical_crossentropy(): raw_cce[:, 1] = 0.5 * raw_prediction assert_allclose( bce.loss(y_true=y_train, raw_prediction=raw_prediction), - cce.loss(y_true=y_train, raw_prediction=raw_cce) + cce.loss(y_true=y_train, raw_prediction=raw_cce), ) From ec5fd024402161010921beba791f6a9fb6dda81c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Jul 2021 22:17:04 +0200 Subject: [PATCH 074/143] ENH Tempita for losses --- .gitignore | 1 + setup.cfg | 1 + 
sklearn/_loss/{_loss.pyx => _loss.pyx.tp} | 1021 ++++----------------- sklearn/_loss/setup.py | 6 + 4 files changed, 206 insertions(+), 823 deletions(-) rename sklearn/_loss/{_loss.pyx => _loss.pyx.tp} (60%) diff --git a/.gitignore b/.gitignore index 2c3dd0c4794c1..0bb730f493a56 100644 --- a/.gitignore +++ b/.gitignore @@ -76,6 +76,7 @@ _configtest.o.d .mypy_cache/ # files generated from a template +sklearn/_loss/_loss.pyx sklearn/utils/_seq_dataset.pyx sklearn/utils/_seq_dataset.pxd sklearn/utils/_weight_vector.pyx diff --git a/setup.cfg b/setup.cfg index 050045072f428..080f0607789ee 100644 --- a/setup.cfg +++ b/setup.cfg @@ -66,6 +66,7 @@ allow_redefinition = True [check-manifest] # ignore files missing in VCS ignore = + sklearn/_loss/_loss.pyx sklearn/linear_model/_sag_fast.pyx sklearn/utils/_seq_dataset.pyx sklearn/utils/_seq_dataset.pxd diff --git a/sklearn/_loss/_loss.pyx b/sklearn/_loss/_loss.pyx.tp similarity index 60% rename from sklearn/_loss/_loss.pyx rename to sklearn/_loss/_loss.pyx.tp index df1c7ec8e8e79..92dcf57e9f1fb 100644 --- a/sklearn/_loss/_loss.pyx +++ b/sklearn/_loss/_loss.pyx.tp @@ -1,3 +1,164 @@ +{{py: + +""" +Template file for easily generate fused types consistent code using Tempita +(https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). + +Generated file: _loss.pyx + +Each loss class is generated by a cdef functions on single samples. +The keywords between double braces are substituted in setup.py. +""" + +doc_SquaredError = ( + """Half Squared Error with identity link. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + """ +) + +doc_AbsoluteError = ( + """Absolute Error with identity link. + + Domain: + y_true and y_pred all real numbers + + Link: + y_pred = raw_prediction + """ +) + +doc_PinballLoss = ( + """Quantile Loss aka Pinball Loss with identity link. + + Domain: + y_true and y_pred all real numbers + quantile in (0, 1) + + Link: + y_pred = raw_prediction + + Note: 2 * cPinballLoss(quantile=0.5) equals cAbsoluteError() + """ +) + +doc_PoissonLoss = ( + """Half Poisson deviance loss with log-link. + + Domain: + y_true in non-negative real numbers + y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Poisson deviance with log-link is + y_true * log(y_true/y_pred) + y_pred - y_true + = y_true * log(y_true) - y_true * raw_prediction + + exp(raw_prediction) - y_true + + Dropping constant terms, this gives: + exp(raw_prediction) - y_true * raw_prediction + """ +) + +doc_GammaLoss = ( + """Half Gamma deviance loss with log-link. + + Domain: + y_true and y_pred in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Gamma deviance with log-link is + log(y_pred/y_true) + y_true/y_pred - 1 + = raw_prediction - log(y_true) + y_true * exp(-raw_prediction) - 1 + + Dropping constant terms, this gives: + raw_prediction + y_true * exp(-raw_prediction) + """ +) + +doc_TweedieLoss = ( + """Half Tweedie deviance loss with log-link. 
+ + Domain: + y_true in real numbers if p <= 0 + y_true in non-negative real numbers if 0 < p < 2 + y_true in positive real numbers if p >= 2 + y_pred and power in positive real numbers + + Link: + y_pred = exp(raw_prediction) + + Half Tweedie deviance with log-link and p=power is + max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * y_pred**(1-p) / (1-p) + + y_pred**(2-p) / (2-p) + = max(y_true, 0)**(2-p) / (1-p) / (2-p) + - y_true * exp((1-p) * raw_prediction) / (1-p) + + exp((2-p) * raw_prediction) / (2-p) + + Dropping constant terms, this gives: + exp((2-p) * raw_prediction) / (2-p) + - y_true * exp((1-p) * raw_prediction) / (1-p) + + Notes: + - Poisson with p=1 and and Gamma with p=2 have different terms dropped such + that cHalfTweedieLoss is not continuous in p=power at p=1 and p=2. + - While the Tweedie distribution only exists for p<=0 or p>=1, the range + 0= 2 - y_pred and power in positive real numbers - - Link: - y_pred = exp(raw_prediction) - - Half Tweedie deviance with log-link and p=power is - max(y_true, 0)**(2-p) / (1-p) / (2-p) - - y_true * y_pred**(1-p) / (1-p) - + y_pred**(2-p) / (2-p) - = max(y_true, 0)**(2-p) / (1-p) / (2-p) - - y_true * exp((1-p) * raw_prediction) / (1-p) - + exp((2-p) * raw_prediction) / (2-p) - - Dropping constant terms, this gives: - exp((2-p) * raw_prediction) / (2-p) - - y_true * exp((1-p) * raw_prediction) / (1-p) - - Notes: - - Poisson with p=1 and and Gamma with p=2 have different terms dropped such - that cHalfTweedieLoss is not continuous in p=power at p=1 and p=2. - - While the Tweedie distribution only exists for p<=0 or p>=1, the range - 0 Date: Mon, 19 Jul 2021 22:59:15 +0200 Subject: [PATCH 075/143] MNT apply black --- sklearn/_loss/link.py | 4 +--- sklearn/_loss/loss.py | 24 ++++++------------------ sklearn/_loss/tests/test_link.py | 17 ++++++----------- 3 files changed, 13 insertions(+), 32 deletions(-) diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index f5567e6dd7b49..ed9f12b577c62 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -12,9 +12,7 @@ from ..utils.extmath import softmax -Interval = namedtuple( - "Interval", ("low", "high", "low_inclusive", "high_inclusive") -) +Interval = namedtuple("Interval", ("low", "high", "low_inclusive", "high_inclusive")) def is_in_interval_range(x, interval): diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 321d000636fe5..37818c33b3978 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -354,9 +354,7 @@ def gradient_hessian( n_threads=n_threads, ) - def __call__( - self, y_true, raw_prediction, sample_weight=None, n_threads=1 - ): + def __call__(self, y_true, raw_prediction, sample_weight=None, n_threads=1): """Compute the weighted average loss. Parameters @@ -473,11 +471,7 @@ def gradient( # Be graceful to shape (n_samples, 1) -> (n_samples,) if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) - if ( - gradient is not None - and gradient.ndim == 2 - and gradient.shape[1] == 1 - ): + if gradient is not None and gradient.ndim == 2 and gradient.shape[1] == 1: gradient = gradient.squeeze(1) # gradient = raw_prediction - y_true is easier in numpy @@ -588,7 +582,7 @@ def __init__(self, sample_weight=None, quantile=0.5): self.constant_hessian = False if quantile <= 0 or quantile >= 1: raise ValueError( - f"PinballLoss aka quantile loss only accepts " + "PinballLoss aka quantile loss only accepts " f"0 < quantile < 1; {quantile} was given." 
) @@ -601,9 +595,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): if sample_weight is None: return np.percentile(y_true, 100 * self.quantile, axis=0) else: - return _weighted_percentile( - y_true, sample_weight, 100 * self.quantile - ) + return _weighted_percentile(y_true, sample_weight, 100 * self.quantile) class HalfPoissonLoss(LogLink, BaseLoss, cHalfPoissonLoss): @@ -776,17 +768,13 @@ def predict_proba(self, raw_prediction): # Be graceful to shape (n_samples, 1) -> (n_samples,) if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) - proba = np.empty( - (raw_prediction.shape[0], 2), dtype=raw_prediction.dtype - ) + proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype) proba[:, 1] = self.inverse(raw_prediction) proba[:, 0] = 1 - proba[:, 1] return proba -class CategoricalCrossEntropy( - MultinomialLogit, BaseLoss, cCategoricalCrossEntropy -): +class CategoricalCrossEntropy(MultinomialLogit, BaseLoss, cCategoricalCrossEntropy): """Categorical cross-entropy loss, for multiclass classification. Domain: diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index b049f5ac637d6..f2846c17b3f1d 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -36,8 +36,7 @@ def test_is_in_range(interval): # x contains lower bound assert ( - is_in_interval_range(np.r_[x, interval.low], interval) - == interval.low_inclusive + is_in_interval_range(np.r_[x, interval.low], interval) == interval.low_inclusive ) # x contains upper bound @@ -47,9 +46,9 @@ def test_is_in_range(interval): ) # x contains upper and lower bound - assert is_in_interval_range( - np.r_[x, interval.low, interval.high], interval - ) == (interval.low_inclusive and interval.high_inclusive) + assert is_in_interval_range(np.r_[x, interval.low, interval.high], interval) == ( + interval.low_inclusive and interval.high_inclusive + ) @pytest.mark.parametrize("link", LINK_FUNCTIONS) @@ -60,9 +59,7 @@ def test_link_inverse_identity(link): n_samples, n_classes = 100, None if link.multiclass: n_classes = 10 - raw_prediction = rng.normal( - loc=0, scale=10, size=(n_samples, n_classes) - ) + raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples, n_classes)) if isinstance(link, MultinomialLogit): raw_prediction = link.symmetrize_raw_prediction(raw_prediction) else: @@ -83,9 +80,7 @@ def test_link_out_argument(link): n_samples, n_classes = 100, None if link.multiclass: n_classes = 10 - raw_prediction = rng.normal( - loc=0, scale=10, size=(n_samples, n_classes) - ) + raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples, n_classes)) if isinstance(link, MultinomialLogit): raw_prediction = link.symmetrize_raw_prediction(raw_prediction) else: From f8a024ad4b0fd93903837acfff9a7d2047134466 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 20 Jul 2021 20:22:44 +0200 Subject: [PATCH 076/143] TST replace np.quantile by np.percentile --- sklearn/_loss/tests/test_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 5e674ccc00942..1673382114378 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -743,7 +743,7 @@ def fun(x): [ (HalfSquaredError(), np.mean, "normal"), (AbsoluteError(), np.median, "normal"), - (PinballLoss(quantile=0.25), lambda x: np.quantile(x, q=0.25), "normal"), + (PinballLoss(quantile=0.25), lambda x: np.percentile(x, q=25), "normal"), 
(HalfPoissonLoss(), np.mean, "poisson"), (HalfGammaLoss(), np.mean, "exponential"), (HalfTweedieLoss(), np.mean, "exponential"), From dfcd078e1862e99c8a4e4bd78a3964584be9f02e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 13:51:30 +0200 Subject: [PATCH 077/143] ENH make Interval a dataclass - function is_in_interval_range -> method Interval.includes --- sklearn/_loss/link.py | 65 +++++++++++++++++--------------- sklearn/_loss/loss.py | 9 ++--- sklearn/_loss/tests/test_link.py | 14 ++----- 3 files changed, 42 insertions(+), 46 deletions(-) diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index ed9f12b577c62..a172ac4d9e49c 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -4,7 +4,7 @@ # Author: Christian Lorentzen from abc import ABC, abstractmethod -from collections import namedtuple +from dataclasses import dataclass import numpy as np from scipy.special import expit, logit @@ -12,41 +12,46 @@ from ..utils.extmath import softmax -Interval = namedtuple("Interval", ("low", "high", "low_inclusive", "high_inclusive")) +@dataclass +class Interval: + low: float + high: float + low_inclusive: bool + high_inclusive: bool + def includes(self, x): + """Test whether values of x are in interval range. -def is_in_interval_range(x, interval): - """Test whether values of x are in interval range from Interval. - - Parameters - ---------- - x : ndarray - Array whose elements are tested to be in interval range. - interval: Interval - An Interval range. - """ - if interval.low_inclusive: - low = np.greater_equal(x, interval.low) - else: - low = np.greater(x, interval.low) + Parameters + ---------- + x : ndarray + Array whose elements are tested to be in interval range. + """ + if self.low_inclusive: + low = np.greater_equal(x, self.low) + else: + low = np.greater(x, self.low) - if not np.all(low): - return False + if not np.all(low): + return False - if interval.high_inclusive: - high = np.less_equal(x, interval.high) - else: - high = np.less(x, interval.high) + if self.high_inclusive: + high = np.less_equal(x, self.high) + else: + high = np.less(x, self.high) - # Note: np.all returns numpy.bool_ - if np.all(high): - return True - else: - return False + # Note: np.all returns numpy.bool_ + if np.all(high): + return True + else: + return False def _inclusive_low_high(interval, dtype=float): - """Generate values low and high to be within the interval range.""" + """Generate values low and high to be within the interval range. + + This is used in tests only. + """ eps = 10 * np.finfo(dtype).eps if interval.low == -np.inf: low = -1e10 @@ -76,8 +81,8 @@ class BaseLink(ABC): called linear predictor, and `y_pred = h(raw_prediction)` is the predicted conditional (on X) expected value of the target `y_true`. - In case a link function needs parameters, the methods are not implemented - as staticmethods. + The methods are not implemented as staticmethods in case a link function needs + parameters. """ multiclass = False diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 37818c33b3978..1608bd27902a8 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -30,7 +30,6 @@ ) from .link import ( Interval, - is_in_interval_range, BaseLink, IdentityLink, LogLink, @@ -125,7 +124,7 @@ def in_y_true_range(self, y): ---------- y : ndarray """ - return is_in_interval_range(y, self.interval_y_true) + return self.interval_y_true.includes(y) def in_y_pred_range(self, y): """Return True if y is in the valid range of y_pred. 
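Assuming the dataclass introduced in this patch, usage of the new method reads as follows; the class body here is a condensed, self-contained restatement of Interval.includes from the diff above::

    import numpy as np
    from dataclasses import dataclass

    @dataclass
    class Interval:
        low: float
        high: float
        low_inclusive: bool
        high_inclusive: bool

        def includes(self, x):
            # True if all elements of x lie within the interval bounds.
            if self.low_inclusive:
                low = np.greater_equal(x, self.low)
            else:
                low = np.greater(x, self.low)
            if self.high_inclusive:
                high = np.less_equal(x, self.high)
            else:
                high = np.less(x, self.high)
            return bool(np.all(low) and np.all(high))

    interval = Interval(0, 1, low_inclusive=True, high_inclusive=False)
    assert interval.includes(np.array([0.0, 0.5, 0.999]))
    assert not interval.includes(np.array([0.5, 1.0]))  # 1 is excluded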
@@ -134,7 +133,7 @@ def in_y_pred_range(self, y): ---------- y : ndarray """ - return is_in_interval_range(y, self.interval_y_pred) + return self.interval_y_pred.includes(y) def loss( self, @@ -823,9 +822,7 @@ def in_y_true_range(self, y): ---------- y : ndarray """ - return is_in_interval_range(y, self.interval_y_true) and np.all( - y.astype(int) == y - ) + return self.interval_y_true.includes(y) and np.all(y.astype(int) == y) def fit_intercept_only(self, y_true, sample_weight=None): """Compute raw_prediction of an intercept-only model. diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index f2846c17b3f1d..d9b1e36e68a19 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -7,7 +7,6 @@ _inclusive_low_high, MultinomialLogit, Interval, - is_in_interval_range, ) @@ -32,21 +31,16 @@ def test_is_in_range(interval): low, high = _inclusive_low_high(interval) x = np.linspace(low, high, num=10) - assert is_in_interval_range(x, interval) + assert interval.includes(x) # x contains lower bound - assert ( - is_in_interval_range(np.r_[x, interval.low], interval) == interval.low_inclusive - ) + assert interval.includes(np.r_[x, interval.low]) == interval.low_inclusive # x contains upper bound - assert ( - is_in_interval_range(np.r_[x, interval.high], interval) - == interval.high_inclusive - ) + assert interval.includes(np.r_[x, interval.high]) == interval.high_inclusive # x contains upper and lower bound - assert is_in_interval_range(np.r_[x, interval.low, interval.high], interval) == ( + assert interval.includes(np.r_[x, interval.low, interval.high]) == ( interval.low_inclusive and interval.high_inclusive ) From b5c5bf5b656d1b4891c9944aba29a33faf12eda3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 13:58:55 +0200 Subject: [PATCH 078/143] DOC improve docstrings in link.py --- sklearn/_loss/link.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index a172ac4d9e49c..b3ba52d7c3bce 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -20,12 +20,16 @@ class Interval: high_inclusive: bool def includes(self, x): - """Test whether values of x are in interval range. + """Test whether all values of x are in interval range. Parameters ---------- x : ndarray Array whose elements are tested to be in interval range. + + Returns + ------- + result : bool """ if self.low_inclusive: low = np.greater_equal(x, self.low) @@ -51,6 +55,11 @@ def _inclusive_low_high(interval, dtype=float): """Generate values low and high to be within the interval range. This is used in tests only. + + Returns + ------- + low, high : tuple + The returned values low and high lie within the interval. """ eps = 10 * np.finfo(dtype).eps if interval.low == -np.inf: From 3d6a47703f42203db4635012b51f43a5d7e848a4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 14:00:58 +0200 Subject: [PATCH 079/143] MNT use numpy dtype instead of Python type --- sklearn/_loss/link.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index b3ba52d7c3bce..9bb223eb0dca6 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -51,7 +51,7 @@ def includes(self, x): return False -def _inclusive_low_high(interval, dtype=float): +def _inclusive_low_high(interval, dtype=np.float64): """Generate values low and high to be within the interval range. This is used in tests only. 
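For reference, a minimal self-contained sketch of the Interval API introduced above (plain Python mirroring the dataclass added to sklearn/_loss/link.py; the example interval and values are illustrative only):

    from dataclasses import dataclass

    import numpy as np


    @dataclass
    class Interval:
        low: float
        high: float
        low_inclusive: bool
        high_inclusive: bool

        def includes(self, x):
            """Return True if all values of x lie within the interval range."""
            low_ok = (
                np.greater_equal(x, self.low)
                if self.low_inclusive
                else np.greater(x, self.low)
            )
            if not np.all(low_ok):
                return False
            high_ok = (
                np.less_equal(x, self.high)
                if self.high_inclusive
                else np.less(x, self.high)
            )
            return bool(np.all(high_ok))


    unit_open = Interval(0, 1, False, False)         # open unit interval, e.g. probabilities
    print(unit_open.includes(np.array([0.2, 0.8])))  # True
    print(unit_open.includes(np.array([0.0, 0.5])))  # False: the lower bound is excluded
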
From c2f0f8ecca66f340ec61f60c8be3438ef843ceb6 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 14:08:28 +0200 Subject: [PATCH 080/143] TST add negative intervals --- sklearn/_loss/tests/test_link.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index d9b1e36e68a19..6a7f1b7598b73 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -24,6 +24,10 @@ Interval(-np.inf, np.inf, False, True), Interval(-np.inf, np.inf, True, False), Interval(-np.inf, np.inf, True, True), + Interval(-10, -1, False, False), + Interval(-10, -1, False, True), + Interval(-10, -1, True, False), + Interval(-10, -1, True, True), ], ) def test_is_in_range(interval): From 33cabc4ca56ffa8d024d7b8c6d80b693f55c483f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 14:14:02 +0200 Subject: [PATCH 081/143] ENH add __post_init__ to class Interval --- sklearn/_loss/link.py | 5 +++++ sklearn/_loss/tests/test_link.py | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index 9bb223eb0dca6..b756e275c6d0e 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -19,6 +19,11 @@ class Interval: low_inclusive: bool high_inclusive: bool + def __post_init__(self): + """Check that low <= high""" + if self.low > self.high: + raise ValueError("On must have low <= high; got low={low}, high={high}.") + def includes(self, x): """Test whether all values of x are in interval range. diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index 6a7f1b7598b73..3239ade25f3c7 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -13,6 +13,12 @@ LINK_FUNCTIONS = list(_LINKS.values()) +def test_interval_raises(): + """Test that interval with low > high raises ValueError.""" + with pytest.raises(ValueError, match="On must have low <= high"): + Interval(1, 0, False, False) + + @pytest.mark.parametrize( "interval", [ From 4cd28265b67bc1ae233be1bb4d4108a3de6571bb Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 14:54:25 +0200 Subject: [PATCH 082/143] MNT rename cython losses - class name from cLoss to CyLoss - single sample methods from closs to cy_loss, cgradient to cy_gradient, .. 
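The renamed per-sample methods (cy_loss, cy_gradient, cy_grad_hess) remain cdef functions, see the _loss.pxd diff below, so they are not directly callable from Python. As a rough plain-NumPy illustration of what one such loss/gradient pair computes, here is the half Poisson deviance with log link; the helper names are illustrative only, and the numbers match the explicit value test added later in this series:

    import numpy as np

    def half_poisson_loss(y_true, raw_prediction):
        # Half Poisson deviance with log link, dropping the term that depends
        # only on y_true: exp(raw_prediction) - y_true * raw_prediction.
        return np.exp(raw_prediction) - y_true * raw_prediction

    def half_poisson_gradient(y_true, raw_prediction):
        # Derivative of the loss above with respect to raw_prediction.
        return np.exp(raw_prediction) - y_true

    print(half_poisson_loss(2.0, np.log(4)))      # 4 - 2*log(4), approximately 1.2274
    print(half_poisson_gradient(2.0, np.log(4)))  # 2.0
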
--- sklearn/_loss/_loss.pxd | 64 +++++++++++++++++++------------------- sklearn/_loss/_loss.pyx.tp | 57 +++++++++++++++++---------------- sklearn/_loss/loss.py | 42 ++++++++++++------------- 3 files changed, 83 insertions(+), 80 deletions(-) diff --git a/sklearn/_loss/_loss.pxd b/sklearn/_loss/_loss.pxd index 8ad45f3bed389..b00379a1e793d 100644 --- a/sklearn/_loss/_loss.pxd +++ b/sklearn/_loss/_loss.pxd @@ -25,51 +25,51 @@ ctypedef struct double_pair: # C base class for loss functions -cdef class cLossFunction: - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil +cdef class CyLossFunction: + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class cHalfSquaredError(cLossFunction): - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil +cdef class CyHalfSquaredError(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class cAbsoluteError(cLossFunction): - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil +cdef class CyAbsoluteError(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class cPinballLoss(cLossFunction): +cdef class CyPinballLoss(CyLossFunction): cdef readonly double quantile # readonly makes it inherited by children - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class cHalfPoissonLoss(cLossFunction): - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil +cdef class CyHalfPoissonLoss(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class cHalfGammaLoss(cLossFunction): - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil +cdef class 
CyHalfGammaLoss(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class cHalfTweedieLoss(cLossFunction): +cdef class CyHalfTweedieLoss(CyLossFunction): cdef readonly double power # readonly makes it inherited by children - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class cBinaryCrossEntropy(cLossFunction): - cdef double closs(self, double y_true, double raw_prediction) nogil - cdef double cgradient(self, double y_true, double raw_prediction) nogil - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil +cdef class CyBinaryCrossEntropy(CyLossFunction): + cdef double cy_loss(self, double y_true, double raw_prediction) nogil + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 92dcf57e9f1fb..63ada42133dcf 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -1,12 +1,12 @@ {{py: """ -Template file for easily generate fused types consistent code using Tempita +Template file for easily generate loops over samples using Tempita (https://github.com/cython/cython/blob/master/Cython/Tempita/_tempita.py). Generated file: _loss.pyx -Each loss class is generated by a cdef functions on single samples. +Each loss class is generated by a cdef functions on single samples. The keywords between double braces are substituted in setup.py. 
""" @@ -129,33 +129,36 @@ doc_BinaryCrossEntropy = ( """ ) -# loss class name, docstring, param, closs, closs_grad, cgrad, cgrad_hess, +# loss class name, docstring, param, +# cy_loss, cy_loss_grad, +# cy_grad, cy_grad_hess, class_list = [ - ("cHalfSquaredError", doc_SquaredError, None, + ("CyHalfSquaredError", doc_SquaredError, None, "closs_half_squared_error", None, "cgradient_half_squared_error", "cgrad_hess_half_squared_error"), - ("cAbsoluteError", doc_AbsoluteError, None, + ("CyAbsoluteError", doc_AbsoluteError, None, "closs_absolute_error", None, "cgradient_absolute_error", "cgrad_hess_absolute_error"), - ("cPinballLoss", doc_PinballLoss, "quantile", + ("CyPinballLoss", doc_PinballLoss, "quantile", "closs_pinball_loss", None, "cgradient_pinball_loss", "cgrad_hess_pinball_loss"), - ("cHalfPoissonLoss", doc_PoissonLoss, None, + ("CyHalfPoissonLoss", doc_PoissonLoss, None, "closs_half_poisson", "closs_grad_half_poisson", "cgradient_half_poisson", "cgrad_hess_half_poisson"), - ("cHalfGammaLoss", doc_GammaLoss, None, + ("CyHalfGammaLoss", doc_GammaLoss, None, "closs_half_gamma", "closs_grad_half_gamma", "cgradient_half_gamma", "cgrad_hess_half_gamma"), - ("cHalfTweedieLoss", doc_TweedieLoss, "power", + ("CyHalfTweedieLoss", doc_TweedieLoss, "power", "closs_half_tweedie", "closs_grad_half_tweedie", "cgradient_half_tweedie", "cgrad_hess_half_tweedie"), - ("cBinaryCrossEntropy", doc_BinaryCrossEntropy, None, + ("CyBinaryCrossEntropy", doc_BinaryCrossEntropy, None, "closs_binary_crossentropy", "closs_grad_binary_crossentropy", "cgradient_binary_crossentropy", "cgrad_hess_binary_crossentropy"), ] }} """ -WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp +WARNING: Do not edit `sklearn/_loss/_loss.pyx` file directly, as it is generated from +`sklearn/_loss/_loss.pyx.tp`. Changes must be made there. """ #------------------------------------------------------------------------------ @@ -170,22 +173,22 @@ WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp # stability, i.e. use raw_prediction instead of y_pred in signature. # b) Pure C functions (nogil) calculate single points (single sample) # c) Wrap C functions in a loop to get Python functions operating on ndarrays. -# - Write loops manually. +# - Write loops manually---use Tempita for this. # Reason: There is still some performance overhead when using a wrapper # function "wrap" that carries out the loop and gets as argument a function # pointer to one of the C functions from b), e.g. # wrap(closs_half_poisson, y_true, ...) # - Pass n_threads as argument to prange and propagate option to all callers. -# d) Provide classes (Cython extension types) per loss in order to have -# semantical structured objects. -# - Member function for single points just call the C function from b). +# d) Provide classes (Cython extension types) per loss (names start with Cy) in +# order to have semantical structured objects. +# - Member functions for single points just call the C function from b). # These are used e.g. in SGD `_plain_sgd`. -# - Member functions operating on ndarrays looping, see c), over calls to C +# - Member functions operating on ndarrays, see c), looping over calls to C # functions from b). # e) Provide convenience Python classes that inherit from these extension types # elsewhere (see loss.py) -# - Example: loss.gradient calls extension_type._gradient but does some -# input checking like None -> np.empty(). 
+# - Example: loss.gradient calls CyLoss._gradient but does some input +# checking like None -> np.empty(). # # Note: We require 1-dim ndarrays to be contiguous. # TODO: Use const memoryviews with fused types with Cython 3.0 where @@ -571,10 +574,10 @@ cdef inline double_pair cgrad_hess_binary_crossentropy( # --------------------------------------------------- # Extension Types for Loss Functions of 1-dim targets # --------------------------------------------------- -cdef class cLossFunction: +cdef class CyLossFunction: """Base class for convex loss functions.""" - cdef double closs(self, double y_true, double raw_prediction) nogil: + cdef double cy_loss(self, double y_true, double raw_prediction) nogil: """Compute the loss for a single sample. Parameters @@ -591,7 +594,7 @@ cdef class cLossFunction: """ pass - cdef double cgradient(self, double y_true, double raw_prediction) nogil: + cdef double cy_gradient(self, double y_true, double raw_prediction) nogil: """Compute gradient of loss w.r.t. raw_prediction for a single sample. Parameters @@ -608,7 +611,7 @@ cdef class cLossFunction: """ pass - cdef double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil: """Compute gradient and hessian. Gradient and hessian of loss w.r.t. raw_prediction for a single sample. @@ -782,7 +785,7 @@ else: with_param = ", self." + param }} -cdef class {{name}}(cLossFunction): +cdef class {{name}}(CyLossFunction): """{{docstring}}""" {{if param is not None}} @@ -790,13 +793,13 @@ cdef class {{name}}(cLossFunction): self.{{param}} = {{param}} {{endif}} - cdef inline double closs(self, double y_true, double raw_prediction) nogil: + cdef inline double cy_loss(self, double y_true, double raw_prediction) nogil: return {{closs}}(y_true, raw_prediction{{with_param}}) - cdef inline double cgradient(self, double y_true, double raw_prediction) nogil: + cdef inline double cy_gradient(self, double y_true, double raw_prediction) nogil: return {{cgrad}}(y_true, raw_prediction{{with_param}}) - cdef inline double_pair cgrad_hess(self, double y_true, double raw_prediction) nogil: + cdef inline double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil: return {{cgrad_hess}}(y_true, raw_prediction{{with_param}}) def _loss( @@ -916,7 +919,7 @@ cdef class {{name}}(cLossFunction): {{endfor}} -cdef class cCategoricalCrossEntropy(cLossFunction): +cdef class CyCategoricalCrossEntropy(CyLossFunction): """CategoricalCrossEntropy with multinomial logit link. 
Domain: diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 1608bd27902a8..53add11d9c89e 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -18,15 +18,15 @@ import numpy as np from scipy.special import xlogy from ._loss import ( - cLossFunction, - cHalfSquaredError, - cAbsoluteError, - cPinballLoss, - cHalfPoissonLoss, - cHalfGammaLoss, - cHalfTweedieLoss, - cBinaryCrossEntropy, - cCategoricalCrossEntropy, + CyLossFunction, + CyHalfSquaredError, + CyAbsoluteError, + CyPinballLoss, + CyHalfPoissonLoss, + CyHalfGammaLoss, + CyHalfTweedieLoss, + CyBinaryCrossEntropy, + CyCategoricalCrossEntropy, ) from .link import ( Interval, @@ -42,7 +42,7 @@ # Note: The shape of raw_prediction for multiclass classifications are # - GradientBoostingClassifier: (n_samples, n_classes) # - HistGradientBoostingClassifier: (n_classes, n_samples) -class BaseLoss(BaseLink, cLossFunction): +class BaseLoss(BaseLink, CyLossFunction): """Base class for a loss function of 1-dimensional targets. Conventions: @@ -94,7 +94,7 @@ class BaseLoss(BaseLink, cLossFunction): # - link # - inverse # - # Inherited methods from cLossFunction: + # Inherited methods from CyLossFunction: # - _loss, _loss_gradient, _gradient, _gradient_hessian # For decision trees: @@ -434,7 +434,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return np.zeros_like(y_true) -class HalfSquaredError(IdentityLink, BaseLoss, cHalfSquaredError): +class HalfSquaredError(IdentityLink, BaseLoss, CyHalfSquaredError): """Half squared error with identity link, for regression. Domain: @@ -505,7 +505,7 @@ def gradient_hessian( return gradient, hessian -class AbsoluteError(IdentityLink, BaseLoss, cAbsoluteError): +class AbsoluteError(IdentityLink, BaseLoss, CyAbsoluteError): """Absolute error with identity link, for regression. Domain: @@ -542,7 +542,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): return _weighted_percentile(y_true, sample_weight, 50) -class PinballLoss(IdentityLink, BaseLoss, cPinballLoss): +class PinballLoss(IdentityLink, BaseLoss, CyPinballLoss): """Quantile loss aka pinball loss, for regression. Domain: @@ -573,7 +573,7 @@ class PinballLoss(IdentityLink, BaseLoss, cPinballLoss): def __init__(self, sample_weight=None, quantile=0.5): BaseLoss.__init__(self) - cPinballLoss.__init__(self, quantile=float(quantile)) + CyPinballLoss.__init__(self, quantile=float(quantile)) self.approx_hessian = True if sample_weight is None: self.constant_hessian = True @@ -597,7 +597,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): return _weighted_percentile(y_true, sample_weight, 100 * self.quantile) -class HalfPoissonLoss(LogLink, BaseLoss, cHalfPoissonLoss): +class HalfPoissonLoss(LogLink, BaseLoss, CyHalfPoissonLoss): """Poisson deviance loss with log-link, for regression. Domain: @@ -630,7 +630,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class HalfGammaLoss(LogLink, BaseLoss, cHalfGammaLoss): +class HalfGammaLoss(LogLink, BaseLoss, CyHalfGammaLoss): """Gamma deviance loss with log-link, for regression. Domain: @@ -662,7 +662,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class HalfTweedieLoss(LogLink, BaseLoss, cHalfTweedieLoss): +class HalfTweedieLoss(LogLink, BaseLoss, CyHalfTweedieLoss): """Tweedie deviance loss with log-link, for regression. 
Domain: @@ -695,7 +695,7 @@ class HalfTweedieLoss(LogLink, BaseLoss, cHalfTweedieLoss): def __init__(self, sample_weight=None, power=1.5): BaseLoss.__init__(self) - cHalfTweedieLoss.__init__(self, power=power) + CyHalfTweedieLoss.__init__(self, power=power) self.interval_y_pred = Interval(0, np.inf, False, False) if self.power <= 0: self.interval_y_true = Interval(-np.inf, np.inf, False, False) @@ -725,7 +725,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class BinaryCrossEntropy(LogitLink, BaseLoss, cBinaryCrossEntropy): +class BinaryCrossEntropy(LogitLink, BaseLoss, CyBinaryCrossEntropy): """Binary cross entropy loss with logit link, for binary classification. Domain: @@ -773,7 +773,7 @@ def predict_proba(self, raw_prediction): return proba -class CategoricalCrossEntropy(MultinomialLogit, BaseLoss, cCategoricalCrossEntropy): +class CategoricalCrossEntropy(MultinomialLogit, BaseLoss, CyCategoricalCrossEntropy): """Categorical cross-entropy loss, for multiclass classification. Domain: From 5f61a902743ebcfb006972d4d550b166c3ead187 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 15:32:55 +0200 Subject: [PATCH 083/143] TST loss.predict_proba --- sklearn/_loss/tests/test_loss.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 1673382114378..4d977d4078071 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -819,3 +819,24 @@ def test_binary_and_categorical_crossentropy(): bce.loss(y_true=y_train, raw_prediction=raw_prediction), cce.loss(y_true=y_train, raw_prediction=raw_cce), ) + + +@pytest.mark.parametrize( + "loss", + [loss for loss in LOSS_INSTANCES if hasattr(loss, "predict_proba")], + ids=loss_instance_name, +) +def test_predict_proba(loss): + """Test that predict_proba works as expected.""" + n_samples = 20 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=42, + ) + proba = loss.predict_proba(raw_prediction) + + assert proba.shape == (n_samples, loss.n_classes) + assert np.sum(proba, axis=1) == approx(1) From 7e1af6c3325c83396b38b12ed77b7cbbba5dd129 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 15:44:05 +0200 Subject: [PATCH 084/143] TST predict_proba and gradient_proba --- sklearn/_loss/tests/test_loss.py | 41 +++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 4d977d4078071..47a212b77a34d 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -821,13 +821,9 @@ def test_binary_and_categorical_crossentropy(): ) -@pytest.mark.parametrize( - "loss", - [loss for loss in LOSS_INSTANCES if hasattr(loss, "predict_proba")], - ids=loss_instance_name, -) +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) def test_predict_proba(loss): - """Test that predict_proba works as expected.""" + """Test that predict_proba and gradient_proba work as expected.""" n_samples = 20 y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, @@ -836,7 +832,34 @@ def test_predict_proba(loss): raw_bound=(-5, 5), seed=42, ) - proba = loss.predict_proba(raw_prediction) - assert proba.shape == (n_samples, loss.n_classes) - assert np.sum(proba, axis=1) == approx(1) + if hasattr(loss, "predict_proba"): + proba = 
loss.predict_proba(raw_prediction) + assert proba.shape == (n_samples, loss.n_classes) + assert np.sum(proba, axis=1) == approx(1) + + if hasattr(loss, "gradient_proba"): + for grad, proba in ( + (None, None), + (None, np.empty_like(raw_prediction)), + (np.empty_like(raw_prediction), None), + (np.empty_like(raw_prediction), np.empty_like(raw_prediction)), + ): + grad, proba = loss.gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + gradient=grad, + proba=proba, + ) + assert proba.shape == (n_samples, loss.n_classes) + assert np.sum(proba, axis=1) == approx(1) + assert_allclose( + grad, + loss.gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + gradient=None, + ), + ) From fb1ab5a4d63e4e72da5839110339edb987147fe5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 15:56:05 +0200 Subject: [PATCH 085/143] MNT use is_multiclass in tests instead of n_classes <= 2 --- sklearn/_loss/tests/test_loss.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 47a212b77a34d..4527b58a09ea7 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -442,7 +442,7 @@ def test_sample_weight_multiplies_gradients(loss, sample_weight): baseline_prediction = loss.fit_intercept_only(y_true=y_true, sample_weight=None) - if loss.n_classes <= 2: + if not loss.is_multiclass: raw_prediction = np.zeros(shape=(n_samples,), dtype=baseline_prediction.dtype) else: raw_prediction = np.zeros( @@ -460,7 +460,7 @@ def test_sample_weight_multiplies_gradients(loss, sample_weight): sample_weight=sample_weight, ) - if loss.n_classes <= 2: + if not loss.is_multiclass: assert_allclose(gradient * sample_weight, gradient_sw) assert_allclose(hessian * sample_weight, hessian_sw) else: @@ -476,7 +476,7 @@ def test_loss_of_perfect_prediction(loss, sample_weight): Loss of y_pred = y_true plus constant_to_optimal_zero should sums up to zero. """ - if loss.n_classes <= 2: + if not loss.is_multiclass: # Use small values such that exp(value) is not nan. raw_prediction = np.array([-10, -0.1, 0, 0.1, 3, 10]) y_true = loss.inverse(raw_prediction) @@ -537,7 +537,7 @@ def test_gradients_hessians_numerically(loss, sample_weight): assert g.shape == raw_prediction.shape assert h.shape == raw_prediction.shape - if loss.n_classes <= 2: + if not loss.is_multiclass: def loss_func(x): return loss.loss( @@ -673,7 +673,7 @@ def test_loss_intercept_only(loss, sample_weight): Also test that the gradient is zero at the minimum. 
""" n_samples = 50 - if loss.n_classes <= 2: + if not loss.is_multiclass: y_true = loss.inverse(np.linspace(-4, 4, num=n_samples)) else: y_true = np.arange(n_samples).astype(float) % loss.n_classes @@ -686,7 +686,7 @@ def test_loss_intercept_only(loss, sample_weight): # find minimum by optimization def fun(x): - if loss.n_classes <= 2: + if not loss.is_multiclass: raw_prediction = np.full(shape=(n_samples), fill_value=x) else: raw_prediction = np.ascontiguousarray( @@ -698,7 +698,7 @@ def fun(x): sample_weight=sample_weight, ) - if loss.n_classes <= 2: + if not loss.is_multiclass: opt = minimize_scalar(fun, tol=1e-7, options={"maxiter": 100}) grad = loss.gradient( y_true=y_true, From bdb6d184a274ba5f422994f9841e0b1271b650e5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 15:58:00 +0200 Subject: [PATCH 086/143] DOC docstring predict_proba and more --- sklearn/_loss/_loss.pyx.tp | 82 +++++++++++++++++++------------------- sklearn/_loss/loss.py | 28 ++++++++++++- 2 files changed, 67 insertions(+), 43 deletions(-) diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 63ada42133dcf..573b1bba8d47b 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -804,10 +804,10 @@ cdef class {{name}}(CyLossFunction): def _loss( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[::1] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[::1] loss, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] loss, # OUT int n_threads=1 ): cdef: @@ -830,11 +830,11 @@ cdef class {{name}}(CyLossFunction): {{if closs_grad is not None}} def _loss_gradient( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[::1] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[::1] loss, - G_DTYPE_C[::1] gradient, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] loss, # OUT + G_DTYPE_C[::1] gradient, # OUT int n_threads=1 ): cdef: @@ -862,10 +862,10 @@ cdef class {{name}}(CyLossFunction): def _gradient( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[::1] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[::1] gradient, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] gradient, # OUT int n_threads=1 ): cdef: @@ -887,11 +887,11 @@ cdef class {{name}}(CyLossFunction): def _gradient_hessian( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[::1] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[::1] gradient, - G_DTYPE_C[::1] hessian, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[::1] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] gradient, # OUT + G_DTYPE_C[::1] hessian, # OUT int n_threads=1 ): cdef: @@ -938,10 +938,10 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): # opposite are welcome. 
def _loss( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[:, :] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[::1] loss, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[:, :] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] loss, # OUT int n_threads=1 ): cdef: @@ -998,11 +998,11 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): def _loss_gradient( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[:, :] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[::1] loss, - G_DTYPE_C[:, :] gradient, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[:, :] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[::1] loss, # OUT + G_DTYPE_C[:, :] gradient, # OUT int n_threads=1 ): cdef: @@ -1060,10 +1060,10 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): def _gradient( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[:, :] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[:, :] gradient, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[:, :] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[:, :] gradient, # OUT int n_threads=1 ): cdef: @@ -1109,11 +1109,11 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): def _gradient_hessian( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[:, :] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[:, :] gradient, - G_DTYPE_C[:, :] hessian, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[:, :] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[:, :] gradient, # OUT + G_DTYPE_C[:, :] hessian, # OUT int n_threads=1 ): cdef: @@ -1167,11 +1167,11 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): # diagonal (in the classes) approximation as implemented above. def _gradient_proba( self, - Y_DTYPE_C[::1] y_true, - Y_DTYPE_C[:, :] raw_prediction, - Y_DTYPE_C[::1] sample_weight, - G_DTYPE_C[:, :] gradient, - G_DTYPE_C[:, :] proba, + Y_DTYPE_C[::1] y_true, # IN + Y_DTYPE_C[:, :] raw_prediction, # IN + Y_DTYPE_C[::1] sample_weight, # IN + G_DTYPE_C[:, :] gradient, # OUT + G_DTYPE_C[:, :] proba, # OUT int n_threads=1 ): cdef: diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 53add11d9c89e..4530fd90a5212 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -764,6 +764,18 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term def predict_proba(self, raw_prediction): + """Predict probabilities. + + Parameters + ---------- + raw_prediction : array of shape (n_samples,) or (n_samples, 1) + Raw prediction values (in link space). + + Returns + ------- + proba : array of shape (n_samples, 2) + Element-wise class probabilites. + """ # Be graceful to shape (n_samples, 1) -> (n_samples,) if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) @@ -838,6 +850,18 @@ def fit_intercept_only(self, y_true, sample_weight=None): return self.link(out[None, :]).reshape(-1) def predict_proba(self, raw_prediction): + """Predict probabilities. + + Parameters + ---------- + raw_prediction : array of shape (n_samples, n_classes) + Raw prediction values (in link space). + + Returns + ------- + proba : array of shape (n_samples, n_classes) + Element-wise class probabilites. + """ return self.inverse(raw_prediction) def gradient_proba( @@ -849,7 +873,7 @@ def gradient_proba( proba=None, n_threads=1, ): - """Compute gradient and probabilities fow raw_prediction. + """Compute gradient and class probabilities fow raw_prediction. 
Parameters ---------- @@ -870,7 +894,7 @@ def gradient_proba( Returns ------- - gradient, proba : array of shape (n_samples, n_classes) + gradient : array of shape (n_samples, n_classes) Element-wise gradients. proba : array of shape (n_samples, n_classes) From 07503002eb91e39a185eda2d5644f44bcc2cc11b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 15 Aug 2021 17:01:44 +0200 Subject: [PATCH 087/143] MNT remove top_path from gen_from_templates --- sklearn/_loss/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/setup.py b/sklearn/_loss/setup.py index b80584d8707c8..2a2d2b5f13b8a 100644 --- a/sklearn/_loss/setup.py +++ b/sklearn/_loss/setup.py @@ -8,7 +8,7 @@ def configuration(parent_package="", top_path=None): # generate _loss.pyx from template templates = ["sklearn/_loss/_loss.pyx.tp"] - gen_from_templates(templates, top_path) + gen_from_templates(templates) config.add_extension( "_loss", From 04df8a9e4ba6fda9c72448c3b7206daec0ad09b5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 17 Aug 2021 10:27:40 +0200 Subject: [PATCH 088/143] TST test graceful squeezing --- sklearn/_loss/tests/test_loss.py | 75 +++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 12 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 4527b58a09ea7..e179c310b2d2c 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -419,10 +419,10 @@ def test_loss_gradients_are_the_same(loss, sample_weight): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", ["ones", "random"]) -def test_sample_weight_multiplies_gradients(loss, sample_weight): - """Test sample weights in gradients and hessians. +def test_sample_weight_multiplies(loss, sample_weight): + """Test sample weights in loss, gradients and hessians. - Make sure that passing sample weights to the gradient and hessians + Make sure that passing sample weights to loss, gradient and hessian computation methods is equivalent to multiplying by the weights. 
""" n_samples = 100 @@ -440,26 +440,46 @@ def test_sample_weight_multiplies_gradients(loss, sample_weight): rng = np.random.RandomState(42) sample_weight = rng.normal(size=n_samples).astype(np.float64) - baseline_prediction = loss.fit_intercept_only(y_true=y_true, sample_weight=None) + assert_allclose( + loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ), + sample_weight + * loss.loss( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + ), + ) + losses, gradient = loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, + ) + losses_sw, gradient_sw = loss.loss_gradient( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + ) + assert_allclose(losses * sample_weight, losses_sw) if not loss.is_multiclass: - raw_prediction = np.zeros(shape=(n_samples,), dtype=baseline_prediction.dtype) + assert_allclose(gradient * sample_weight, gradient_sw) else: - raw_prediction = np.zeros( - shape=(n_samples, loss.n_classes), dtype=baseline_prediction.dtype - ) - raw_prediction += baseline_prediction + assert_allclose(gradient * sample_weight[:, None], gradient_sw) gradient, hessian = loss.gradient_hessian( - y_true=y_true, raw_prediction=raw_prediction, sample_weight=None + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=None, ) - gradient_sw, hessian_sw = loss.gradient_hessian( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, ) - if not loss.is_multiclass: assert_allclose(gradient * sample_weight, gradient_sw) assert_allclose(hessian * sample_weight, hessian_sw) @@ -468,6 +488,37 @@ def test_sample_weight_multiplies_gradients(loss, sample_weight): assert_allclose(hessian * sample_weight[:, None], hessian_sw) +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +def test_graceful_squeezing(loss): + """Test that Python and Cython functions return same results.""" + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=20, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=42, + ) + + if raw_prediction.ndim == 1: + raw_prediction_2d = raw_prediction[:, None] + assert_allclose( + loss.loss(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.loss(y_true=y_true, raw_prediction=raw_prediction), + ) + assert_allclose( + loss.loss_gradient(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.loss_gradient(y_true=y_true, raw_prediction=raw_prediction), + ) + assert_allclose( + loss.gradient(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.gradient(y_true=y_true, raw_prediction=raw_prediction), + ) + assert_allclose( + loss.gradient_hessian(y_true=y_true, raw_prediction=raw_prediction_2d), + loss.gradient_hessian(y_true=y_true, raw_prediction=raw_prediction), + ) + + @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @pytest.mark.parametrize("sample_weight", [None, "range"]) def test_loss_of_perfect_prediction(loss, sample_weight): From 86b92930cab2fa6c4884b24c9e89a47dcae28ee3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 17 Aug 2021 13:23:21 +0200 Subject: [PATCH 089/143] CLN no extra methods for HalfSquaredError --- sklearn/_loss/loss.py | 45 ------------------------------------------- 1 file changed, 45 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 4530fd90a5212..35b6e0903eb73 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -459,51 +459,6 @@ def __init__(self, sample_weight=None): else: 
self.constant_hessian = False - def gradient( - self, - y_true, - raw_prediction, - sample_weight=None, - gradient=None, - n_threads=1, - ): - # Be graceful to shape (n_samples, 1) -> (n_samples,) - if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: - raw_prediction = raw_prediction.squeeze(1) - if gradient is not None and gradient.ndim == 2 and gradient.shape[1] == 1: - gradient = gradient.squeeze(1) - - # gradient = raw_prediction - y_true is easier in numpy - gradient = np.subtract(raw_prediction, y_true, out=gradient) - if sample_weight is None: - return gradient - else: - return np.multiply(sample_weight, gradient, out=gradient) - - def gradient_hessian( - self, - y_true, - raw_prediction, - sample_weight=None, - gradient=None, - hessian=None, - n_threads=1, - ): - # easier in numpy - gradient = self.gradient( - y_true, raw_prediction, sample_weight, gradient, hessian - ) - if hessian is None: - hessian = np.empty_like(gradient) - elif hessian.ndim == 2 and hessian.shape[1] == 1: - # Be graceful to shape (n_samples, 1) -> (n_samples,) - hessian = hessian.squeeze(1) - if sample_weight is None: - hessian.fill(1) - else: - np.copyto(hessian, sample_weight) - return gradient, hessian - class AbsoluteError(IdentityLink, BaseLoss, CyAbsoluteError): """Absolute error with identity link, for regression. From 330b98e7f1c79fb5f05d2a2b3d6afc5297dc4283 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 19 Aug 2021 21:30:43 +0200 Subject: [PATCH 090/143] TST remove testing if approx_hessian=True --- sklearn/_loss/tests/test_loss.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index e179c310b2d2c..2bf41bca8adb6 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -609,7 +609,8 @@ def grad_func(x): h_numeric = numerical_derivative(grad_func, raw_prediction, eps=1e-6) if loss.approx_hessian: - assert np.all(h >= h_numeric) + # TODO: What could we test if loss.approx_hessian? + pass else: assert_allclose(h, h_numeric, rtol=5e-6, atol=1e-10) else: @@ -643,7 +644,8 @@ def grad_func(x): h_numeric = numerical_derivative(grad_func, raw_prediction[:, k], eps=1e-6) if loss.approx_hessian: - assert np.all(h >= h_numeric) + # TODO: What could we test if loss.approx_hessian? + pass else: assert_allclose(h[:, k], h_numeric, rtol=5e-6, atol=1e-10) From d9b6bc80b8584d032cc77431c5598203031fd246 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 23 Aug 2021 19:36:46 +0200 Subject: [PATCH 091/143] DOC remove loss module for classes.rst --- doc/modules/classes.rst | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index c64f9a4ddd34a..3848a189c35d4 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1646,27 +1646,3 @@ Recently deprecated To be removed in 1.0 (renaming of 0.25) --------------------------------------- - -.. _loss_function_ref: - -:mod:`sklearn._loss`: Private Loss Function Classes -=========================================================== - -.. automodule:: sklearn._loss - :no-members: - :no-inherited-members: - -.. currentmodule:: sklearn - -.. 
autosummary:: - :toctree: generated/ - :template: class.rst - - _loss.HalfSquaredError - _loss.AbsoluteError - _loss.PinballLoss - _loss.HalfPoissonLoss - _loss.HalfGammaLoss - _loss.HalfTweedieLoss - _loss.BinaryCrossEntropy - _loss.CategoricalCrossEntropy From 96ab3ba2788327c0a33908bff8aa65621568b4ed Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 27 Aug 2021 17:29:04 +0200 Subject: [PATCH 092/143] TST that losses can be pickled --- sklearn/_loss/tests/test_loss.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 2bf41bca8adb6..b867437ecb0f4 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np from numpy.testing import assert_allclose, assert_array_equal import pytest @@ -916,3 +918,21 @@ def test_predict_proba(loss): gradient=None, ), ) + + +@pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) +def test_loss_pickle(loss): + """Test that losses can be pickled.""" + n_samples = 20 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-5, 5), + seed=42, + ) + pickled_loss = pickle.dumps(loss) + unpickled_loss = pickle.loads(pickled_loss) + assert loss(y_true=y_true, raw_prediction=raw_prediction) == approx( + unpickled_loss(y_true=y_true, raw_prediction=raw_prediction) + ) From 6c8136e2e41b31542656affe14a5823882f9fed6 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 28 Aug 2021 13:37:49 +0200 Subject: [PATCH 093/143] TST add test_loss_on_specific_values --- sklearn/_loss/loss.py | 2 +- sklearn/_loss/tests/test_loss.py | 60 ++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 35b6e0903eb73..209f1352a2662 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -49,7 +49,7 @@ class BaseLoss(BaseLink, CyLossFunction): - y_true.shape = sample_weight.shape = (n_samples,) - y_pred.shape = raw_prediction.shape = (n_samples,) - - If n_classes >= 3 (multiclass classification), then + - If is_multiclass is true (multiclass classification), then y_pred.shape = raw_prediction.shape = (n_samples, n_classes) Note that this corresponds to the return value of decision_function. 
diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index b867437ecb0f4..19d7dbf484455 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -9,10 +9,12 @@ minimize_scalar, newton, ) +from scipy.special import logsumexp from sklearn._loss.link import _inclusive_low_high, IdentityLink from sklearn._loss.loss import ( _LOSSES, + BaseLoss, AbsoluteError, BinaryCrossEntropy, CategoricalCrossEntropy, @@ -41,13 +43,17 @@ ] -def loss_instance_name(loss): - name = loss.__class__.__name__ - if hasattr(loss, "quantile"): - name += f"(quantile={loss.quantile})" - elif hasattr(loss, "power"): - name += f"(power={loss.power})" - return name +def loss_instance_name(param): + if isinstance(param, BaseLoss): + loss = param + name = loss.__class__.__name__ + if hasattr(loss, "quantile"): + name += f"(quantile={loss.quantile})" + elif hasattr(loss, "power"): + name += f"(power={loss.power})" + return name + else: + return str(param) def random_y_true_raw_prediction( @@ -190,6 +196,46 @@ def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): assert not loss.in_y_pred_range(np.array([y])) +@pytest.mark.parametrize( + "loss, y_true, raw_prediction, loss_true", + [ + (HalfSquaredError(), 1.0, 5.0, 8), + (AbsoluteError(), 1.0, 5.0, 4), + (PinballLoss(quantile=0.5), 1.0, 5.0, 2), + (PinballLoss(quantile=0.25), 1.0, 5.0, 4 * (1 - 0.25)), + (PinballLoss(quantile=0.25), 5.0, 1.0, 4 * 0.25), + (HalfPoissonLoss(), 2.0, np.log(4), 4 - 2 * np.log(4)), + (HalfGammaLoss(), 2.0, np.log(4), np.log(4) + 2 / 4), + (HalfTweedieLoss(power=3), 2.0, np.log(4), -1 / 4 + 1 / 4 ** 2), + (BinaryCrossEntropy(), 0.25, np.log(4), np.log(5) - 0.25 * np.log(4)), + ( + CategoricalCrossEntropy(n_classes=3), + 0.0, + [0.2, 0.5, 0.3], + logsumexp([0.2, 0.5, 0.3]) - 0.2, + ), + ( + CategoricalCrossEntropy(n_classes=3), + 1.0, + [0.2, 0.5, 0.3], + logsumexp([0.2, 0.5, 0.3]) - 0.5, + ), + ( + CategoricalCrossEntropy(n_classes=3), + 2.0, + [0.2, 0.5, 0.3], + logsumexp([0.2, 0.5, 0.3]) - 0.3, + ), + ], + ids=loss_instance_name, +) +def test_loss_on_specific_values(loss, y_true, raw_prediction, loss_true): + """Test losses at specific values.""" + assert loss( + y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction]) + ) == approx(loss_true) + + @pytest.mark.parametrize("loss", ALL_LOSSES) @pytest.mark.parametrize("dtype_in", [np.float32, np.float64]) @pytest.mark.parametrize("dtype_out", [np.float32, np.float64]) From fa1569131cea7b0b64d8441ca6c081143846f20f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 28 Aug 2021 13:45:01 +0200 Subject: [PATCH 094/143] FIX make cython inheritance happy and losses pickable --- sklearn/_loss/loss.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 209f1352a2662..e6bc17609c9f7 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -434,7 +434,11 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return np.zeros_like(y_true) -class HalfSquaredError(IdentityLink, BaseLoss, CyHalfSquaredError): +# Note: Naturally, we would inherit in the following order +# class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss) +# But because of https://github.com/cython/cython/issues/4350 we +# set BaseLoss as the last one. This, of course, changes the MRO. +class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss): """Half squared error with identity link, for regression. 
Domain: @@ -460,7 +464,7 @@ def __init__(self, sample_weight=None): self.constant_hessian = False -class AbsoluteError(IdentityLink, BaseLoss, CyAbsoluteError): +class AbsoluteError(IdentityLink, CyAbsoluteError, BaseLoss): """Absolute error with identity link, for regression. Domain: @@ -497,7 +501,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): return _weighted_percentile(y_true, sample_weight, 50) -class PinballLoss(IdentityLink, BaseLoss, CyPinballLoss): +class PinballLoss(IdentityLink, CyPinballLoss, BaseLoss): """Quantile loss aka pinball loss, for regression. Domain: @@ -552,7 +556,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): return _weighted_percentile(y_true, sample_weight, 100 * self.quantile) -class HalfPoissonLoss(LogLink, BaseLoss, CyHalfPoissonLoss): +class HalfPoissonLoss(LogLink, CyHalfPoissonLoss, BaseLoss): """Poisson deviance loss with log-link, for regression. Domain: @@ -585,7 +589,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class HalfGammaLoss(LogLink, BaseLoss, CyHalfGammaLoss): +class HalfGammaLoss(LogLink, CyHalfGammaLoss, BaseLoss): """Gamma deviance loss with log-link, for regression. Domain: @@ -617,7 +621,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class HalfTweedieLoss(LogLink, BaseLoss, CyHalfTweedieLoss): +class HalfTweedieLoss(LogLink, CyHalfTweedieLoss, BaseLoss): """Tweedie deviance loss with log-link, for regression. Domain: @@ -680,7 +684,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class BinaryCrossEntropy(LogitLink, BaseLoss, CyBinaryCrossEntropy): +class BinaryCrossEntropy(LogitLink, CyBinaryCrossEntropy, BaseLoss): """Binary cross entropy loss with logit link, for binary classification. Domain: @@ -740,7 +744,7 @@ def predict_proba(self, raw_prediction): return proba -class CategoricalCrossEntropy(MultinomialLogit, BaseLoss, CyCategoricalCrossEntropy): +class CategoricalCrossEntropy(MultinomialLogit, CyCategoricalCrossEntropy, BaseLoss): """Categorical cross-entropy loss, for multiclass classification. Domain: From 202953b0bd5aa4c6a8ee5f8de2e2033baf997071 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 29 Aug 2021 16:09:19 +0200 Subject: [PATCH 095/143] ENH support const memoryviews by ReadonlyWrapper --- sklearn/_loss/_loss.pyx.tp | 32 ++++++++++++++++++++ sklearn/_loss/loss.py | 32 ++++++++++++++++++++ sklearn/_loss/tests/test_loss.py | 51 ++++++++++++++++++++++++-------- 3 files changed, 102 insertions(+), 13 deletions(-) diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 573b1bba8d47b..c32303c12b0c5 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -205,6 +205,38 @@ from libc.stdlib cimport malloc, free np.import_array() +# ------------------------------------- +# Readonly array wrapper +# ------------------------------------- +# TODO: Remove with Cython >= 3.0 which supports const memoryviews for fused types. +# +# This class supports the buffer protocol, thus can wrap arrays and memoryvies. +# All it does is LIE about the readonly attribute: tell it's false! +# This way, we can use it on arrays that we don't touch. +# !!! USE CAREFULLY !!! 
+ +from cpython cimport Py_buffer +from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, PyBUF_WRITABLE + +cdef class ReadonlyWrapper: + cdef object wraps + + def __init__(self, wraps): + self.wraps = wraps + + def __getbuffer__(self, Py_buffer *buffer, int flags): + request_for_writeable = False + if flags & PyBUF_WRITABLE: + flags ^= PyBUF_WRITABLE + request_for_writeable = True + PyObject_GetBuffer(self.wraps, buffer, flags) + if request_for_writeable: + buffer.readonly = False # This is a lie! + + def __releasebuffer__(self, Py_buffer *buffer): + PyBuffer_Release(buffer) + + # ------------------------------------- # Helper functions # ------------------------------------- diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index e6bc17609c9f7..4031be741dc65 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -27,6 +27,7 @@ CyHalfTweedieLoss, CyBinaryCrossEntropy, CyCategoricalCrossEntropy, + ReadonlyWrapper, # TODO: Remove with Cython >= 3.0 ) from .link import ( Interval, @@ -170,6 +171,13 @@ def loss( # Be graceful to shape (n_samples, 1) -> (n_samples,) if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) + + if not y_true.flags["WRITEABLE"]: + y_true = ReadonlyWrapper(y_true) + if not raw_prediction.flags["WRITEABLE"]: + raw_prediction = ReadonlyWrapper(raw_prediction) + if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: + sample_weight = ReadonlyWrapper(sample_weight) return self._loss( y_true=y_true, raw_prediction=raw_prediction, @@ -231,6 +239,12 @@ def loss_gradient( if gradient.ndim == 2 and gradient.shape[1] == 1: gradient = gradient.squeeze(1) + if not y_true.flags["WRITEABLE"]: + y_true = ReadonlyWrapper(y_true) + if not raw_prediction.flags["WRITEABLE"]: + raw_prediction = ReadonlyWrapper(raw_prediction) + if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: + sample_weight = ReadonlyWrapper(sample_weight) return self._loss_gradient( y_true=y_true, raw_prediction=raw_prediction, @@ -280,6 +294,12 @@ def gradient( if gradient.ndim == 2 and gradient.shape[1] == 1: gradient = gradient.squeeze(1) + if not y_true.flags["WRITEABLE"]: + y_true = ReadonlyWrapper(y_true) + if not raw_prediction.flags["WRITEABLE"]: + raw_prediction = ReadonlyWrapper(raw_prediction) + if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: + sample_weight = ReadonlyWrapper(sample_weight) return self._gradient( y_true=y_true, raw_prediction=raw_prediction, @@ -344,6 +364,12 @@ def gradient_hessian( if hessian.ndim == 2 and hessian.shape[1] == 1: hessian = hessian.squeeze(1) + if not y_true.flags["WRITEABLE"]: + y_true = ReadonlyWrapper(y_true) + if not raw_prediction.flags["WRITEABLE"]: + raw_prediction = ReadonlyWrapper(raw_prediction) + if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: + sample_weight = ReadonlyWrapper(sample_weight) return self._gradient_hessian( y_true=y_true, raw_prediction=raw_prediction, @@ -868,6 +894,12 @@ def gradient_proba( elif proba is None: proba = np.empty_like(gradient) + if not y_true.flags["WRITEABLE"]: + y_true = ReadonlyWrapper(y_true) + if not raw_prediction.flags["WRITEABLE"]: + raw_prediction = ReadonlyWrapper(raw_prediction) + if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: + sample_weight = ReadonlyWrapper(sample_weight) return self._gradient_proba( y_true=y_true, raw_prediction=raw_prediction, diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 
19d7dbf484455..f1d3ca76ef094 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -25,7 +25,7 @@ PinballLoss, ) from sklearn.utils import assert_all_finite -from sklearn.utils._testing import skip_if_32bit +from sklearn.utils._testing import create_memmap_backed_data, skip_if_32bit from sklearn.utils.fixes import sp_version, parse_version @@ -237,37 +237,48 @@ def test_loss_on_specific_values(loss, y_true, raw_prediction, loss_true): @pytest.mark.parametrize("loss", ALL_LOSSES) +@pytest.mark.parametrize("readonly_memmap", [False, True]) @pytest.mark.parametrize("dtype_in", [np.float32, np.float64]) @pytest.mark.parametrize("dtype_out", [np.float32, np.float64]) @pytest.mark.parametrize("sample_weight", [None, 1]) @pytest.mark.parametrize("out1", [None, 1]) @pytest.mark.parametrize("out2", [None, 1]) @pytest.mark.parametrize("n_threads", [1, 2]) -def test_loss_dtype(loss, dtype_in, dtype_out, sample_weight, out1, out2, n_threads): - """Test acceptance of dtypes in loss functions. +def test_loss_dtype_readonly( + loss, readonly_memmap, dtype_in, dtype_out, sample_weight, out1, out2, n_threads +): + """Test acceptance of dtypes and readonly arrays in loss functions. Check that loss accepts if all input arrays are either all float32 or all float64, and all output arrays are either all float32 or all float64. + + Also check that input arrays can be readonly, e.g. memory mapped. """ loss = loss() # generate a y_true and raw_prediction in valid range - if loss.is_multiclass: - y_true = np.array([0], dtype=dtype_in) - raw_prediction = np.full( - shape=(1, loss.n_classes), fill_value=0.0, dtype=dtype_in - ) - else: - low, high = _inclusive_low_high(loss.interval_y_true, dtype=dtype_in) - y_true = np.array([0.5 * (high - low)], dtype=dtype_in) - raw_prediction = np.array([0.0], dtype=dtype_in) + n_samples = 5 + y_true, raw_prediction = random_y_true_raw_prediction( + loss=loss, + n_samples=n_samples, + y_bound=(-100, 100), + raw_bound=(-10, 10), + seed=42, + ) + y_true = y_true.astype(dtype_in) + raw_prediction = raw_prediction.astype(dtype_in) if sample_weight is not None: - sample_weight = np.array([2.0], dtype=dtype_in) + sample_weight = np.array([2.0] * n_samples, dtype=dtype_in) if out1 is not None: out1 = np.empty_like(y_true, dtype=dtype_out) if out2 is not None: out2 = np.empty_like(raw_prediction, dtype=dtype_out) + if readonly_memmap: + y_true, raw_prediction = create_memmap_backed_data([y_true, raw_prediction]) + if sample_weight is not None: + sample_weight = create_memmap_backed_data(sample_weight) + loss.loss( y_true=y_true, raw_prediction=raw_prediction, @@ -300,6 +311,20 @@ def test_loss_dtype(loss, dtype_in, dtype_out, sample_weight, out1, out2, n_thre hessian=out2, n_threads=n_threads, ) + loss(y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight) + loss.fit_intercept_only(y_true=y_true, sample_weight=sample_weight) + loss.constant_to_optimal_zero(y_true=y_true, sample_weight=sample_weight) + if hasattr(loss, "predict_proba"): + loss.predict_proba(raw_prediction=raw_prediction) + if hasattr(loss, "gradient_proba"): + loss.gradient_proba( + y_true=y_true, + raw_prediction=raw_prediction, + sample_weight=sample_weight, + gradient=out1, + proba=out2, + n_threads=n_threads, + ) @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) From 9a89ff685b4d102945f46298f79c10b330d4ac35 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 30 Aug 2021 22:19:44 +0200 Subject: [PATCH 096/143] address review comments 
--- sklearn/_loss/_loss.pxd | 4 ++-- sklearn/_loss/link.py | 4 +++- sklearn/_loss/tests/test_link.py | 4 +++- sklearn/_loss/tests/test_loss.py | 10 +++++----- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/sklearn/_loss/_loss.pxd b/sklearn/_loss/_loss.pxd index b00379a1e793d..e2e44ca712b35 100644 --- a/sklearn/_loss/_loss.pxd +++ b/sklearn/_loss/_loss.pxd @@ -44,7 +44,7 @@ cdef class CyAbsoluteError(CyLossFunction): cdef class CyPinballLoss(CyLossFunction): - cdef readonly double quantile # readonly makes it inherited by children + cdef readonly double quantile # readonly makes it accessible from Python cdef double cy_loss(self, double y_true, double raw_prediction) nogil cdef double cy_gradient(self, double y_true, double raw_prediction) nogil cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil @@ -63,7 +63,7 @@ cdef class CyHalfGammaLoss(CyLossFunction): cdef class CyHalfTweedieLoss(CyLossFunction): - cdef readonly double power # readonly makes it inherited by children + cdef readonly double power # readonly makes it accessible from Python cdef double cy_loss(self, double y_true, double raw_prediction) nogil cdef double cy_gradient(self, double y_true, double raw_prediction) nogil cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index b756e275c6d0e..7dd40876a5683 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -22,7 +22,9 @@ class Interval: def __post_init__(self): """Check that low <= high""" if self.low > self.high: - raise ValueError("On must have low <= high; got low={low}, high={high}.") + raise ValueError( + f"On must have low <= high; got low={self.low}, high={self.high}." + ) def includes(self, x): """Test whether all values of x are in interval range. diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index 3239ade25f3c7..4c0fc44060cbb 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -15,7 +15,9 @@ def test_interval_raises(): """Test that interval with low > high raises ValueError.""" - with pytest.raises(ValueError, match="On must have low <= high"): + with pytest.raises( + ValueError, match="On must have low <= high; got low=1, high=0." 
+ ): Interval(1, 0, False, False) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index f1d3ca76ef094..98416099a28b3 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -233,7 +233,7 @@ def test_loss_on_specific_values(loss, y_true, raw_prediction, loss_true): """Test losses at specific values.""" assert loss( y_true=np.array([y_true]), raw_prediction=np.array([raw_prediction]) - ) == approx(loss_true) + ) == approx(loss_true, rel=1e-11, abs=1e-12) @pytest.mark.parametrize("loss", ALL_LOSSES) @@ -487,7 +487,7 @@ def test_loss_gradients_are_the_same(loss, sample_weight): assert_allclose(g1, out_g4) assert_allclose(g1, g4) assert_allclose(proba, out_proba) - assert_allclose(np.sum(proba, axis=1), 1) + assert_allclose(np.sum(proba, axis=1), 1, rtol=1e-11) @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) @@ -563,7 +563,7 @@ def test_sample_weight_multiplies(loss, sample_weight): @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) def test_graceful_squeezing(loss): - """Test that Python and Cython functions return same results.""" + """Test that reshaped raw_prediction gives same results.""" y_true, raw_prediction = random_y_true_raw_prediction( loss=loss, n_samples=20, @@ -962,7 +962,7 @@ def test_predict_proba(loss): if hasattr(loss, "predict_proba"): proba = loss.predict_proba(raw_prediction) assert proba.shape == (n_samples, loss.n_classes) - assert np.sum(proba, axis=1) == approx(1) + assert np.sum(proba, axis=1) == approx(1, rel=1e-11) if hasattr(loss, "gradient_proba"): for grad, proba in ( @@ -979,7 +979,7 @@ def test_predict_proba(loss): proba=proba, ) assert proba.shape == (n_samples, loss.n_classes) - assert np.sum(proba, axis=1) == approx(1) + assert np.sum(proba, axis=1) == approx(1, rel=1e-11) assert_allclose( grad, loss.gradient( From 5a54fbe9d6e35dc657069510803661449628ce6f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 31 Aug 2021 20:10:10 +0200 Subject: [PATCH 097/143] CLN nitpick --- sklearn/_loss/tests/test_loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 98416099a28b3..f10635b64b123 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -244,10 +244,10 @@ def test_loss_on_specific_values(loss, y_true, raw_prediction, loss_true): @pytest.mark.parametrize("out1", [None, 1]) @pytest.mark.parametrize("out2", [None, 1]) @pytest.mark.parametrize("n_threads", [1, 2]) -def test_loss_dtype_readonly( +def test_loss_dtype( loss, readonly_memmap, dtype_in, dtype_out, sample_weight, out1, out2, n_threads ): - """Test acceptance of dtypes and readonly arrays in loss functions. + """Test acceptance of dtypes, readonly and writeable arrays in loss functions. Check that loss accepts if all input arrays are either all float32 or all float64, and all output arrays are either all float32 or all float64. 
From 2820be5eabf5725c972cbbc68e6a4e4023489cc6 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 31 Aug 2021 20:15:49 +0200 Subject: [PATCH 098/143] CLN import ReadonlyWrapper from utils --- sklearn/_loss/_loss.pyx.tp | 32 -------------------------------- sklearn/_loss/loss.py | 2 +- 2 files changed, 1 insertion(+), 33 deletions(-) diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index c32303c12b0c5..573b1bba8d47b 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -205,38 +205,6 @@ from libc.stdlib cimport malloc, free np.import_array() -# ------------------------------------- -# Readonly array wrapper -# ------------------------------------- -# TODO: Remove with Cython >= 3.0 which supports const memoryviews for fused types. -# -# This class supports the buffer protocol, thus can wrap arrays and memoryvies. -# All it does is LIE about the readonly attribute: tell it's false! -# This way, we can use it on arrays that we don't touch. -# !!! USE CAREFULLY !!! - -from cpython cimport Py_buffer -from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, PyBUF_WRITABLE - -cdef class ReadonlyWrapper: - cdef object wraps - - def __init__(self, wraps): - self.wraps = wraps - - def __getbuffer__(self, Py_buffer *buffer, int flags): - request_for_writeable = False - if flags & PyBUF_WRITABLE: - flags ^= PyBUF_WRITABLE - request_for_writeable = True - PyObject_GetBuffer(self.wraps, buffer, flags) - if request_for_writeable: - buffer.readonly = False # This is a lie! - - def __releasebuffer__(self, Py_buffer *buffer): - PyBuffer_Release(buffer) - - # ------------------------------------- # Helper functions # ------------------------------------- diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 4031be741dc65..ed129695cab8d 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -27,7 +27,6 @@ CyHalfTweedieLoss, CyBinaryCrossEntropy, CyCategoricalCrossEntropy, - ReadonlyWrapper, # TODO: Remove with Cython >= 3.0 ) from .link import ( Interval, @@ -37,6 +36,7 @@ LogitLink, MultinomialLogit, ) +from ..utils._readonly_array_wrapper import ReadonlyWrapper from ..utils.stats import _weighted_percentile From 45250853322bf56aaaa6e986ce134d2a45076a78 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 6 Sep 2021 18:34:03 +0200 Subject: [PATCH 099/143] MNT replace ReadonlyWrapper by ReadonlyArrayWrapper --- sklearn/_loss/loss.py | 52 +++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index ed129695cab8d..95f646bbaf773 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -36,7 +36,7 @@ LogitLink, MultinomialLogit, ) -from ..utils._readonly_array_wrapper import ReadonlyWrapper +from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper from ..utils.stats import _weighted_percentile @@ -172,12 +172,10 @@ def loss( if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) - if not y_true.flags["WRITEABLE"]: - y_true = ReadonlyWrapper(y_true) - if not raw_prediction.flags["WRITEABLE"]: - raw_prediction = ReadonlyWrapper(raw_prediction) - if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: - sample_weight = ReadonlyWrapper(sample_weight) + y_true = ReadonlyArrayWrapper(y_true) + raw_prediction = ReadonlyArrayWrapper(raw_prediction) + if sample_weight is not None: + sample_weight = ReadonlyArrayWrapper(sample_weight) return self._loss( 
y_true=y_true, raw_prediction=raw_prediction, @@ -239,12 +237,10 @@ def loss_gradient( if gradient.ndim == 2 and gradient.shape[1] == 1: gradient = gradient.squeeze(1) - if not y_true.flags["WRITEABLE"]: - y_true = ReadonlyWrapper(y_true) - if not raw_prediction.flags["WRITEABLE"]: - raw_prediction = ReadonlyWrapper(raw_prediction) - if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: - sample_weight = ReadonlyWrapper(sample_weight) + y_true = ReadonlyArrayWrapper(y_true) + raw_prediction = ReadonlyArrayWrapper(raw_prediction) + if sample_weight is not None: + sample_weight = ReadonlyArrayWrapper(sample_weight) return self._loss_gradient( y_true=y_true, raw_prediction=raw_prediction, @@ -294,12 +290,10 @@ def gradient( if gradient.ndim == 2 and gradient.shape[1] == 1: gradient = gradient.squeeze(1) - if not y_true.flags["WRITEABLE"]: - y_true = ReadonlyWrapper(y_true) - if not raw_prediction.flags["WRITEABLE"]: - raw_prediction = ReadonlyWrapper(raw_prediction) - if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: - sample_weight = ReadonlyWrapper(sample_weight) + y_true = ReadonlyArrayWrapper(y_true) + raw_prediction = ReadonlyArrayWrapper(raw_prediction) + if sample_weight is not None: + sample_weight = ReadonlyArrayWrapper(sample_weight) return self._gradient( y_true=y_true, raw_prediction=raw_prediction, @@ -364,12 +358,10 @@ def gradient_hessian( if hessian.ndim == 2 and hessian.shape[1] == 1: hessian = hessian.squeeze(1) - if not y_true.flags["WRITEABLE"]: - y_true = ReadonlyWrapper(y_true) - if not raw_prediction.flags["WRITEABLE"]: - raw_prediction = ReadonlyWrapper(raw_prediction) - if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: - sample_weight = ReadonlyWrapper(sample_weight) + y_true = ReadonlyArrayWrapper(y_true) + raw_prediction = ReadonlyArrayWrapper(raw_prediction) + if sample_weight is not None: + sample_weight = ReadonlyArrayWrapper(sample_weight) return self._gradient_hessian( y_true=y_true, raw_prediction=raw_prediction, @@ -894,12 +886,10 @@ def gradient_proba( elif proba is None: proba = np.empty_like(gradient) - if not y_true.flags["WRITEABLE"]: - y_true = ReadonlyWrapper(y_true) - if not raw_prediction.flags["WRITEABLE"]: - raw_prediction = ReadonlyWrapper(raw_prediction) - if sample_weight is not None and not sample_weight.flags["WRITEABLE"]: - sample_weight = ReadonlyWrapper(sample_weight) + y_true = ReadonlyArrayWrapper(y_true) + raw_prediction = ReadonlyArrayWrapper(raw_prediction) + if sample_weight is not None: + sample_weight = ReadonlyArrayWrapper(sample_weight) return self._gradient_proba( y_true=y_true, raw_prediction=raw_prediction, From 439ee83b83ff472a169dce3a9300f3cb5b3fa6c8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 6 Sep 2021 19:29:57 +0200 Subject: [PATCH 100/143] trigger CI From a201fd0adb3d2cf2a08e90fad6185ffc59eb138f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 23 Oct 2021 20:15:57 +0200 Subject: [PATCH 101/143] MNT rename out parameters For instance, loss -> loss_out and so on. 
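The rename below only changes the names of the preallocated output buffers (loss -> loss_out, gradient -> gradient_out, hessian -> hessian_out, proba -> proba_out); the calling convention is unchanged. A short sketch of how a caller passes such buffers after this patch, assuming the branch is built and using the private sklearn._loss module and the HalfSquaredError class shown in these diffs (illustrative, not a stable public API):

    import numpy as np
    from sklearn._loss.loss import HalfSquaredError  # private module on this branch

    loss = HalfSquaredError()
    y_true = np.array([0.0, 1.0, 2.0])
    raw_prediction = np.array([0.5, 1.0, 1.5])

    # Preallocate once and reuse across calls, e.g. over boosting iterations.
    loss_out = np.empty_like(y_true)
    gradient_out = np.empty_like(raw_prediction)

    loss.loss_gradient(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=None,
        loss_out=loss_out,          # formerly `loss=...`
        gradient_out=gradient_out,  # formerly `gradient=...`
    )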
--- sklearn/_loss/_loss.pyx.tp | 146 +++++++++++++++---------------- sklearn/_loss/loss.py | 116 ++++++++++++------------ sklearn/_loss/tests/test_loss.py | 62 ++++++------- 3 files changed, 161 insertions(+), 163 deletions(-) diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 573b1bba8d47b..1c164fdf32759 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -645,7 +645,7 @@ cdef class CyLossFunction: Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] loss, # OUT + G_DTYPE_C[::1] loss_out, # OUT int n_threads=1 ): """Compute the pointwise loss value for each input. @@ -658,7 +658,7 @@ cdef class CyLossFunction: Raw prediction values (in link space). sample_weight : array of shape (n_samples,) or None Sample weights. - loss : array of shape (n_samples,) + loss_out : array of shape (n_samples,) A location into which the result is stored. n_threads : int Number of threads used by OpenMP (if any). @@ -675,7 +675,7 @@ cdef class CyLossFunction: Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] gradient, # OUT + G_DTYPE_C[::1] gradient_out, # OUT int n_threads=1 ): """Compute gradient of loss w.r.t raw_prediction for each input. @@ -688,7 +688,7 @@ cdef class CyLossFunction: Raw prediction values (in link space). sample_weight : array of shape (n_samples,) or None Sample weights. - gradient : array of shape (n_samples,) + gradient_out : array of shape (n_samples,) A location into which the result is stored. n_threads : int Number of threads used by OpenMP (if any). @@ -705,8 +705,8 @@ cdef class CyLossFunction: Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] loss, # OUT - G_DTYPE_C[::1] gradient, # OUT + G_DTYPE_C[::1] loss_out, # OUT + G_DTYPE_C[::1] gradient_out, # OUT int n_threads=1 ): """Compute loss and gradient of loss w.r.t raw_prediction. @@ -719,9 +719,9 @@ cdef class CyLossFunction: Raw prediction values (in link space). sample_weight : array of shape (n_samples,) or None Sample weights. - loss : array of shape (n_samples,) or None + loss_out : array of shape (n_samples,) or None A location into which the element-wise loss is stored. - gradient : array of shape (n_samples,) + gradient_out : array of shape (n_samples,) A location into which the gradient is stored. n_threads : int Number of threads used by OpenMP (if any). @@ -734,19 +734,17 @@ cdef class CyLossFunction: gradient : array of shape (n_samples,) Element-wise gradients. """ - self._loss(y_true, raw_prediction, sample_weight, loss, - n_threads) - self._gradient(y_true, raw_prediction, sample_weight, gradient, - n_threads) - return np.asarray(loss), np.asarray(gradient) + self._loss(y_true, raw_prediction, sample_weight, loss_out, n_threads) + self._gradient(y_true, raw_prediction, sample_weight, gradient_out, n_threads) + return np.asarray(loss_out), np.asarray(gradient_out) def _gradient_hessian( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] gradient, # OUT - G_DTYPE_C[::1] hessian, # OUT + G_DTYPE_C[::1] gradient_out, # OUT + G_DTYPE_C[::1] hessian_out, # OUT int n_threads=1 ): """Compute gradient and hessian of loss w.r.t raw_prediction. @@ -759,9 +757,9 @@ cdef class CyLossFunction: Raw prediction values (in link space). sample_weight : array of shape (n_samples,) or None Sample weights. 
- gradient : array of shape (n_samples,) + gradient_out : array of shape (n_samples,) A location into which the gradient is stored. - hessian : array of shape (n_samples,) + hessian_out : array of shape (n_samples,) A location into which the hessian is stored. n_threads : int Number of threads used by OpenMP (if any). @@ -807,7 +805,7 @@ cdef class {{name}}(CyLossFunction): Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] loss, # OUT + G_DTYPE_C[::1] loss_out, # OUT int n_threads=1 ): cdef: @@ -818,14 +816,14 @@ cdef class {{name}}(CyLossFunction): for i in prange( n_samples, schedule='static', nogil=True, num_threads=n_threads ): - loss[i] = {{closs}}(y_true[i], raw_prediction[i]{{with_param}}) + loss_out[i] = {{closs}}(y_true[i], raw_prediction[i]{{with_param}}) else: for i in prange( n_samples, schedule='static', nogil=True, num_threads=n_threads ): - loss[i] = sample_weight[i] * {{closs}}(y_true[i], raw_prediction[i]{{with_param}}) + loss_out[i] = sample_weight[i] * {{closs}}(y_true[i], raw_prediction[i]{{with_param}}) - return np.asarray(loss) + return np.asarray(loss_out) {{if closs_grad is not None}} def _loss_gradient( @@ -833,8 +831,8 @@ cdef class {{name}}(CyLossFunction): Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] loss, # OUT - G_DTYPE_C[::1] gradient, # OUT + G_DTYPE_C[::1] loss_out, # OUT + G_DTYPE_C[::1] gradient_out, # OUT int n_threads=1 ): cdef: @@ -847,17 +845,17 @@ cdef class {{name}}(CyLossFunction): n_samples, schedule='static', nogil=True, num_threads=n_threads ): dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}}) - loss[i] = dbl2.val1 - gradient[i] = dbl2.val2 + loss_out[i] = dbl2.val1 + gradient_out[i] = dbl2.val2 else: for i in prange( n_samples, schedule='static', nogil=True, num_threads=n_threads ): dbl2 = {{closs_grad}}(y_true[i], raw_prediction[i]{{with_param}}) - loss[i] = sample_weight[i] * dbl2.val1 - gradient[i] = sample_weight[i] * dbl2.val2 + loss_out[i] = sample_weight[i] * dbl2.val1 + gradient_out[i] = sample_weight[i] * dbl2.val2 - return np.asarray(loss), np.asarray(gradient) + return np.asarray(loss_out), np.asarray(gradient_out) {{endif}} def _gradient( @@ -865,7 +863,7 @@ cdef class {{name}}(CyLossFunction): Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] gradient, # OUT + G_DTYPE_C[::1] gradient_out, # OUT int n_threads=1 ): cdef: @@ -876,22 +874,22 @@ cdef class {{name}}(CyLossFunction): for i in prange( n_samples, schedule='static', nogil=True, num_threads=n_threads ): - gradient[i] = {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}}) + gradient_out[i] = {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}}) else: for i in prange( n_samples, schedule='static', nogil=True, num_threads=n_threads ): - gradient[i] = sample_weight[i] * {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}}) + gradient_out[i] = sample_weight[i] * {{cgrad}}(y_true[i], raw_prediction[i]{{with_param}}) - return np.asarray(gradient) + return np.asarray(gradient_out) def _gradient_hessian( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] gradient, # OUT - G_DTYPE_C[::1] hessian, # OUT + G_DTYPE_C[::1] gradient_out, # OUT + G_DTYPE_C[::1] hessian_out, # OUT int n_threads=1 ): cdef: @@ -904,17 +902,17 @@ cdef class {{name}}(CyLossFunction): n_samples, schedule='static', nogil=True, 
num_threads=n_threads ): dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}}) - gradient[i] = dbl2.val1 - hessian[i] = dbl2.val2 + gradient_out[i] = dbl2.val1 + hessian_out[i] = dbl2.val2 else: for i in prange( n_samples, schedule='static', nogil=True, num_threads=n_threads ): dbl2 = {{cgrad_hess}}(y_true[i], raw_prediction[i]{{with_param}}) - gradient[i] = sample_weight[i] * dbl2.val1 - hessian[i] = sample_weight[i] * dbl2.val2 + gradient_out[i] = sample_weight[i] * dbl2.val1 + hessian_out[i] = sample_weight[i] * dbl2.val2 - return np.asarray(gradient), np.asarray(hessian) + return np.asarray(gradient_out), np.asarray(hessian_out) {{endfor}} @@ -941,7 +939,7 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[:, :] raw_prediction, # IN Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] loss, # OUT + G_DTYPE_C[::1] loss_out, # OUT int n_threads=1 ): cdef: @@ -967,12 +965,12 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): sum_exp_minus_max(i, raw_prediction, p) max_value = p[n_classes] # p[-2] sum_exps = p[n_classes + 1] # p[-1] - loss[i] = log(sum_exps) + max_value + loss_out[i] = log(sum_exps) + max_value for k in range(n_classes): # label decode y_true if y_true[i] == k: - loss[i] -= raw_prediction[i, k] + loss_out[i] -= raw_prediction[i, k] free(p) else: @@ -983,26 +981,26 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): sum_exp_minus_max(i, raw_prediction, p) max_value = p[n_classes] # p[-2] sum_exps = p[n_classes + 1] # p[-1] - loss[i] = log(sum_exps) + max_value + loss_out[i] = log(sum_exps) + max_value for k in range(n_classes): # label decode y_true if y_true[i] == k: - loss[i] -= raw_prediction[i, k] + loss_out[i] -= raw_prediction[i, k] - loss[i] *= sample_weight[i] + loss_out[i] *= sample_weight[i] free(p) - return np.asarray(loss) + return np.asarray(loss_out) def _loss_gradient( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[:, :] raw_prediction, # IN Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[::1] loss, # OUT - G_DTYPE_C[:, :] gradient, # OUT + G_DTYPE_C[::1] loss_out, # OUT + G_DTYPE_C[:, :] gradient_out, # OUT int n_threads=1 ): cdef: @@ -1023,15 +1021,15 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): sum_exp_minus_max(i, raw_prediction, p) max_value = p[n_classes] # p[-2] sum_exps = p[n_classes + 1] # p[-1] - loss[i] = log(sum_exps) + max_value + loss_out[i] = log(sum_exps) + max_value for k in range(n_classes): # label decode y_true if y_true [i] == k: - loss[i] -= raw_prediction[i, k] + loss_out[i] -= raw_prediction[i, k] p[k] /= sum_exps # p_k = y_pred_k = prob of class k # gradient_k = p_k - (y_true == k) - gradient[i, k] = p[k] - (y_true[i] == k) + gradient_out[i, k] = p[k] - (y_true[i] == k) free(p) else: @@ -1042,28 +1040,28 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): sum_exp_minus_max(i, raw_prediction, p) max_value = p[n_classes] # p[-2] sum_exps = p[n_classes + 1] # p[-1] - loss[i] = log(sum_exps) + max_value + loss_out[i] = log(sum_exps) + max_value for k in range(n_classes): # label decode y_true if y_true [i] == k: - loss[i] -= raw_prediction[i, k] + loss_out[i] -= raw_prediction[i, k] p[k] /= sum_exps # p_k = y_pred_k = prob of class k # gradient_k = (p_k - (y_true == k)) * sw - gradient[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] - loss[i] *= sample_weight[i] + loss_out[i] *= sample_weight[i] free(p) - return np.asarray(loss), np.asarray(gradient) + return np.asarray(loss_out), 
np.asarray(gradient_out) def _gradient( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[:, :] raw_prediction, # IN Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[:, :] gradient, # OUT + G_DTYPE_C[:, :] gradient_out, # OUT int n_threads=1 ): cdef: @@ -1087,7 +1085,7 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): for k in range(n_classes): p[k] /= sum_exps # p_k = y_pred_k = prob of class k # gradient_k = y_pred_k - (y_true == k) - gradient[i, k] = p[k] - (y_true[i] == k) + gradient_out[i, k] = p[k] - (y_true[i] == k) free(p) else: @@ -1101,19 +1099,19 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): for k in range(n_classes): p[k] /= sum_exps # p_k = y_pred_k = prob of class k # gradient_k = (p_k - (y_true == k)) * sw - gradient[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] free(p) - return np.asarray(gradient) + return np.asarray(gradient_out) def _gradient_hessian( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[:, :] raw_prediction, # IN Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[:, :] gradient, # OUT - G_DTYPE_C[:, :] hessian, # OUT + G_DTYPE_C[:, :] gradient_out, # OUT + G_DTYPE_C[:, :] hessian_out, # OUT int n_threads=1 ): cdef: @@ -1138,8 +1136,8 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): p[k] /= sum_exps # p_k = y_pred_k = prob of class k # hessian_k = p_k * (1 - p_k) # gradient_k = p_k - (y_true == k) - gradient[i, k] = p[k] - (y_true[i] == k) - hessian[i, k] = p[k] * (1. - p[k]) + gradient_out[i, k] = p[k] - (y_true[i] == k) + hessian_out[i, k] = p[k] * (1. - p[k]) free(p) else: @@ -1154,12 +1152,12 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): p[k] /= sum_exps # p_k = y_pred_k = prob of class k # gradient_k = (p_k - (y_true == k)) * sw # hessian_k = p_k * (1 - p_k) * sw - gradient[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] - hessian[i, k] = (p[k] * (1. - p[k])) * sample_weight[i] + gradient_out[i, k] = (p[k] - (y_true[i] == k)) * sample_weight[i] + hessian_out[i, k] = (p[k] * (1. 
- p[k])) * sample_weight[i] free(p) - return np.asarray(gradient), np.asarray(hessian) + return np.asarray(gradient_out), np.asarray(hessian_out) # This method simplifies the implementation of hessp in linear models, @@ -1170,8 +1168,8 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[:, :] raw_prediction, # IN Y_DTYPE_C[::1] sample_weight, # IN - G_DTYPE_C[:, :] gradient, # OUT - G_DTYPE_C[:, :] proba, # OUT + G_DTYPE_C[:, :] gradient_out, # OUT + G_DTYPE_C[:, :] proba_out, # OUT int n_threads=1 ): cdef: @@ -1193,9 +1191,9 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): sum_exps = p[n_classes + 1] # p[-1] for k in range(n_classes): - proba[i, k] = p[k] / sum_exps # y_pred_k = prob of class k + proba_out[i, k] = p[k] / sum_exps # y_pred_k = prob of class k # gradient_k = y_pred_k - (y_true == k) - gradient[i, k] = proba[i, k] - (y_true[i] == k) + gradient_out[i, k] = proba_out[i, k] - (y_true[i] == k) free(p) else: @@ -1207,10 +1205,10 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): sum_exps = p[n_classes + 1] # p[-1] for k in range(n_classes): - proba[i, k] = p[k] / sum_exps # y_pred_k = prob of class k + proba_out[i, k] = p[k] / sum_exps # y_pred_k = prob of class k # gradient_k = (p_k - (y_true == k)) * sw - gradient[i, k] = (proba[i, k] - (y_true[i] == k)) * sample_weight[i] + gradient_out[i, k] = (proba_out[i, k] - (y_true[i] == k)) * sample_weight[i] free(p) - return np.asarray(gradient), np.asarray(proba) + return np.asarray(gradient_out), np.asarray(proba_out) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 95f646bbaf773..bf9609a1a2a27 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -141,7 +141,7 @@ def loss( y_true, raw_prediction, sample_weight=None, - loss=None, + loss_out=None, n_threads=1, ): """Compute the pointwise loss value for each input. @@ -155,7 +155,7 @@ def loss( Raw prediction values (in link space). sample_weight : None or C-contiguous array of shape (n_samples,) Sample weights. - loss : None or C-contiguous array of shape (n_samples,) + loss_out : None or C-contiguous array of shape (n_samples,) A location into which the result is stored. If None, a new array might be created. n_threads : int, default=1 @@ -166,8 +166,8 @@ def loss( loss : array of shape (n_samples,) Element-wise loss function. """ - if loss is None: - loss = np.empty_like(y_true) + if loss_out is None: + loss_out = np.empty_like(y_true) # Be graceful to shape (n_samples, 1) -> (n_samples,) if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) @@ -180,7 +180,7 @@ def loss( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - loss=loss, + loss_out=loss_out, n_threads=n_threads, ) @@ -189,8 +189,8 @@ def loss_gradient( y_true, raw_prediction, sample_weight=None, - loss=None, - gradient=None, + loss_out=None, + gradient_out=None, n_threads=1, ): """Compute loss and gradient w.r.t. raw_prediction for each input. @@ -204,10 +204,10 @@ def loss_gradient( Raw prediction values (in link space). sample_weight : None or C-contiguous array of shape (n_samples,) Sample weights. - loss : None or C-contiguous array of shape (n_samples,) + loss_out : None or C-contiguous array of shape (n_samples,) A location into which the loss is stored. If None, a new array might be created. 
- gradient : None or C-contiguous array of shape (n_samples,) or array \ + gradient_out : None or C-contiguous array of shape (n_samples,) or array \ of shape (n_samples, n_classes) A location into which the gradient is stored. If None, a new array might be created. @@ -222,20 +222,20 @@ def loss_gradient( gradient : array of shape (n_samples,) or (n_samples, n_classes) Element-wise gradients. """ - if loss is None: - if gradient is None: - loss = np.empty_like(y_true) - gradient = np.empty_like(raw_prediction) + if loss_out is None: + if gradient_out is None: + loss_out = np.empty_like(y_true) + gradient_out = np.empty_like(raw_prediction) else: - loss = np.empty_like(y_true, dtype=gradient.dtype) - elif gradient is None: - gradient = np.empty_like(raw_prediction, dtype=loss.dtype) + loss_out = np.empty_like(y_true, dtype=gradient_out.dtype) + elif gradient_out is None: + gradient_out = np.empty_like(raw_prediction, dtype=loss_out.dtype) # Be graceful to shape (n_samples, 1) -> (n_samples,) if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) - if gradient.ndim == 2 and gradient.shape[1] == 1: - gradient = gradient.squeeze(1) + if gradient_out.ndim == 2 and gradient_out.shape[1] == 1: + gradient_out = gradient_out.squeeze(1) y_true = ReadonlyArrayWrapper(y_true) raw_prediction = ReadonlyArrayWrapper(raw_prediction) @@ -245,8 +245,8 @@ def loss_gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - loss=loss, - gradient=gradient, + loss_out=loss_out, + gradient_out=gradient_out, n_threads=n_threads, ) @@ -255,7 +255,7 @@ def gradient( y_true, raw_prediction, sample_weight=None, - gradient=None, + gradient_out=None, n_threads=1, ): """Compute gradient of loss w.r.t raw_prediction for each input. @@ -269,7 +269,7 @@ def gradient( Raw prediction values (in link space). sample_weight : None or C-contiguous array of shape (n_samples,) Sample weights. - gradient : None or C-contiguous array of shape (n_samples,) or array \ + gradient_out : None or C-contiguous array of shape (n_samples,) or array \ of shape (n_samples, n_classes) A location into which the result is stored. If None, a new array might be created. @@ -281,14 +281,14 @@ def gradient( gradient : array of shape (n_samples,) or (n_samples, n_classes) Element-wise gradients. """ - if gradient is None: - gradient = np.empty_like(raw_prediction) + if gradient_out is None: + gradient_out = np.empty_like(raw_prediction) # Be graceful to shape (n_samples, 1) -> (n_samples,) if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) - if gradient.ndim == 2 and gradient.shape[1] == 1: - gradient = gradient.squeeze(1) + if gradient_out.ndim == 2 and gradient_out.shape[1] == 1: + gradient_out = gradient_out.squeeze(1) y_true = ReadonlyArrayWrapper(y_true) raw_prediction = ReadonlyArrayWrapper(raw_prediction) @@ -298,7 +298,7 @@ def gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - gradient=gradient, + gradient_out=gradient_out, n_threads=n_threads, ) @@ -307,8 +307,8 @@ def gradient_hessian( y_true, raw_prediction, sample_weight=None, - gradient=None, - hessian=None, + gradient_out=None, + hessian_out=None, n_threads=1, ): """Compute gradient and hessian of loss w.r.t raw_prediction. @@ -322,11 +322,11 @@ def gradient_hessian( Raw prediction values (in link space). sample_weight : None or C-contiguous array of shape (n_samples,) Sample weights. 
- gradient : None or C-contiguous array of shape (n_samples,) or array \ + gradient_out : None or C-contiguous array of shape (n_samples,) or array \ of shape (n_samples, n_classes) A location into which the gradient is stored. If None, a new array might be created. - hessian : None or C-contiguous array of shape (n_samples,) or array \ + hessian_out : None or C-contiguous array of shape (n_samples,) or array \ of shape (n_samples, n_classes) A location into which the hessian is stored. If None, a new array might be created. @@ -341,22 +341,22 @@ def gradient_hessian( hessian : arrays of shape (n_samples,) or (n_samples, n_classes) Element-wise hessians. """ - if gradient is None: - if hessian is None: - gradient = np.empty_like(raw_prediction) - hessian = np.empty_like(raw_prediction) + if gradient_out is None: + if hessian_out is None: + gradient_out = np.empty_like(raw_prediction) + hessian_out = np.empty_like(raw_prediction) else: - gradient = np.empty_like(hessian) - elif hessian is None: - hessian = np.empty_like(gradient) + gradient_out = np.empty_like(hessian_out) + elif hessian_out is None: + hessian_out = np.empty_like(gradient_out) # Be graceful to shape (n_samples, 1) -> (n_samples,) if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) - if gradient.ndim == 2 and gradient.shape[1] == 1: - gradient = gradient.squeeze(1) - if hessian.ndim == 2 and hessian.shape[1] == 1: - hessian = hessian.squeeze(1) + if gradient_out.ndim == 2 and gradient_out.shape[1] == 1: + gradient_out = gradient_out.squeeze(1) + if hessian_out.ndim == 2 and hessian_out.shape[1] == 1: + hessian_out = hessian_out.squeeze(1) y_true = ReadonlyArrayWrapper(y_true) raw_prediction = ReadonlyArrayWrapper(raw_prediction) @@ -366,8 +366,8 @@ def gradient_hessian( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - gradient=gradient, - hessian=hessian, + gradient_out=gradient_out, + hessian_out=hessian_out, n_threads=n_threads, ) @@ -396,7 +396,7 @@ def __call__(self, y_true, raw_prediction, sample_weight=None, n_threads=1): y_true=y_true, raw_prediction=raw_prediction, sample_weight=None, - loss=None, + loss_out=None, n_threads=n_threads, ), weights=sample_weight, @@ -846,8 +846,8 @@ def gradient_proba( y_true, raw_prediction, sample_weight=None, - gradient=None, - proba=None, + gradient_out=None, + proba_out=None, n_threads=1, ): """Compute gradient and class probabilities fow raw_prediction. @@ -860,10 +860,10 @@ def gradient_proba( Raw prediction values (in link space). sample_weight : None or C-contiguous array of shape (n_samples,) Sample weights. - gradient : None or array of shape (n_samples, n_classes) + gradient_out : None or array of shape (n_samples, n_classes) A location into which the gradient is stored. If None, a new array might be created. - proba : None or array of shape (n_samples, n_classes) + proba_out : None or array of shape (n_samples, n_classes) A location into which the class probabilities are stored. If None, a new array might be created. n_threads : int, default=1 @@ -877,14 +877,14 @@ def gradient_proba( proba : array of shape (n_samples, n_classes) Element-wise class probabilites. 
""" - if gradient is None: - if proba is None: - gradient = np.empty_like(raw_prediction) - proba = np.empty_like(raw_prediction) + if gradient_out is None: + if proba_out is None: + gradient_out = np.empty_like(raw_prediction) + proba_out = np.empty_like(raw_prediction) else: - gradient = np.empty_like(proba) - elif proba is None: - proba = np.empty_like(gradient) + gradient_out = np.empty_like(proba_out) + elif proba_out is None: + proba_out = np.empty_like(gradient_out) y_true = ReadonlyArrayWrapper(y_true) raw_prediction = ReadonlyArrayWrapper(raw_prediction) @@ -894,8 +894,8 @@ def gradient_proba( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - gradient=gradient, - proba=proba, + gradient_out=gradient_out, + proba_out=proba_out, n_threads=n_threads, ) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index f10635b64b123..95c814162554c 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -283,22 +283,22 @@ def test_loss_dtype( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - loss=out1, + loss_out=out1, n_threads=n_threads, ) loss.gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - gradient=out2, + gradient_out=out2, n_threads=n_threads, ) loss.loss_gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - loss=out1, - gradient=out2, + loss_out=out1, + gradient_out=out2, n_threads=n_threads, ) if out1 is not None and loss.is_multiclass: @@ -307,8 +307,8 @@ def test_loss_dtype( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - gradient=out1, - hessian=out2, + gradient_out=out1, + hessian_out=out2, n_threads=n_threads, ) loss(y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight) @@ -321,8 +321,8 @@ def test_loss_dtype( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - gradient=out1, - proba=out2, + gradient_out=out1, + proba_out=out2, n_threads=n_threads, ) @@ -352,13 +352,13 @@ def test_loss_same_as_C_functions(loss, sample_weight): y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - loss=out_l1, + loss_out=out_l1, ), loss._loss( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - loss=out_l2, + loss_out=out_l2, ), ) assert_allclose( @@ -366,28 +366,28 @@ def test_loss_same_as_C_functions(loss, sample_weight): y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - gradient=out_g1, + gradient_out=out_g1, ), loss._gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - gradient=out_g2, + gradient_out=out_g2, ), ) loss.loss_gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - loss=out_l1, - gradient=out_g1, + loss_out=out_l1, + gradient_out=out_g1, ) loss._loss_gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - loss=out_l2, - gradient=out_g2, + loss_out=out_l2, + gradient_out=out_g2, ) assert_allclose(out_l1, out_l2) assert_allclose(out_g1, out_g2) @@ -395,15 +395,15 @@ def test_loss_same_as_C_functions(loss, sample_weight): y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - gradient=out_g1, - hessian=out_h1, + gradient_out=out_g1, + hessian_out=out_h1, ) loss._gradient_hessian( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - gradient=out_g2, - hessian=out_h2, + gradient_out=out_g2, + hessian_out=out_h2, ) 
assert_allclose(out_g1, out_g2) assert_allclose(out_h1, out_h2) @@ -437,27 +437,27 @@ def test_loss_gradients_are_the_same(loss, sample_weight): y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - loss=out_l1, + loss_out=out_l1, ) g1 = loss.gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - gradient=out_g1, + gradient_out=out_g1, ) l2, g2 = loss.loss_gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - loss=out_l2, - gradient=out_g2, + loss_out=out_l2, + gradient_out=out_g2, ) g3, h3 = loss.gradient_hessian( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - gradient=out_g3, - hessian=out_h3, + gradient_out=out_g3, + hessian_out=out_h3, ) assert_allclose(l1, l2) assert_array_equal(l1, out_l1) @@ -481,8 +481,8 @@ def test_loss_gradients_are_the_same(loss, sample_weight): y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, - gradient=out_g4, - proba=out_proba, + gradient_out=out_g4, + proba_out=out_proba, ) assert_allclose(g1, out_g4) assert_allclose(g1, g4) @@ -975,8 +975,8 @@ def test_predict_proba(loss): y_true=y_true, raw_prediction=raw_prediction, sample_weight=None, - gradient=grad, - proba=proba, + gradient_out=grad, + proba_out=proba, ) assert proba.shape == (n_samples, loss.n_classes) assert np.sum(proba, axis=1) == approx(1, rel=1e-11) @@ -986,7 +986,7 @@ def test_predict_proba(loss): y_true=y_true, raw_prediction=raw_prediction, sample_weight=None, - gradient=None, + gradient_out=None, ), ) From 0c7c68b6ac4f34ea97002c86485a150be97c6522 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 23 Oct 2021 20:22:53 +0200 Subject: [PATCH 102/143] CLN address review comments --- sklearn/_loss/_loss.pyx.tp | 2 +- sklearn/_loss/link.py | 14 +++++--------- sklearn/_loss/loss.py | 15 +++------------ sklearn/_loss/tests/test_link.py | 4 ++-- 4 files changed, 11 insertions(+), 24 deletions(-) diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 1c164fdf32759..271436e970d09 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -951,7 +951,7 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): # We assume n_samples > n_classes. In this case having the inner loop # over n_classes is a good default. - # TODO: If every memoryview is contiguous and raw_preduction is + # TODO: If every memoryview is contiguous and raw_prediction is # f-contiguous, can we write a better algo (loops) to improve # performance? if sample_weight is None: diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index 7dd40876a5683..beb4738847a89 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -52,10 +52,7 @@ def includes(self, x): high = np.less(x, self.high) # Note: np.all returns numpy.bool_ - if np.all(high): - return True - else: - return False + return bool(np.all(high)) def _inclusive_low_high(interval, dtype=np.float64): @@ -101,7 +98,7 @@ class BaseLink(ABC): parameters. """ - multiclass = False + is_multiclass = False # used for testing only # Usually, raw_prediction may be any real number and y_pred is an open # interval. @@ -202,7 +199,7 @@ class MultinomialLogit(BaseLink): We have to choose additional contraints in order to make - y_pred_k = exp(raw_pred_k) / sum(exp(raw_pred_k), k=0..n_classes-1) + y_pred[k] = exp(raw_pred[k]) / sum(exp(raw_pred[k]), k=0..n_classes-1) for n_classes classes identifiable and invertible. 
We choose the symmetric side contraint where the geometric mean response @@ -236,7 +233,7 @@ class MultinomialLogit(BaseLink): http://epub.ub.uni-muenchen.de/11001/1/tr067.pdf """ - multiclass = True + is_multiclass = True interval_y_pred = Interval(0, 1, False, False) def symmetrize_raw_prediction(self, raw_prediction): @@ -245,8 +242,7 @@ def symmetrize_raw_prediction(self, raw_prediction): def link(self, y_pred, out=None): # geometric mean as reference category gm = gmean(y_pred, axis=1) - out = np.log(y_pred / gm[:, np.newaxis], out=out) - return out + return np.log(y_pred / gm[:, np.newaxis], out=out) def inverse(self, raw_prediction, out=None): if out is None: diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index bf9609a1a2a27..5af54edd3ab53 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -476,10 +476,7 @@ class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss): def __init__(self, sample_weight=None): super().__init__() - if sample_weight is None: - self.constant_hessian = True - else: - self.constant_hessian = False + self.constant_hessian = sample_weight is None class AbsoluteError(IdentityLink, CyAbsoluteError, BaseLoss): @@ -502,10 +499,7 @@ class AbsoluteError(IdentityLink, CyAbsoluteError, BaseLoss): def __init__(self, sample_weight=None): super().__init__() self.approx_hessian = True - if sample_weight is None: - self.constant_hessian = True - else: - self.constant_hessian = False + self.constant_hessian = sample_weight is None def fit_intercept_only(self, y_true, sample_weight=None): """Compute raw_prediction of an intercept-only model. @@ -552,10 +546,7 @@ def __init__(self, sample_weight=None, quantile=0.5): BaseLoss.__init__(self) CyPinballLoss.__init__(self, quantile=float(quantile)) self.approx_hessian = True - if sample_weight is None: - self.constant_hessian = True - else: - self.constant_hessian = False + self.constant_hessian = sample_weight is None if quantile <= 0 or quantile >= 1: raise ValueError( "PinballLoss aka quantile loss only accepts " diff --git a/sklearn/_loss/tests/test_link.py b/sklearn/_loss/tests/test_link.py index 4c0fc44060cbb..b363a45109989 100644 --- a/sklearn/_loss/tests/test_link.py +++ b/sklearn/_loss/tests/test_link.py @@ -63,7 +63,7 @@ def test_link_inverse_identity(link): rng = np.random.RandomState(42) link = link() n_samples, n_classes = 100, None - if link.multiclass: + if link.is_multiclass: n_classes = 10 raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples, n_classes)) if isinstance(link, MultinomialLogit): @@ -84,7 +84,7 @@ def test_link_out_argument(link): rng = np.random.RandomState(42) link = link() n_samples, n_classes = 100, None - if link.multiclass: + if link.is_multiclass: n_classes = 10 raw_prediction = rng.normal(loc=0, scale=10, size=(n_samples, n_classes)) if isinstance(link, MultinomialLogit): From 5f3b0a5f8a0a73baef82c4f969b064d501731f1d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 24 Oct 2021 12:52:44 +0200 Subject: [PATCH 103/143] TST increase maxiter --- sklearn/_loss/tests/test_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 95c814162554c..46fde973a0d96 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -846,7 +846,7 @@ def fun(x): fun, np.empty((loss.n_classes)), tol=1e-13, - options={"maxiter": 100}, + options={"maxiter": 200}, method="SLSQP", constraints={ "type": "eq", From 12f529e73f687c34ee75dec1b2ed9c8ff754503a 
Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 24 Oct 2021 14:41:54 +0200 Subject: [PATCH 104/143] Revert "TST increase maxiter" This reverts commit 5f3b0a5f8a0a73baef82c4f969b064d501731f1d. --- sklearn/_loss/tests/test_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 46fde973a0d96..95c814162554c 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -846,7 +846,7 @@ def fun(x): fun, np.empty((loss.n_classes)), tol=1e-13, - options={"maxiter": 200}, + options={"maxiter": 100}, method="SLSQP", constraints={ "type": "eq", From 643ad0512518eb4f0e9b8d7f69aa2c6e136d5be3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 9 Nov 2021 23:23:46 +0100 Subject: [PATCH 105/143] MNT composition instead of inheritance --- sklearn/_loss/_loss.pyx.tp | 34 ++++----- sklearn/_loss/loss.py | 122 ++++++++++++++++++------------- sklearn/_loss/tests/test_loss.py | 30 ++++---- 3 files changed, 103 insertions(+), 83 deletions(-) diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 271436e970d09..814913baac850 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -185,9 +185,9 @@ WARNING: Do not edit `sklearn/_loss/_loss.pyx` file directly, as it is generated # These are used e.g. in SGD `_plain_sgd`. # - Member functions operating on ndarrays, see c), looping over calls to C # functions from b). -# e) Provide convenience Python classes that inherit from these extension types +# e) Provide convenience Python classes that compose from these extension types # elsewhere (see loss.py) -# - Example: loss.gradient calls CyLoss._gradient but does some input +# - Example: loss.gradient calls CyLoss.gradient but does some input # checking like None -> np.empty(). # # Note: We require 1-dim ndarrays to be contiguous. @@ -640,7 +640,7 @@ cdef class CyLossFunction: # const Y_DTYPE_C double[::1] y_true # See release notes 3.0.0 alpha1 # https://cython.readthedocs.io/en/latest/src/changes.html#alpha-1-2020-04-12 - def _loss( + def loss( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN @@ -670,7 +670,7 @@ cdef class CyLossFunction: """ pass - def _gradient( + def gradient( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN @@ -700,7 +700,7 @@ cdef class CyLossFunction: """ pass - def _loss_gradient( + def loss_gradient( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN @@ -734,11 +734,11 @@ cdef class CyLossFunction: gradient : array of shape (n_samples,) Element-wise gradients. 
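The patch below replaces multiple inheritance (link class + Cython loss + BaseLoss) by composition: each Python loss now holds the Cython loss as self.closs and the link function as self.link, and its methods delegate to them. A simplified pure-Python sketch of this pattern; the stub classes are hypothetical stand-ins, not the real Cython or link classes:

    import numpy as np

    class CyHalfSquaredErrorStub:
        """Stand-in for the Cython half squared error loss."""
        def loss(self, y_true, raw_prediction):
            return 0.5 * (raw_prediction - y_true) ** 2

    class IdentityLinkStub:
        """Stand-in for IdentityLink: g(x) = x."""
        def link(self, y_pred):
            return y_pred
        def inverse(self, raw_prediction):
            return raw_prediction

    class HalfSquaredErrorSketch:
        """Composition: delegate to self.closs and self.link instead of inheriting."""
        def __init__(self):
            self.closs = CyHalfSquaredErrorStub()
            self.link = IdentityLinkStub()

        def loss(self, y_true, raw_prediction):
            return self.closs.loss(y_true, raw_prediction)

    loss = HalfSquaredErrorSketch()
    print(loss.loss(np.array([0.0, 1.0]), np.array([1.0, 1.0])))  # [0.5 0. ]
    print(loss.link.inverse(np.array([1.0, 2.0])))                # delegation: [1. 2.]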
""" - self._loss(y_true, raw_prediction, sample_weight, loss_out, n_threads) - self._gradient(y_true, raw_prediction, sample_weight, gradient_out, n_threads) + self.loss(y_true, raw_prediction, sample_weight, loss_out, n_threads) + self.gradient(y_true, raw_prediction, sample_weight, gradient_out, n_threads) return np.asarray(loss_out), np.asarray(gradient_out) - def _gradient_hessian( + def gradient_hessian( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN @@ -800,7 +800,7 @@ cdef class {{name}}(CyLossFunction): cdef inline double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil: return {{cgrad_hess}}(y_true, raw_prediction{{with_param}}) - def _loss( + def loss( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN @@ -826,7 +826,7 @@ cdef class {{name}}(CyLossFunction): return np.asarray(loss_out) {{if closs_grad is not None}} - def _loss_gradient( + def loss_gradient( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN @@ -858,7 +858,7 @@ cdef class {{name}}(CyLossFunction): return np.asarray(loss_out), np.asarray(gradient_out) {{endif}} - def _gradient( + def gradient( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN @@ -883,7 +883,7 @@ cdef class {{name}}(CyLossFunction): return np.asarray(gradient_out) - def _gradient_hessian( + def gradient_hessian( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[::1] raw_prediction, # IN @@ -934,7 +934,7 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): # Note that we do not assume memory alignement/contiguity of 2d arrays. # There seems to be little benefit in doing so. Benchmarks proofing the # opposite are welcome. - def _loss( + def loss( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[:, :] raw_prediction, # IN @@ -994,7 +994,7 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): return np.asarray(loss_out) - def _loss_gradient( + def loss_gradient( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[:, :] raw_prediction, # IN @@ -1056,7 +1056,7 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): return np.asarray(loss_out), np.asarray(gradient_out) - def _gradient( + def gradient( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[:, :] raw_prediction, # IN @@ -1105,7 +1105,7 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): return np.asarray(gradient_out) - def _gradient_hessian( + def gradient_hessian( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[:, :] raw_prediction, # IN @@ -1163,7 +1163,7 @@ cdef class CyCategoricalCrossEntropy(CyLossFunction): # This method simplifies the implementation of hessp in linear models, # i.e. the matrix-vector product of the full hessian, not only of the # diagonal (in the classes) approximation as implemented above. - def _gradient_proba( + def gradient_proba( self, Y_DTYPE_C[::1] y_true, # IN Y_DTYPE_C[:, :] raw_prediction, # IN diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 5af54edd3ab53..e472c07c7e45d 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -7,7 +7,7 @@ """ # Goals: # - Provide a common private module for loss functions/classes. 
-# - Replace losses for: +# - To be used in: # - LogisticRegression # - PoissonRegressor, GammaRegressor, TweedieRegressor # - HistGradientBoostingRegressor, HistGradientBoostingClassifier @@ -18,7 +18,6 @@ import numpy as np from scipy.special import xlogy from ._loss import ( - CyLossFunction, CyHalfSquaredError, CyAbsoluteError, CyPinballLoss, @@ -30,7 +29,6 @@ ) from .link import ( Interval, - BaseLink, IdentityLink, LogLink, LogitLink, @@ -43,7 +41,22 @@ # Note: The shape of raw_prediction for multiclass classifications are # - GradientBoostingClassifier: (n_samples, n_classes) # - HistGradientBoostingClassifier: (n_classes, n_samples) -class BaseLoss(BaseLink, CyLossFunction): +# +# Note: Instead of inheritance like +# +# class BaseLoss(BaseLink, CyLossFunction): +# ... +# +# # Note: Naturally, we would inherit in the following order +# # class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss) +# # But because of https://github.com/cython/cython/issues/4350 we set BaseLoss as +# # the last one. This, of course, changes the MRO. +# class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss): +# +# we use composition. This way we improve maintainability by avoiding the above +# mentioned Cython edge case and have easier to understand code (which method calls +# which code). +class BaseLoss: """Base class for a loss function of 1-dimensional targets. Conventions: @@ -72,14 +85,16 @@ class BaseLoss(BaseLink, CyLossFunction): Attributes ---------- - interval_y_true: Interval + closs: CyLossFunction + link : BaseLink + interval_y_true : Interval Valid interval for y_true - interval_y_pred: Interval + interval_y_pred : Interval Valid Interval for y_pred - differentiable: bool + differentiable : bool Indicates whether or not loss function is differentiable in raw_prediction everywhere. - need_update_leaves_values: bool + need_update_leaves_values : bool Indicates whether decision trees in gradient boosting need to uptade leave values after having been fit to the (negative) gradients. approx_hessian : bool @@ -91,13 +106,6 @@ class BaseLoss(BaseLink, CyLossFunction): Indicates whether n_classes > 2 is allowed. """ - # Inherited methods from BaseLink: - # - link - # - inverse - # - # Inherited methods from CyLossFunction: - # - _loss, _loss_gradient, _gradient, _gradient_hessian - # For decision trees: # This variable indicates whether the loss requires the leaves values to # be updated once the tree has been trained. The trees are trained to @@ -116,7 +124,7 @@ def __init__(self, n_classes=1): self.constant_hessian = False self.n_classes = n_classes self.interval_y_true = Interval(-np.inf, np.inf, False, False) - self.interval_y_pred = Interval(-np.inf, np.inf, False, False) + self.interval_y_pred = self.link.interval_y_pred def in_y_true_range(self, y): """Return True if y is in the valid range of y_true. 
@@ -176,7 +184,7 @@ def loss( raw_prediction = ReadonlyArrayWrapper(raw_prediction) if sample_weight is not None: sample_weight = ReadonlyArrayWrapper(sample_weight) - return self._loss( + return self.closs.loss( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, @@ -241,7 +249,7 @@ def loss_gradient( raw_prediction = ReadonlyArrayWrapper(raw_prediction) if sample_weight is not None: sample_weight = ReadonlyArrayWrapper(sample_weight) - return self._loss_gradient( + return self.closs.loss_gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, @@ -294,7 +302,7 @@ def gradient( raw_prediction = ReadonlyArrayWrapper(raw_prediction) if sample_weight is not None: sample_weight = ReadonlyArrayWrapper(sample_weight) - return self._gradient( + return self.closs.gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, @@ -362,7 +370,7 @@ def gradient_hessian( raw_prediction = ReadonlyArrayWrapper(raw_prediction) if sample_weight is not None: sample_weight = ReadonlyArrayWrapper(sample_weight) - return self._gradient_hessian( + return self.closs.gradient_hessian( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, @@ -440,9 +448,9 @@ def fit_intercept_only(self, y_true, sample_weight=None): a_max = self.interval_y_pred.high - eps if a_min is None and a_max is None: - return self.link(y_pred) + return self.link.link(y_pred) else: - return self.link(np.clip(y_pred, a_min, a_max)) + return self.link.link(np.clip(y_pred, a_min, a_max)) def constant_to_optimal_zero(self, y_true, sample_weight=None): """Calculate term dropped in loss. @@ -456,7 +464,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): # class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss) # But because of https://github.com/cython/cython/issues/4350 we # set BaseLoss as the last one. This, of course, changes the MRO. -class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss): +class HalfSquaredError(BaseLoss): """Half squared error with identity link, for regression. Domain: @@ -475,11 +483,13 @@ class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss): """ def __init__(self, sample_weight=None): + self.closs = CyHalfSquaredError() + self.link = IdentityLink() super().__init__() self.constant_hessian = sample_weight is None -class AbsoluteError(IdentityLink, CyAbsoluteError, BaseLoss): +class AbsoluteError(BaseLoss): """Absolute error with identity link, for regression. Domain: @@ -497,6 +507,8 @@ class AbsoluteError(IdentityLink, CyAbsoluteError, BaseLoss): need_update_leaves_values = True def __init__(self, sample_weight=None): + self.closs = CyAbsoluteError() + self.link = IdentityLink() super().__init__() self.approx_hessian = True self.constant_hessian = sample_weight is None @@ -513,7 +525,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): return _weighted_percentile(y_true, sample_weight, 50) -class PinballLoss(IdentityLink, CyPinballLoss, BaseLoss): +class PinballLoss(BaseLoss): """Quantile loss aka pinball loss, for regression. 
Domain: @@ -543,15 +555,16 @@ class PinballLoss(IdentityLink, CyPinballLoss, BaseLoss): need_update_leaves_values = True def __init__(self, sample_weight=None, quantile=0.5): - BaseLoss.__init__(self) - CyPinballLoss.__init__(self, quantile=float(quantile)) - self.approx_hessian = True - self.constant_hessian = sample_weight is None if quantile <= 0 or quantile >= 1: raise ValueError( "PinballLoss aka quantile loss only accepts " f"0 < quantile < 1; {quantile} was given." ) + self.closs = CyPinballLoss(quantile=float(quantile)) + self.link = IdentityLink() + BaseLoss.__init__(self) + self.approx_hessian = True + self.constant_hessian = sample_weight is None def fit_intercept_only(self, y_true, sample_weight=None): """Compute raw_prediction of an intercept-only model. @@ -560,12 +573,14 @@ def fit_intercept_only(self, y_true, sample_weight=None): axis=0. """ if sample_weight is None: - return np.percentile(y_true, 100 * self.quantile, axis=0) + return np.percentile(y_true, 100 * self.closs.quantile, axis=0) else: - return _weighted_percentile(y_true, sample_weight, 100 * self.quantile) + return _weighted_percentile( + y_true, sample_weight, 100 * self.closs.quantile + ) -class HalfPoissonLoss(LogLink, CyHalfPoissonLoss, BaseLoss): +class HalfPoissonLoss(BaseLoss): """Poisson deviance loss with log-link, for regression. Domain: @@ -587,9 +602,10 @@ class HalfPoissonLoss(LogLink, CyHalfPoissonLoss, BaseLoss): """ def __init__(self, sample_weight=None): + self.closs = CyHalfPoissonLoss() + self.link = LogLink() super().__init__() self.interval_y_true = Interval(0, np.inf, True, False) - self.interval_y_pred = Interval(0, np.inf, False, False) def constant_to_optimal_zero(self, y_true, sample_weight=None): term = xlogy(y_true, y_true) - y_true @@ -598,7 +614,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class HalfGammaLoss(LogLink, CyHalfGammaLoss, BaseLoss): +class HalfGammaLoss(BaseLoss): """Gamma deviance loss with log-link, for regression. Domain: @@ -619,9 +635,10 @@ class HalfGammaLoss(LogLink, CyHalfGammaLoss, BaseLoss): """ def __init__(self, sample_weight=None): + self.closs = CyHalfGammaLoss() + self.link = LogLink() super().__init__() self.interval_y_true = Interval(0, np.inf, False, False) - self.interval_y_pred = Interval(0, np.inf, False, False) def constant_to_optimal_zero(self, y_true, sample_weight=None): term = -np.log(y_true) - 1 @@ -630,7 +647,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class HalfTweedieLoss(LogLink, CyHalfTweedieLoss, BaseLoss): +class HalfTweedieLoss(BaseLoss): """Tweedie deviance loss with log-link, for regression. 
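[Editor's note: sketch, not part of the patch, spelling out the `constant_to_optimal_zero` convention used by HalfPoissonLoss above: the per-sample loss drops terms that depend only on y_true, and adding them back gives the half Poisson deviance, which is zero at a perfect prediction. Assumes the private sklearn._loss.loss module.]

    import numpy as np
    from sklearn._loss.loss import HalfPoissonLoss  # private API

    loss = HalfPoissonLoss()
    y_true = np.array([0.5, 1.0, 4.0])
    raw_prediction = loss.link.link(y_true)  # log link, i.e. a perfect prediction
    per_sample = loss.loss(y_true=y_true, raw_prediction=raw_prediction)
    # Adding back the dropped constant recovers the half deviance, ~0 here.
    print(per_sample + loss.constant_to_optimal_zero(y_true=y_true))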
Domain: @@ -662,38 +679,38 @@ class HalfTweedieLoss(LogLink, CyHalfTweedieLoss, BaseLoss): """ def __init__(self, sample_weight=None, power=1.5): + self.closs = CyHalfTweedieLoss(power=power) + self.link = LogLink() BaseLoss.__init__(self) - CyHalfTweedieLoss.__init__(self, power=power) - self.interval_y_pred = Interval(0, np.inf, False, False) - if self.power <= 0: + if self.closs.power <= 0: self.interval_y_true = Interval(-np.inf, np.inf, False, False) - elif self.power < 2: + elif self.closs.power < 2: self.interval_y_true = Interval(0, np.inf, True, False) else: self.interval_y_true = Interval(0, np.inf, False, False) def constant_to_optimal_zero(self, y_true, sample_weight=None): - if self.power == 0: + if self.closs.power == 0: return HalfSquaredError().constant_to_optimal_zero( y_true=y_true, sample_weight=sample_weight ) - elif self.power == 1: + elif self.closs.power == 1: return HalfPoissonLoss().constant_to_optimal_zero( y_true=y_true, sample_weight=sample_weight ) - elif self.power == 2: + elif self.closs.power == 2: return HalfGammaLoss().constant_to_optimal_zero( y_true=y_true, sample_weight=sample_weight ) else: - p = self.power + p = self.closs.power term = np.power(np.maximum(y_true, 0), 2 - p) / (1 - p) / (2 - p) if sample_weight is not None: term *= sample_weight return term -class BinaryCrossEntropy(LogitLink, CyBinaryCrossEntropy, BaseLoss): +class BinaryCrossEntropy(BaseLoss): """Binary cross entropy loss with logit link, for binary classification. Domain: @@ -720,9 +737,10 @@ class BinaryCrossEntropy(LogitLink, CyBinaryCrossEntropy, BaseLoss): """ def __init__(self, sample_weight=None): + self.closs = CyBinaryCrossEntropy() + self.link = LogitLink() super().__init__(n_classes=2) self.interval_y_true = Interval(0, 1, True, True) - self.interval_y_pred = Interval(0, 1, False, False) def constant_to_optimal_zero(self, y_true, sample_weight=None): # This is non-zero only if y_true is neither 0 nor 1. @@ -748,12 +766,12 @@ def predict_proba(self, raw_prediction): if raw_prediction.ndim == 2 and raw_prediction.shape[1] == 1: raw_prediction = raw_prediction.squeeze(1) proba = np.empty((raw_prediction.shape[0], 2), dtype=raw_prediction.dtype) - proba[:, 1] = self.inverse(raw_prediction) + proba[:, 1] = self.link.inverse(raw_prediction) proba[:, 0] = 1 - proba[:, 1] return proba -class CategoricalCrossEntropy(MultinomialLogit, CyCategoricalCrossEntropy, BaseLoss): +class CategoricalCrossEntropy(BaseLoss): """Categorical cross-entropy loss, for multiclass classification. Domain: @@ -791,6 +809,8 @@ class CategoricalCrossEntropy(MultinomialLogit, CyCategoricalCrossEntropy, BaseL is_multiclass = True def __init__(self, sample_weight=None, n_classes=3): + self.closs = CyCategoricalCrossEntropy() + self.link = MultinomialLogit() super().__init__(n_classes=n_classes) self.interval_y_true = Interval(0, np.inf, True, False) self.interval_y_pred = Interval(0, 1, False, False) @@ -815,7 +835,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): for k in range(self.n_classes): out[k] = np.average(y_true == k, weights=sample_weight, axis=0) out[k] = np.clip(out[k], eps, 1 - eps) - return self.link(out[None, :]).reshape(-1) + return self.link.link(out[None, :]).reshape(-1) def predict_proba(self, raw_prediction): """Predict probabilities. @@ -830,7 +850,7 @@ def predict_proba(self, raw_prediction): proba : array of shape (n_samples, n_classes) Element-wise class probabilites. 
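[Editor's note: sketch, not part of the patch, illustrating the HalfTweedieLoss special cases dispatched above: for power=2 the Tweedie deviance reduces to the Gamma deviance, so after adding back the dropped constants both losses should agree. Assumes the private sklearn._loss.loss module.]

    import numpy as np
    from sklearn._loss.loss import HalfGammaLoss, HalfTweedieLoss  # private API

    y_true = np.array([0.5, 1.0, 4.0])
    raw_prediction = np.log(np.array([1.0, 2.0, 3.0]))

    tweedie = HalfTweedieLoss(power=2.0)
    gamma = HalfGammaLoss()
    lhs = tweedie.loss(y_true=y_true, raw_prediction=raw_prediction)
    lhs += tweedie.constant_to_optimal_zero(y_true=y_true)
    rhs = gamma.loss(y_true=y_true, raw_prediction=raw_prediction)
    rhs += gamma.constant_to_optimal_zero(y_true=y_true)
    print(np.allclose(lhs, rhs))  # True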
""" - return self.inverse(raw_prediction) + return self.link.inverse(raw_prediction) def gradient_proba( self, @@ -881,7 +901,7 @@ def gradient_proba( raw_prediction = ReadonlyArrayWrapper(raw_prediction) if sample_weight is not None: sample_weight = ReadonlyArrayWrapper(sample_weight) - return self._gradient_proba( + return self.closs.gradient_proba( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 95c814162554c..2cce8a8974656 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -48,9 +48,9 @@ def loss_instance_name(param): loss = param name = loss.__class__.__name__ if hasattr(loss, "quantile"): - name += f"(quantile={loss.quantile})" + name += f"(quantile={loss.closs.quantile})" elif hasattr(loss, "power"): - name += f"(power={loss.power})" + name += f"(power={loss.closs.power})" return name else: return str(param) @@ -132,7 +132,7 @@ def test_loss_boundary(loss): assert loss.in_y_pred_range(y_pred) # calculating losses should not fail - raw_prediction = loss.link(y_pred) + raw_prediction = loss.link.link(y_pred) loss.loss(y_true=y_true, raw_prediction=raw_prediction) @@ -354,7 +354,7 @@ def test_loss_same_as_C_functions(loss, sample_weight): sample_weight=sample_weight, loss_out=out_l1, ), - loss._loss( + loss.closs.loss( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, @@ -368,21 +368,21 @@ def test_loss_same_as_C_functions(loss, sample_weight): sample_weight=sample_weight, gradient_out=out_g1, ), - loss._gradient( + loss.closs.gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, gradient_out=out_g2, ), ) - loss.loss_gradient( + loss.closs.loss_gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, loss_out=out_l1, gradient_out=out_g1, ) - loss._loss_gradient( + loss.closs.loss_gradient( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, @@ -398,7 +398,7 @@ def test_loss_same_as_C_functions(loss, sample_weight): gradient_out=out_g1, hessian_out=out_h1, ) - loss._gradient_hessian( + loss.closs.gradient_hessian( y_true=y_true, raw_prediction=raw_prediction, sample_weight=sample_weight, @@ -603,7 +603,7 @@ def test_loss_of_perfect_prediction(loss, sample_weight): if not loss.is_multiclass: # Use small values such that exp(value) is not nan. raw_prediction = np.array([-10, -0.1, 0, 0.1, 3, 10]) - y_true = loss.inverse(raw_prediction) + y_true = loss.link.inverse(raw_prediction) else: # CategoricalCrossEntropy y_true = np.arange(loss.n_classes).astype(float) @@ -786,7 +786,7 @@ def fprime2(x: np.ndarray) -> np.ndarray: # dimensions. y_true = y_true.ravel() optimum = optimum.ravel() - assert_allclose(loss.inverse(optimum), y_true) + assert_allclose(loss.link.inverse(optimum), y_true) assert_allclose(func(optimum), 0, atol=1e-14) assert_allclose(loss.gradient(y_true=y_true, raw_prediction=optimum), 0, atol=5e-7) @@ -800,7 +800,7 @@ def test_loss_intercept_only(loss, sample_weight): """ n_samples = 50 if not loss.is_multiclass: - y_true = loss.inverse(np.linspace(-4, 4, num=n_samples)) + y_true = loss.link.inverse(np.linspace(-4, 4, num=n_samples)) else: y_true = np.arange(n_samples).astype(float) % loss.n_classes y_true[::5] = 0 # exceedance of class 0 @@ -891,10 +891,10 @@ def test_specific_fit_intercept_only(loss, func, random_dist): # Make sure baseline prediction is the expected functional=func, e.g. mean # or median. 
assert_all_finite(baseline_prediction) - assert baseline_prediction == approx(loss.link(func(y_train))) - assert loss.inverse(baseline_prediction) == approx(func(y_train)) + assert baseline_prediction == approx(loss.link.link(func(y_train))) + assert loss.link.inverse(baseline_prediction) == approx(func(y_train)) if isinstance(loss, IdentityLink): - assert_allclose(loss.inverse(baseline_prediction), baseline_prediction) + assert_allclose(loss.link.inverse(baseline_prediction), baseline_prediction) # Test baseline at boundary if loss.interval_y_true.low_inclusive: @@ -921,7 +921,7 @@ def test_categorical_crossentropy_fit_intercept_only(): for k in range(n_classes): p[k] = (y_train == k).mean() assert_allclose(baseline_prediction, np.log(p) - np.mean(np.log(p))) - assert_allclose(baseline_prediction[None, :], loss.link(p[None, :])) + assert_allclose(baseline_prediction[None, :], loss.link.link(p[None, :])) for y_train in (np.zeros(shape=10), np.ones(shape=10)): y_train = y_train.astype(np.float64) From cb3cacca4954e96fcfd8200180ae6dcde9c4f238 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 9 Nov 2021 23:25:30 +0100 Subject: [PATCH 106/143] MNT interval_raw_prediction never used --- sklearn/_loss/link.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/link.py b/sklearn/_loss/link.py index beb4738847a89..18ad5901d1f3c 100644 --- a/sklearn/_loss/link.py +++ b/sklearn/_loss/link.py @@ -102,7 +102,7 @@ class BaseLink(ABC): # Usually, raw_prediction may be any real number and y_pred is an open # interval. - interval_raw_prediction = Interval(-np.inf, np.inf, False, False) + # interval_raw_prediction = Interval(-np.inf, np.inf, False, False) interval_y_pred = Interval(-np.inf, np.inf, False, False) @abstractmethod From 2147c61a7f2f7daa9117408150da8c88bbb934da Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 10 Nov 2021 20:30:29 +0100 Subject: [PATCH 107/143] CLN closs and link as args in __init__ --- sklearn/_loss/loss.py | 50 +++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index e472c07c7e45d..ba91887280170 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -119,7 +119,9 @@ class BaseLoss: differentiable = True is_multiclass = False - def __init__(self, n_classes=1): + def __init__(self, closs, link, n_classes=1): + self.closs = closs + self.link = link self.approx_hessian = False self.constant_hessian = False self.n_classes = n_classes @@ -483,9 +485,7 @@ class HalfSquaredError(BaseLoss): """ def __init__(self, sample_weight=None): - self.closs = CyHalfSquaredError() - self.link = IdentityLink() - super().__init__() + super().__init__(closs=CyHalfSquaredError(), link=IdentityLink()) self.constant_hessian = sample_weight is None @@ -507,9 +507,7 @@ class AbsoluteError(BaseLoss): need_update_leaves_values = True def __init__(self, sample_weight=None): - self.closs = CyAbsoluteError() - self.link = IdentityLink() - super().__init__() + super().__init__(closs=CyAbsoluteError(), link=IdentityLink()) self.approx_hessian = True self.constant_hessian = sample_weight is None @@ -560,9 +558,10 @@ def __init__(self, sample_weight=None, quantile=0.5): "PinballLoss aka quantile loss only accepts " f"0 < quantile < 1; {quantile} was given." 
) - self.closs = CyPinballLoss(quantile=float(quantile)) - self.link = IdentityLink() - BaseLoss.__init__(self) + super().__init__( + closs=CyPinballLoss(quantile=float(quantile)), + link=IdentityLink(), + ) self.approx_hessian = True self.constant_hessian = sample_weight is None @@ -602,9 +601,7 @@ class HalfPoissonLoss(BaseLoss): """ def __init__(self, sample_weight=None): - self.closs = CyHalfPoissonLoss() - self.link = LogLink() - super().__init__() + super().__init__(closs=CyHalfPoissonLoss(), link=LogLink()) self.interval_y_true = Interval(0, np.inf, True, False) def constant_to_optimal_zero(self, y_true, sample_weight=None): @@ -635,9 +632,7 @@ class HalfGammaLoss(BaseLoss): """ def __init__(self, sample_weight=None): - self.closs = CyHalfGammaLoss() - self.link = LogLink() - super().__init__() + super().__init__(closs=CyHalfGammaLoss(), link=LogLink()) self.interval_y_true = Interval(0, np.inf, False, False) def constant_to_optimal_zero(self, y_true, sample_weight=None): @@ -679,9 +674,10 @@ class HalfTweedieLoss(BaseLoss): """ def __init__(self, sample_weight=None, power=1.5): - self.closs = CyHalfTweedieLoss(power=power) - self.link = LogLink() - BaseLoss.__init__(self) + super().__init__( + closs=CyHalfTweedieLoss(power=float(power)), + link=LogLink(), + ) if self.closs.power <= 0: self.interval_y_true = Interval(-np.inf, np.inf, False, False) elif self.closs.power < 2: @@ -737,9 +733,11 @@ class BinaryCrossEntropy(BaseLoss): """ def __init__(self, sample_weight=None): - self.closs = CyBinaryCrossEntropy() - self.link = LogitLink() - super().__init__(n_classes=2) + super().__init__( + closs=CyBinaryCrossEntropy(), + link=LogitLink(), + n_classes=2, + ) self.interval_y_true = Interval(0, 1, True, True) def constant_to_optimal_zero(self, y_true, sample_weight=None): @@ -809,9 +807,11 @@ class CategoricalCrossEntropy(BaseLoss): is_multiclass = True def __init__(self, sample_weight=None, n_classes=3): - self.closs = CyCategoricalCrossEntropy() - self.link = MultinomialLogit() - super().__init__(n_classes=n_classes) + super().__init__( + closs=CyCategoricalCrossEntropy(), + link=MultinomialLogit(), + n_classes=n_classes, + ) self.interval_y_true = Interval(0, np.inf, True, False) self.interval_y_pred = Interval(0, 1, False, False) From 48fc50b0c81b6bf16b9525d3393abd79678e5ae5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 10 Nov 2021 23:09:53 +0100 Subject: [PATCH 108/143] trigger CI From 1a031ff94768292b8393956799c704feae096572 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 11 Nov 2021 19:07:34 +0100 Subject: [PATCH 109/143] trigger CI From 0068c684fce8119f87dba1a77e066643733754a5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 11 Nov 2021 21:10:38 +0100 Subject: [PATCH 110/143] DEBUG print infos --- build_tools/azure/test_script.sh | 2 +- sklearn/_loss/tests/test_loss.py | 26 ++++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index 44b06db6621c9..dc04f04050625 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -26,7 +26,7 @@ else conda list fi -TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML" +TEST_CMD="python -X faulthandler -m pytest -v --full-trace --showlocals --durations=20 --junitxml=$JUNITXML" if [[ "$COVERAGE" == "true" ]]; then # Note: --cov-report= is used to disable to long text output report in the diff --git a/sklearn/_loss/tests/test_loss.py 
b/sklearn/_loss/tests/test_loss.py index 2cce8a8974656..5d1d9c868f874 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -236,7 +236,8 @@ def test_loss_on_specific_values(loss, y_true, raw_prediction, loss_true): ) == approx(loss_true, rel=1e-11, abs=1e-12) -@pytest.mark.parametrize("loss", ALL_LOSSES) +# @pytest.mark.parametrize("loss", ALL_LOSSES) +@pytest.mark.parametrize("loss", [HalfSquaredError, HalfGammaLoss]) @pytest.mark.parametrize("readonly_memmap", [False, True]) @pytest.mark.parametrize("dtype_in", [np.float32, np.float64]) @pytest.mark.parametrize("dtype_out", [np.float32, np.float64]) @@ -279,6 +280,27 @@ def test_loss_dtype( if sample_weight is not None: sample_weight = create_memmap_backed_data(sample_weight) + print("START DEBUG") + print(f"loss={loss}") + print(f"readonly_memmap={readonly_memmap}") + print(f"dtype_in={dtype_in}") + print(f"dtype_out={dtype_out}") + print(f"sample_weight={sample_weight}") + print(f"out1={out1}") + print(f"out2={out2}") + print(f"n_threads={n_threads}") + print(f"y_true.shape={y_true.shape}") + print(f"y_true.dtype={y_true.dtype}") + print(f"raw_prediction.shape={raw_prediction.shape}") + print(f"raw_prediction.dtype={raw_prediction.dtype}") + if sample_weight is not None: + print(f"sample_weight.shape={sample_weight.shape}") + print(f"sample_weight.dtype={sample_weight.dtype}") + if out1 is not None: + print(f"out1.shape={out1.shape}") + print(f"out1.dtype={out1.dtype}") + print("END DEBUG", flush=True) + loss.loss( y_true=y_true, raw_prediction=raw_prediction, @@ -846,7 +868,7 @@ def fun(x): fun, np.empty((loss.n_classes)), tol=1e-13, - options={"maxiter": 100}, + options={"maxiter": 100, "disp": True}, method="SLSQP", constraints={ "type": "eq", From 903be0be4c85090d7b97dc3b1a4d2e8965413251 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 12 Nov 2021 19:29:30 +0100 Subject: [PATCH 111/143] DEBUG remove readonly_memmap test --- sklearn/_loss/tests/test_loss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 5d1d9c868f874..3124142bbd0df 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -238,7 +238,8 @@ def test_loss_on_specific_values(loss, y_true, raw_prediction, loss_true): # @pytest.mark.parametrize("loss", ALL_LOSSES) @pytest.mark.parametrize("loss", [HalfSquaredError, HalfGammaLoss]) -@pytest.mark.parametrize("readonly_memmap", [False, True]) +# @pytest.mark.parametrize("readonly_memmap", [False, True]) +@pytest.mark.parametrize("readonly_memmap", [False]) @pytest.mark.parametrize("dtype_in", [np.float32, np.float64]) @pytest.mark.parametrize("dtype_out", [np.float32, np.float64]) @pytest.mark.parametrize("sample_weight", [None, 1]) From 4492c5880e3f95286d53f35c252d6b9ae2f16afc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 12 Nov 2021 19:59:15 +0100 Subject: [PATCH 112/143] Revert "DEBUG remove readonly_memmap test" This reverts commit 903be0be4c85090d7b97dc3b1a4d2e8965413251. 
--- sklearn/_loss/tests/test_loss.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 3124142bbd0df..5d1d9c868f874 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -238,8 +238,7 @@ def test_loss_on_specific_values(loss, y_true, raw_prediction, loss_true): # @pytest.mark.parametrize("loss", ALL_LOSSES) @pytest.mark.parametrize("loss", [HalfSquaredError, HalfGammaLoss]) -# @pytest.mark.parametrize("readonly_memmap", [False, True]) -@pytest.mark.parametrize("readonly_memmap", [False]) +@pytest.mark.parametrize("readonly_memmap", [False, True]) @pytest.mark.parametrize("dtype_in", [np.float32, np.float64]) @pytest.mark.parametrize("dtype_out", [np.float32, np.float64]) @pytest.mark.parametrize("sample_weight", [None, 1]) From ef3b9e79de161b02d12aba64eb922c5479d595d1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 13 Nov 2021 12:24:44 +0100 Subject: [PATCH 113/143] TST skip test if data not aligned --- sklearn/_loss/tests/test_loss.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 5d1d9c868f874..0dd972f4d0bd5 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -277,8 +277,15 @@ def test_loss_dtype( if readonly_memmap: y_true, raw_prediction = create_memmap_backed_data([y_true, raw_prediction]) + is_aligned = y_true.flags["ALIGNED"] and raw_prediction.flags["ALIGNED"] if sample_weight is not None: sample_weight = create_memmap_backed_data(sample_weight) + is_aligned &= sample_weight.flags["ALIGNED"] + if not is_aligned: + pytest.skip( + "Losses need aligned data, but " + "https://github.com/joblib/joblib/issues/563 gets in the way." + ) print("START DEBUG") print(f"loss={loss}") From 942c1425470fa9b018e0d19a1c2a57b593b96dde Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 13 Nov 2021 12:34:41 +0100 Subject: [PATCH 114/143] Revert "DEBUG print infos" This reverts commit 0068c684fce8119f87dba1a77e066643733754a5. 
--- build_tools/azure/test_script.sh | 2 +- sklearn/_loss/tests/test_loss.py | 26 ++------------------------ 2 files changed, 3 insertions(+), 25 deletions(-) diff --git a/build_tools/azure/test_script.sh b/build_tools/azure/test_script.sh index dc04f04050625..44b06db6621c9 100755 --- a/build_tools/azure/test_script.sh +++ b/build_tools/azure/test_script.sh @@ -26,7 +26,7 @@ else conda list fi -TEST_CMD="python -X faulthandler -m pytest -v --full-trace --showlocals --durations=20 --junitxml=$JUNITXML" +TEST_CMD="python -m pytest --showlocals --durations=20 --junitxml=$JUNITXML" if [[ "$COVERAGE" == "true" ]]; then # Note: --cov-report= is used to disable to long text output report in the diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 0dd972f4d0bd5..032dfc8c7cfe6 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -236,8 +236,7 @@ def test_loss_on_specific_values(loss, y_true, raw_prediction, loss_true): ) == approx(loss_true, rel=1e-11, abs=1e-12) -# @pytest.mark.parametrize("loss", ALL_LOSSES) -@pytest.mark.parametrize("loss", [HalfSquaredError, HalfGammaLoss]) +@pytest.mark.parametrize("loss", ALL_LOSSES) @pytest.mark.parametrize("readonly_memmap", [False, True]) @pytest.mark.parametrize("dtype_in", [np.float32, np.float64]) @pytest.mark.parametrize("dtype_out", [np.float32, np.float64]) @@ -287,27 +286,6 @@ def test_loss_dtype( "https://github.com/joblib/joblib/issues/563 gets in the way." ) - print("START DEBUG") - print(f"loss={loss}") - print(f"readonly_memmap={readonly_memmap}") - print(f"dtype_in={dtype_in}") - print(f"dtype_out={dtype_out}") - print(f"sample_weight={sample_weight}") - print(f"out1={out1}") - print(f"out2={out2}") - print(f"n_threads={n_threads}") - print(f"y_true.shape={y_true.shape}") - print(f"y_true.dtype={y_true.dtype}") - print(f"raw_prediction.shape={raw_prediction.shape}") - print(f"raw_prediction.dtype={raw_prediction.dtype}") - if sample_weight is not None: - print(f"sample_weight.shape={sample_weight.shape}") - print(f"sample_weight.dtype={sample_weight.dtype}") - if out1 is not None: - print(f"out1.shape={out1.shape}") - print(f"out1.dtype={out1.dtype}") - print("END DEBUG", flush=True) - loss.loss( y_true=y_true, raw_prediction=raw_prediction, @@ -875,7 +853,7 @@ def fun(x): fun, np.empty((loss.n_classes)), tol=1e-13, - options={"maxiter": 100, "disp": True}, + options={"maxiter": 100}, method="SLSQP", constraints={ "type": "eq", From 0b1d6189c937c783b9e3f20c219253bedddfbe50 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 13 Nov 2021 14:04:26 +0100 Subject: [PATCH 115/143] TST zeros instead of empty initial guess --- sklearn/_loss/tests/test_loss.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 032dfc8c7cfe6..e32034906463e 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -844,14 +844,13 @@ def fun(x): a == approx(opt.x, rel=1e-7) grad.sum() == approx(0, abs=1e-12) else: - # constraint corresponds to sum(raw_prediction) = 0 - # without the constraint, we would need to apply - # loss.symmetrize_raw_prediction to opt.x before comparing + # The constraint corresponds to sum(raw_prediction) = 0. Without it, we would + # need to apply loss.symmetrize_raw_prediction to opt.x before comparing. 
# TODO: With scipy 1.1.0, one could use # LinearConstraint(np.ones((1, loss.n_classes)), 0, 0) opt = minimize( fun, - np.empty((loss.n_classes)), + np.zeros((loss.n_classes)), tol=1e-13, options={"maxiter": 100}, method="SLSQP", From c30535fb748238df95eb46c530fb93beef1190c9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 13 Nov 2021 17:52:36 +0100 Subject: [PATCH 116/143] Revert "TST skip test if data not aligned" This reverts commit ef3b9e79de161b02d12aba64eb922c5479d595d1. --- sklearn/_loss/tests/test_loss.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index e32034906463e..b7ad4ae1f7bd1 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -276,15 +276,8 @@ def test_loss_dtype( if readonly_memmap: y_true, raw_prediction = create_memmap_backed_data([y_true, raw_prediction]) - is_aligned = y_true.flags["ALIGNED"] and raw_prediction.flags["ALIGNED"] if sample_weight is not None: sample_weight = create_memmap_backed_data(sample_weight) - is_aligned &= sample_weight.flags["ALIGNED"] - if not is_aligned: - pytest.skip( - "Losses need aligned data, but " - "https://github.com/joblib/joblib/issues/563 gets in the way." - ) loss.loss( y_true=y_true, From e9c551c0604b43092ae5a94ea08624cb50c4cbcc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 13 Nov 2021 17:53:45 +0100 Subject: [PATCH 117/143] DEBUG set boundscheck=True --- sklearn/_build_utils/__init__.py | 2 +- sklearn/_loss/_loss.pyx.tp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/_build_utils/__init__.py b/sklearn/_build_utils/__init__.py index 67b5f2c662eb0..05aacd704794a 100644 --- a/sklearn/_build_utils/__init__.py +++ b/sklearn/_build_utils/__init__.py @@ -78,7 +78,7 @@ def cythonize_extensions(top_path, config): }, compiler_directives={ "language_level": 3, - "boundscheck": False, + "boundscheck": True, "wraparound": False, "initializedcheck": False, "nonecheck": False, diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 814913baac850..109d4d74c7122 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -163,7 +163,7 @@ WARNING: Do not edit `sklearn/_loss/_loss.pyx` file directly, as it is generated #------------------------------------------------------------------------------ # cython: cdivision=True -# cython: boundscheck=False +# cython: boundscheck=True # cython: wraparound=False # cython: language_level=3 From 88e543c22dab75884833e4ddfea854941299a416 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Nov 2021 20:46:02 +0100 Subject: [PATCH 118/143] Revert "DEBUG set boundscheck=True" This reverts commit e9c551c0604b43092ae5a94ea08624cb50c4cbcc. 
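[Editor's note: toy example, not part of the patch, showing the constrained minimization pattern used in the test above: minimize a multinomial-type objective over raw predictions subject to the symmetric sum-to-zero constraint, starting from zeros rather than an uninitialized array. Only standard NumPy/SciPy APIs are used; the objective is a single-sample stand-in for the real loss.]

    import numpy as np
    from scipy.optimize import minimize

    p = np.array([0.2, 0.5, 0.3])  # target class probabilities

    def fun(x):
        # logsumexp(x) - sum_k p[k] * x[k], minimized by x = log(p) + constant
        return np.log(np.exp(x).sum()) - p @ x

    opt = minimize(
        fun,
        np.zeros(p.shape[0]),  # well-defined start, unlike np.empty
        method="SLSQP",
        constraints={"type": "eq", "fun": lambda x: x.sum()},
    )
    print(opt.x)                         # ~[-0.44,  0.48, -0.04]
    print(np.log(p) - np.log(p).mean())  # the symmetric multinomial solution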
--- sklearn/_build_utils/__init__.py | 2 +- sklearn/_loss/_loss.pyx.tp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/_build_utils/__init__.py b/sklearn/_build_utils/__init__.py index 05aacd704794a..67b5f2c662eb0 100644 --- a/sklearn/_build_utils/__init__.py +++ b/sklearn/_build_utils/__init__.py @@ -78,7 +78,7 @@ def cythonize_extensions(top_path, config): }, compiler_directives={ "language_level": 3, - "boundscheck": True, + "boundscheck": False, "wraparound": False, "initializedcheck": False, "nonecheck": False, diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 109d4d74c7122..814913baac850 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -163,7 +163,7 @@ WARNING: Do not edit `sklearn/_loss/_loss.pyx` file directly, as it is generated #------------------------------------------------------------------------------ # cython: cdivision=True -# cython: boundscheck=True +# cython: boundscheck=False # cython: wraparound=False # cython: language_level=3 From a0b3b868efa6aadeea09edeb124fcc43b6a1a687 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Nov 2021 21:33:37 +0100 Subject: [PATCH 119/143] CLN setup.py --- sklearn/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/setup.py b/sklearn/setup.py index f9d549c094ec2..874bdbbcbed43 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -48,12 +48,12 @@ def configuration(parent_package="", top_path=None): config.add_subpackage("experimental/tests") config.add_subpackage("ensemble/_hist_gradient_boosting") config.add_subpackage("ensemble/_hist_gradient_boosting/tests") - config.add_subpackage("_loss/") - config.add_subpackage("_loss/tests") config.add_subpackage("externals") config.add_subpackage("externals/_packaging") # submodules which have their own setup.py + config.add_subpackage("_loss") + config.add_subpackage("_loss/tests") config.add_subpackage("cluster") config.add_subpackage("datasets") config.add_subpackage("decomposition") From 00328b29ed5b8192ea7bccb9a1550e8b45fee9f4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 15 Nov 2021 14:01:45 +0100 Subject: [PATCH 120/143] CLN rename binary and categorical cross entropy to binomial and multinomial loss --- sklearn/_loss/__init__.py | 8 ++--- sklearn/_loss/_loss.pyx.tp | 42 +++++++++++----------- sklearn/_loss/loss.py | 33 +++++++++--------- sklearn/_loss/tests/test_loss.py | 60 ++++++++++++++++---------------- 4 files changed, 73 insertions(+), 70 deletions(-) diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py index ae7bac5f1a8d8..14548c62231a2 100644 --- a/sklearn/_loss/__init__.py +++ b/sklearn/_loss/__init__.py @@ -10,8 +10,8 @@ HalfPoissonLoss, HalfGammaLoss, HalfTweedieLoss, - BinaryCrossEntropy, - CategoricalCrossEntropy, + HalfBinomialLoss, + HalfMultinomialLoss, ) @@ -22,6 +22,6 @@ "HalfPoissonLoss", "HalfGammaLoss", "HalfTweedieLoss", - "BinaryCrossEntropy", - "CategoricalCrossEntropy", + "HalfBinomialLoss", + "HalfMultinomialLoss", ] diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 814913baac850..4440b12114a42 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -10,7 +10,7 @@ Each loss class is generated by a cdef functions on single samples. The keywords between double braces are substituted in setup.py. """ -doc_SquaredError = ( +doc_HalfSquaredError = ( """Half Squared Error with identity link. 
Domain: @@ -46,7 +46,7 @@ doc_PinballLoss = ( """ ) -doc_PoissonLoss = ( +doc_HalfPoissonLoss = ( """Half Poisson deviance loss with log-link. Domain: @@ -66,7 +66,7 @@ doc_PoissonLoss = ( """ ) -doc_GammaLoss = ( +doc_HalfGammaLoss = ( """Half Gamma deviance loss with log-link. Domain: @@ -84,7 +84,7 @@ doc_GammaLoss = ( """ ) -doc_TweedieLoss = ( +doc_HalfTweedieLoss = ( """Half Tweedie deviance loss with log-link. Domain: @@ -117,8 +117,8 @@ doc_TweedieLoss = ( """ ) -doc_BinaryCrossEntropy = ( - """BinaryCrossEntropy with logit link. +doc_HalfBinomialLoss = ( + """Half Binomial deviance loss with logit link. Domain: y_true in [0, 1] @@ -133,7 +133,7 @@ doc_BinaryCrossEntropy = ( # cy_loss, cy_loss_grad, # cy_grad, cy_grad_hess, class_list = [ - ("CyHalfSquaredError", doc_SquaredError, None, + ("CyHalfSquaredError", doc_HalfSquaredError, None, "closs_half_squared_error", None, "cgradient_half_squared_error", "cgrad_hess_half_squared_error"), ("CyAbsoluteError", doc_AbsoluteError, None, @@ -142,18 +142,18 @@ class_list = [ ("CyPinballLoss", doc_PinballLoss, "quantile", "closs_pinball_loss", None, "cgradient_pinball_loss", "cgrad_hess_pinball_loss"), - ("CyHalfPoissonLoss", doc_PoissonLoss, None, + ("CyHalfPoissonLoss", doc_HalfPoissonLoss, None, "closs_half_poisson", "closs_grad_half_poisson", "cgradient_half_poisson", "cgrad_hess_half_poisson"), - ("CyHalfGammaLoss", doc_GammaLoss, None, + ("CyHalfGammaLoss", doc_HalfGammaLoss, None, "closs_half_gamma", "closs_grad_half_gamma", "cgradient_half_gamma", "cgrad_hess_half_gamma"), - ("CyHalfTweedieLoss", doc_TweedieLoss, "power", + ("CyHalfTweedieLoss", doc_HalfTweedieLoss, "power", "closs_half_tweedie", "closs_grad_half_tweedie", "cgradient_half_tweedie", "cgrad_hess_half_tweedie"), - ("CyBinaryCrossEntropy", doc_BinaryCrossEntropy, None, - "closs_binary_crossentropy", "closs_grad_binary_crossentropy", - "cgradient_binary_crossentropy", "cgrad_hess_binary_crossentropy"), + ("CyHalfBinomialLoss", doc_HalfBinomialLoss, None, + "closs_half_binomial", "closs_grad_half_binomial", + "cgradient_half_binomial", "cgrad_hess_half_binomial"), ] }} """ @@ -500,8 +500,8 @@ cdef inline double_pair cgrad_hess_half_tweedie( return gh -# Binary cross entropy aka log-loss -cdef inline double closs_binary_crossentropy( +# Half Binomial deviance with logit-link, aka log-loss or binary cross entropy +cdef inline double closs_half_binomial( double y_true, double raw_prediction ) nogil: @@ -509,7 +509,7 @@ cdef inline double closs_binary_crossentropy( return log1pexp(raw_prediction) - y_true * raw_prediction -cdef inline double cgradient_binary_crossentropy( +cdef inline double cgradient_half_binomial( double y_true, double raw_prediction ) nogil: @@ -534,7 +534,7 @@ cdef inline double cgradient_binary_crossentropy( return ((1 - y_true) - y_true * exp_tmp) / (1 + exp_tmp) -cdef inline double_pair closs_grad_binary_crossentropy( +cdef inline double_pair closs_grad_half_binomial( double y_true, double raw_prediction ) nogil: @@ -557,7 +557,7 @@ cdef inline double_pair closs_grad_binary_crossentropy( return lg -cdef inline double_pair cgrad_hess_binary_crossentropy( +cdef inline double_pair cgrad_hess_half_binomial( double y_true, double raw_prediction ) nogil: @@ -917,8 +917,10 @@ cdef class {{name}}(CyLossFunction): {{endfor}} -cdef class CyCategoricalCrossEntropy(CyLossFunction): - """CategoricalCrossEntropy with multinomial logit link. 
+# The multinomial deviance loss is also known as categorical cross-entropy or +# multinomial log-likelihood +cdef class CyHalfMultinomialLoss(CyLossFunction): + """Half Multinomial deviance loss with multinomial logit link. Domain: y_true in {0, 1, 2, 3, .., n_classes - 1} diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index ba91887280170..a394bd9de06c3 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -24,8 +24,8 @@ CyHalfPoissonLoss, CyHalfGammaLoss, CyHalfTweedieLoss, - CyBinaryCrossEntropy, - CyCategoricalCrossEntropy, + CyHalfBinomialLoss, + CyHalfMultinomialLoss, ) from .link import ( Interval, @@ -580,7 +580,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): class HalfPoissonLoss(BaseLoss): - """Poisson deviance loss with log-link, for regression. + """Half Poisson deviance loss with log-link, for regression. Domain: y_true in non-negative real numbers @@ -612,7 +612,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): class HalfGammaLoss(BaseLoss): - """Gamma deviance loss with log-link, for regression. + """Half Gamma deviance loss with log-link, for regression. Domain: y_true and y_pred in positive real numbers @@ -643,7 +643,7 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): class HalfTweedieLoss(BaseLoss): - """Tweedie deviance loss with log-link, for regression. + """Half Tweedie deviance loss with log-link, for regression. Domain: y_true in real numbers for power <= 0 @@ -706,18 +706,20 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): return term -class BinaryCrossEntropy(BaseLoss): - """Binary cross entropy loss with logit link, for binary classification. +class HalfBinomialLoss(BaseLoss): + """Half Binomial deviance loss with logit link, for binary classification. + + This is also know as binary cross entropy, log-loss and logistic loss. Domain: - y_true in [0, 1] + y_true in [0, 1], i.e. regression on the unit interval y_pred in (0, 1), i.e. boundaries excluded Link: y_pred = expit(raw_prediction) - For a given sample x_i, the binary cross-entropy, is defined as the - negative log-likelihood of the Bernoulli distribution and can be expressed + For a given sample x_i, half Binomial deviance is defined as the negative + log-likelihood of the Binomial/Bernoulli distribution and can be expressed as:: loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i @@ -725,7 +727,6 @@ class BinaryCrossEntropy(BaseLoss): See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman, section 4.4.1 (about logistic regression). - This loss is also known as log loss or logistic loss. Note that the formulation works for classification, y = {0, 1}, as well as logistic regression, y = [0, 1]. If you add `constant_to_optimal_zero` to the loss, you get half the @@ -734,7 +735,7 @@ class BinaryCrossEntropy(BaseLoss): def __init__(self, sample_weight=None): super().__init__( - closs=CyBinaryCrossEntropy(), + closs=CyHalfBinomialLoss(), link=LogitLink(), n_classes=2, ) @@ -769,7 +770,7 @@ def predict_proba(self, raw_prediction): return proba -class CategoricalCrossEntropy(BaseLoss): +class HalfMultinomialLoss(BaseLoss): """Categorical cross-entropy loss, for multiclass classification. 
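[Editor's note: sketch, not part of the patch, verifying the formula quoted in the HalfBinomialLoss docstring above, loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i, together with the expit-based predict_proba. Assumes the private sklearn._loss.loss module.]

    import numpy as np
    from scipy.special import expit
    from sklearn._loss.loss import HalfBinomialLoss  # private API

    loss = HalfBinomialLoss()
    y_true = np.array([0.0, 1.0, 1.0])
    raw_prediction = np.array([-2.0, 0.0, 3.0])

    print(loss.loss(y_true=y_true, raw_prediction=raw_prediction))
    print(np.log1p(np.exp(raw_prediction)) - y_true * raw_prediction)  # same values

    proba = loss.predict_proba(raw_prediction)
    print(proba[:, 1])            # equals expit(raw_prediction)
    print(expit(raw_prediction))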
Domain: @@ -808,7 +809,7 @@ class CategoricalCrossEntropy(BaseLoss): def __init__(self, sample_weight=None, n_classes=3): super().__init__( - closs=CyCategoricalCrossEntropy(), + closs=CyHalfMultinomialLoss(), link=MultinomialLogit(), n_classes=n_classes, ) @@ -918,6 +919,6 @@ def gradient_proba( "poisson_loss": HalfPoissonLoss, "gamma_loss": HalfGammaLoss, "tweedie_loss": HalfTweedieLoss, - "binary_crossentropy": BinaryCrossEntropy, - "categorical_crossentropy": CategoricalCrossEntropy, + "binomial_loss": HalfBinomialLoss, + "multinomial_loss": HalfMultinomialLoss, } diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index b7ad4ae1f7bd1..e9af49a56591f 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -16,9 +16,9 @@ _LOSSES, BaseLoss, AbsoluteError, - BinaryCrossEntropy, - CategoricalCrossEntropy, + HalfBinomialLoss, HalfGammaLoss, + HalfMultinomialLoss, HalfPoissonLoss, HalfSquaredError, HalfTweedieLoss, @@ -149,8 +149,8 @@ def test_loss_boundary(loss): (HalfTweedieLoss(power=1.5), [0.1, 100], [-np.inf, -3, -0.1, np.inf]), (HalfTweedieLoss(power=2), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), (HalfTweedieLoss(power=3), [0.1, 100], [-np.inf, -3, -0.1, 0, np.inf]), - (BinaryCrossEntropy(), [0.1, 0.5, 0.9], [-np.inf, -1, 2, np.inf]), - (CategoricalCrossEntropy(), [], [-np.inf, -1, 1.1, np.inf]), + (HalfBinomialLoss(), [0.1, 0.5, 0.9], [-np.inf, -1, 2, np.inf]), + (HalfMultinomialLoss(), [], [-np.inf, -1, 1.1, np.inf]), ] # y_pred and y_true do not always have the same domain (valid value range). # Hence, we define extra sets of parameters for each of them. @@ -160,8 +160,8 @@ def test_loss_boundary(loss): (HalfTweedieLoss(power=-3), [-100, -0.1, 0], []), (HalfTweedieLoss(power=0), [-100, 0], []), (HalfTweedieLoss(power=1.5), [0], []), - (BinaryCrossEntropy(), [0, 1], []), - (CategoricalCrossEntropy(), [0.0, 1.0, 2], []), + (HalfBinomialLoss(), [0, 1], []), + (HalfMultinomialLoss(), [0.0, 1.0, 2], []), ] Y_PRED_PARAMS = [ # (loss, [y success], [y fail]) @@ -169,8 +169,8 @@ def test_loss_boundary(loss): (HalfTweedieLoss(power=-3), [], [-3, -0.1, 0]), (HalfTweedieLoss(power=0), [], [-3, -0.1, 0]), (HalfTweedieLoss(power=1.5), [], [0]), - (BinaryCrossEntropy(), [], [0, 1]), - (CategoricalCrossEntropy(), [0.1, 0.5], [0, 1]), + (HalfBinomialLoss(), [], [0, 1]), + (HalfMultinomialLoss(), [0.1, 0.5], [0, 1]), ] @@ -207,21 +207,21 @@ def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail): (HalfPoissonLoss(), 2.0, np.log(4), 4 - 2 * np.log(4)), (HalfGammaLoss(), 2.0, np.log(4), np.log(4) + 2 / 4), (HalfTweedieLoss(power=3), 2.0, np.log(4), -1 / 4 + 1 / 4 ** 2), - (BinaryCrossEntropy(), 0.25, np.log(4), np.log(5) - 0.25 * np.log(4)), + (HalfBinomialLoss(), 0.25, np.log(4), np.log(5) - 0.25 * np.log(4)), ( - CategoricalCrossEntropy(n_classes=3), + HalfMultinomialLoss(n_classes=3), 0.0, [0.2, 0.5, 0.3], logsumexp([0.2, 0.5, 0.3]) - 0.2, ), ( - CategoricalCrossEntropy(n_classes=3), + HalfMultinomialLoss(n_classes=3), 1.0, [0.2, 0.5, 0.3], logsumexp([0.2, 0.5, 0.3]) - 0.5, ), ( - CategoricalCrossEntropy(n_classes=3), + HalfMultinomialLoss(n_classes=3), 2.0, [0.2, 0.5, 0.3], logsumexp([0.2, 0.5, 0.3]) - 0.3, @@ -474,7 +474,7 @@ def test_loss_gradients_are_the_same(loss, sample_weight): assert np.shares_memory(g3, out_g3) if hasattr(loss, "gradient_proba"): - assert loss.is_multiclass # only for CategoricalCrossEntropy + assert loss.is_multiclass # only for HalfMultinomialLoss out_g4 = np.empty_like(raw_prediction) out_proba = 
np.empty_like(raw_prediction) g4, proba = loss.gradient_proba( @@ -605,7 +605,7 @@ def test_loss_of_perfect_prediction(loss, sample_weight): raw_prediction = np.array([-10, -0.1, 0, 0.1, 3, 10]) y_true = loss.link.inverse(raw_prediction) else: - # CategoricalCrossEntropy + # HalfMultinomialLoss y_true = np.arange(loss.n_classes).astype(float) # raw_prediction with entries -exp(10), but +exp(10) on the diagonal # this is close enough to np.inf which would produce nan @@ -729,12 +729,12 @@ def grad_func(x): ("squared_error", -2.0, 42), ("squared_error", 117.0, 1.05), ("squared_error", 0.0, 0.0), - # The argmin of binary_crossentropy for y_true=0 and y_true=1 is resp. + # The argmin of binomial_loss for y_true=0 and y_true=1 is resp. # -inf and +inf due to logit, cf. "complete separation". Therefore, we # use 0 < y_true < 1. - ("binary_crossentropy", 0.3, 0.1), - ("binary_crossentropy", -12, 0.2), - ("binary_crossentropy", 30, 0.9), + ("binomial_loss", 0.3, 0.1), + ("binomial_loss", -12, 0.2), + ("binomial_loss", 30, 0.9), ("poisson_loss", 12.0, 1.0), ("poisson_loss", 0.0, 2.0), ("poisson_loss", -22.0, 10.0), @@ -872,7 +872,7 @@ def fun(x): (HalfPoissonLoss(), np.mean, "poisson"), (HalfGammaLoss(), np.mean, "exponential"), (HalfTweedieLoss(), np.mean, "exponential"), - (BinaryCrossEntropy(), np.mean, "binomial"), + (HalfBinomialLoss(), np.mean, "binomial"), ], ) def test_specific_fit_intercept_only(loss, func, random_dist): @@ -906,11 +906,11 @@ def test_specific_fit_intercept_only(loss, func, random_dist): assert_all_finite(baseline_prediction) -def test_categorical_crossentropy_fit_intercept_only(): +def test_multinomial_loss_fit_intercept_only(): """Test that fit_intercept_only returns the mean functional for CCE.""" rng = np.random.RandomState(0) n_classes = 4 - loss = CategoricalCrossEntropy(n_classes=n_classes) + loss = HalfMultinomialLoss(n_classes=n_classes) # Same logic as test_specific_fit_intercept_only. Here inverse link # function = softmax and link function = log - symmetry term. 
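[Editor's note: sketch, not part of the patch, making the comment above concrete: with the symmetric parametrization the multinomial link is log(p) minus its per-row mean, so raw predictions sum to zero over classes, and the inverse link is the softmax. Assumes the private sklearn._loss.link module.]

    import numpy as np
    from sklearn._loss.link import MultinomialLogit  # private API

    link = MultinomialLogit()
    p = np.array([[0.2, 0.5, 0.3]])  # one sample, three classes
    raw = link.link(p)
    print(raw.sum(axis=1))                                    # ~[0.]
    print(np.log(p) - np.log(p).mean(axis=1, keepdims=True))  # same as raw
    print(link.inverse(raw.copy()))                           # softmax -> [[0.2, 0.5, 0.3]]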
y_train = rng.randint(0, n_classes + 1, size=100).astype(np.float64) @@ -929,20 +929,20 @@ def test_categorical_crossentropy_fit_intercept_only(): assert_all_finite(baseline_prediction) -def test_binary_and_categorical_crossentropy(): - """Test that CCE with n_classes = 2 is the same as BinaryCrossEntropy.""" +def test_binomial_and_multinomial_loss(): + """Test that multinomial loss with n_classes = 2 is the same as binomial loss.""" rng = np.random.RandomState(0) n_samples = 20 - bce = BinaryCrossEntropy() - cce = CategoricalCrossEntropy(n_classes=2) + binom = HalfBinomialLoss() + multinom = HalfMultinomialLoss(n_classes=2) y_train = rng.randint(0, 2, size=n_samples).astype(np.float64) raw_prediction = rng.normal(size=n_samples) - raw_cce = np.empty((n_samples, 2)) - raw_cce[:, 0] = -0.5 * raw_prediction - raw_cce[:, 1] = 0.5 * raw_prediction + raw_multinom = np.empty((n_samples, 2)) + raw_multinom[:, 0] = -0.5 * raw_prediction + raw_multinom[:, 1] = 0.5 * raw_prediction assert_allclose( - bce.loss(y_true=y_train, raw_prediction=raw_prediction), - cce.loss(y_true=y_train, raw_prediction=raw_cce), + binom.loss(y_true=y_train, raw_prediction=raw_prediction), + multinom.loss(y_true=y_train, raw_prediction=raw_multinom), ) From 98d479067c6c67a8a1613279e506009c9659fc54 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 20 Nov 2021 09:13:31 +0100 Subject: [PATCH 121/143] trigger CI From 3b9403fba5585d783265bae1c6931733b8ebc968 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 20 Nov 2021 11:11:08 +0100 Subject: [PATCH 122/143] TST aligned create_memmap_backed_data --- sklearn/_loss/tests/test_loss.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index e9af49a56591f..2ad5633037c4a 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -275,9 +275,10 @@ def test_loss_dtype( out2 = np.empty_like(raw_prediction, dtype=dtype_out) if readonly_memmap: - y_true, raw_prediction = create_memmap_backed_data([y_true, raw_prediction]) + y_true = create_memmap_backed_data(y_true, aligned=True) + raw_prediction = create_memmap_backed_data(raw_prediction, aligned=True) if sample_weight is not None: - sample_weight = create_memmap_backed_data(sample_weight) + sample_weight = create_memmap_backed_data(sample_weight, aligned=True) loss.loss( y_true=y_true, From f5949d3ca1ac6b7e1fbd262eee453b16b7fd6130 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 21 Nov 2021 19:19:10 +0100 Subject: [PATCH 123/143] FIX replace CyBinaryCrossEntropy by CyHalfBinomialLoss --- sklearn/_loss/_loss.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/_loss.pxd b/sklearn/_loss/_loss.pxd index e2e44ca712b35..7255243d331dc 100644 --- a/sklearn/_loss/_loss.pxd +++ b/sklearn/_loss/_loss.pxd @@ -69,7 +69,7 @@ cdef class CyHalfTweedieLoss(CyLossFunction): cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil -cdef class CyBinaryCrossEntropy(CyLossFunction): +cdef class CyHalfBinomialLoss(CyLossFunction): cdef double cy_loss(self, double y_true, double raw_prediction) nogil cdef double cy_gradient(self, double y_true, double raw_prediction) nogil cdef double_pair cy_grad_hess(self, double y_true, double raw_prediction) nogil From d96774042211c23ba259ee9991673679d9f9c082 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 21 Nov 2021 19:24:00 +0100 Subject: [PATCH 124/143] MNT remove Cython compiler directives due to 
#21512 --- sklearn/_loss/_loss.pyx.tp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sklearn/_loss/_loss.pyx.tp b/sklearn/_loss/_loss.pyx.tp index 4440b12114a42..7c343c2881975 100644 --- a/sklearn/_loss/_loss.pyx.tp +++ b/sklearn/_loss/_loss.pyx.tp @@ -162,11 +162,6 @@ WARNING: Do not edit `sklearn/_loss/_loss.pyx` file directly, as it is generated """ #------------------------------------------------------------------------------ -# cython: cdivision=True -# cython: boundscheck=False -# cython: wraparound=False -# cython: language_level=3 - # Design: # See https://github.com/scikit-learn/scikit-learn/issues/15123 for reasons. # a) Merge link functions into loss functions for speed and numerical From 64c2e331b5c62e2d6d563b4671aeb735fbd2d945 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 22 Nov 2021 07:27:19 +0100 Subject: [PATCH 125/143] ENH replace loss by common loss in HGBT --- .../_hist_gradient_boosting/_loss.pyx | 219 -------- .../gradient_boosting.py | 257 +++++++--- .../ensemble/_hist_gradient_boosting/loss.py | 466 ------------------ .../tests/test_gradient_boosting.py | 74 ++- .../tests/test_loss.py | 348 ------------- sklearn/ensemble/setup.py | 6 - 6 files changed, 260 insertions(+), 1110 deletions(-) delete mode 100644 sklearn/ensemble/_hist_gradient_boosting/_loss.pyx delete mode 100644 sklearn/ensemble/_hist_gradient_boosting/loss.py delete mode 100644 sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py diff --git a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx b/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx deleted file mode 100644 index 23e7d2841443b..0000000000000 --- a/sklearn/ensemble/_hist_gradient_boosting/_loss.pyx +++ /dev/null @@ -1,219 +0,0 @@ -# Author: Nicolas Hug - -cimport cython -from cython.parallel import prange -import numpy as np -cimport numpy as np - -from libc.math cimport exp, log - -from .common cimport Y_DTYPE_C -from .common cimport G_H_DTYPE_C - -np.import_array() - - -def _update_gradients_least_squares( - G_H_DTYPE_C [::1] gradients, # OUT - const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [::1] raw_predictions, # IN - int n_threads, # IN -): - - cdef: - int n_samples - int i - - n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): - # Note: a more correct expression is 2 * (raw_predictions - y_true) - # but since we use 1 for the constant hessian value (and not 2) this - # is strictly equivalent for the leaves values. - gradients[i] = raw_predictions[i] - y_true[i] - - -def _update_gradients_hessians_least_squares( - G_H_DTYPE_C [::1] gradients, # OUT - G_H_DTYPE_C [::1] hessians, # OUT - const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [::1] raw_predictions, # IN - const Y_DTYPE_C [::1] sample_weight, # IN - int n_threads, # IN -): - - cdef: - int n_samples - int i - - n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): - # Note: a more correct exp is 2 * (raw_predictions - y_true) * sample_weight - # but since we use 1 for the constant hessian value (and not 2) this - # is strictly equivalent for the leaves values. 
- gradients[i] = (raw_predictions[i] - y_true[i]) * sample_weight[i] - hessians[i] = sample_weight[i] - - -def _update_gradients_hessians_least_absolute_deviation( - G_H_DTYPE_C [::1] gradients, # OUT - G_H_DTYPE_C [::1] hessians, # OUT - const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [::1] raw_predictions, # IN - const Y_DTYPE_C [::1] sample_weight, # IN - int n_threads, # IN -): - cdef: - int n_samples - int i - - n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): - # gradient = sign(raw_predicition - y_pred) * sample_weight - gradients[i] = sample_weight[i] * (2 * - (y_true[i] - raw_predictions[i] < 0) - 1) - hessians[i] = sample_weight[i] - - -def _update_gradients_least_absolute_deviation( - G_H_DTYPE_C [::1] gradients, # OUT - const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [::1] raw_predictions, # IN - int n_threads, # IN -): - cdef: - int n_samples - int i - - n_samples = raw_predictions.shape[0] - for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): - # gradient = sign(raw_predicition - y_pred) - gradients[i] = 2 * (y_true[i] - raw_predictions[i] < 0) - 1 - - -def _update_gradients_hessians_poisson( - G_H_DTYPE_C [::1] gradients, # OUT - G_H_DTYPE_C [::1] hessians, # OUT - const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [::1] raw_predictions, # IN - const Y_DTYPE_C [::1] sample_weight, # IN - int n_threads, # IN -): - cdef: - int n_samples - int i - Y_DTYPE_C y_pred - - n_samples = raw_predictions.shape[0] - if sample_weight is None: - for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): - # Note: We use only half of the deviance loss. Therefore, there is - # no factor of 2. - y_pred = exp(raw_predictions[i]) - gradients[i] = (y_pred - y_true[i]) - hessians[i] = y_pred - else: - for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): - # Note: We use only half of the deviance loss. Therefore, there is - # no factor of 2. - y_pred = exp(raw_predictions[i]) - gradients[i] = (y_pred - y_true[i]) * sample_weight[i] - hessians[i] = y_pred * sample_weight[i] - - -def _update_gradients_hessians_binary_crossentropy( - G_H_DTYPE_C [::1] gradients, # OUT - G_H_DTYPE_C [::1] hessians, # OUT - const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [::1] raw_predictions, # IN - const Y_DTYPE_C [::1] sample_weight, # IN - int n_threads, # IN -): - cdef: - int n_samples - Y_DTYPE_C p_i # proba that ith sample belongs to positive class - int i - - n_samples = raw_predictions.shape[0] - if sample_weight is None: - for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): - p_i = _cexpit(raw_predictions[i]) - gradients[i] = p_i - y_true[i] - hessians[i] = p_i * (1. - p_i) - else: - for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): - p_i = _cexpit(raw_predictions[i]) - gradients[i] = (p_i - y_true[i]) * sample_weight[i] - hessians[i] = p_i * (1. - p_i) * sample_weight[i] - - -def _update_gradients_hessians_categorical_crossentropy( - G_H_DTYPE_C [:, ::1] gradients, # OUT - G_H_DTYPE_C [:, ::1] hessians, # OUT - const Y_DTYPE_C [::1] y_true, # IN - const Y_DTYPE_C [:, ::1] raw_predictions, # IN - const Y_DTYPE_C [::1] sample_weight, # IN - int n_threads, # IN -): - cdef: - int prediction_dim = raw_predictions.shape[0] - int n_samples = raw_predictions.shape[1] - int k # class index - int i # sample index - Y_DTYPE_C sw - # p[i, k] is the probability that class(ith sample) == k. 
- # It's the softmax of the raw predictions - Y_DTYPE_C [:, ::1] p = np.empty(shape=(n_samples, prediction_dim)) - Y_DTYPE_C p_i_k - - if sample_weight is None: - for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): - # first compute softmaxes of sample i for each class - for k in range(prediction_dim): - p[i, k] = raw_predictions[k, i] # prepare softmax - _compute_softmax(p, i) - # then update gradients and hessians - for k in range(prediction_dim): - p_i_k = p[i, k] - gradients[k, i] = p_i_k - (y_true[i] == k) - hessians[k, i] = p_i_k * (1. - p_i_k) - else: - for i in prange(n_samples, schedule='static', nogil=True, num_threads=n_threads): - # first compute softmaxes of sample i for each class - for k in range(prediction_dim): - p[i, k] = raw_predictions[k, i] # prepare softmax - _compute_softmax(p, i) - # then update gradients and hessians - sw = sample_weight[i] - for k in range(prediction_dim): - p_i_k = p[i, k] - gradients[k, i] = (p_i_k - (y_true[i] == k)) * sw - hessians[k, i] = (p_i_k * (1. - p_i_k)) * sw - - -cdef inline void _compute_softmax(Y_DTYPE_C [:, ::1] p, const int i) nogil: - """Compute softmaxes of values in p[i, :].""" - # i needs to be passed (and stays constant) because otherwise Cython does - # not generate optimal code - - cdef: - Y_DTYPE_C max_value = p[i, 0] - Y_DTYPE_C sum_exps = 0. - unsigned int k - unsigned prediction_dim = p.shape[1] - - # Compute max value of array for numerical stability - for k in range(1, prediction_dim): - if max_value < p[i, k]: - max_value = p[i, k] - - for k in range(prediction_dim): - p[i, k] = exp(p[i, k] - max_value) - sum_exps += p[i, k] - - for k in range(prediction_dim): - p[i, k] /= sum_exps - - -cdef inline Y_DTYPE_C _cexpit(const Y_DTYPE_C x) nogil: - """Custom expit (logistic sigmoid function)""" - return 1. / (1. + exp(-x)) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 097ceeeadc588..5d23852334b4f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -7,6 +7,15 @@ import numpy as np from timeit import default_timer as time +from ..._loss.loss import ( + _LOSSES, + BaseLoss, + AbsoluteError, + HalfBinomialLoss, + HalfMultinomialLoss, + HalfPoissonLoss, + HalfSquaredError, +) from ...base import BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier from ...utils import check_random_state, resample from ...utils.validation import ( @@ -20,12 +29,95 @@ from ...model_selection import train_test_split from ...preprocessing import LabelEncoder from ._gradient_boosting import _update_raw_predictions -from .common import Y_DTYPE, X_DTYPE, X_BINNED_DTYPE +from .common import Y_DTYPE, X_DTYPE, X_BINNED_DTYPE, G_H_DTYPE from .binning import _BinMapper from .grower import TreeGrower -from .loss import _LOSSES -from .loss import BaseLoss + + +_LOSSES = _LOSSES.copy() +# TODO: Remove least_squares and least_absolute_deviation in v1.2 +_LOSSES.update( + { + "least_squares": HalfSquaredError, + "least_absolute_deviation": AbsoluteError, + "poisson": HalfPoissonLoss, + "binary_crossentropy": HalfBinomialLoss, + "categorical_crossentropy": HalfMultinomialLoss, + } +) + + +def _init_gradients_and_hessians(constant_hessian, n_samples, prediction_dim): + """Return initial gradients and hessians. + + Unless hessians are constant, arrays are initialized with undefined values. 
+ + Parameters + ---------- + constant_hessian : bool + Usual input is loss.constant_hessian. + n_samples : int + The number of samples passed to `fit()`. + prediction_dim : int + The dimension of a raw prediction, i.e. the number of trees + built at each iteration. Equals 1 for regression and binary + classification, or K where K is the number of classes for + multiclass classification. + + Returns + ------- + gradients : ndarray, shape (prediction_dim, n_samples) + The initial gradients. The array is not initialized. + hessians : ndarray, shape (prediction_dim, n_samples) + If hessians are constant (e.g. for `LeastSquares` loss, the + array is initialized to ``1``. Otherwise, the array is allocated + without being initialized. + """ + shape = (prediction_dim, n_samples) + gradients = np.empty(shape=shape, dtype=G_H_DTYPE) + + if constant_hessian: + # If the hessians are constant, we consider they are equal to 1. + # - This is correct for the half LS loss + # - For the Absolute Error, hessians are actually 0, but they are + # always ignored anyway. + hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE) + else: + hessians = np.empty(shape=shape, dtype=G_H_DTYPE) + + return gradients, hessians + + +def _update_leaves_values(loss, grower, y_true, raw_prediction, sample_weight): + """Update the leaf values to be predicted by the tree. + + Update equals: + loss.fit_intercept_only(y_true - raw_predictions) + + This is only applied if loss.need_update_leaves_values is True. + Note: It only works, if the loss is a function of the residual, as is the + case for AbsoluteError and PinballLoss. Otherwise, one would need to get + the minimum of loss(y_true, raw_prediction + x) in x. A few examples: + - AbsoluteError: median(y_true - raw_predictions). + - PinballLoss: quantile(y_true - raw_predictions). + See also notes about need_update_leaves_values in BaseLoss. + """ + # TODO: Ideally this should be computed in parallel over the leaves using something + # similar to _update_raw_predictions(), but this requires a cython version of + # median(). + for leaf in grower.finalized_leaves: + indices = leaf.sample_indices + if sample_weight is None: + sw = None + else: + sw = sample_weight[indices] + update = loss.fit_intercept_only( + y_true=y_true[indices] - raw_prediction[indices], + sample_weight=sw, + ) + leaf.value = grower.shrinkage * update + # Note that the regularization is ignored here class BaseHistGradientBoosting(BaseEstimator, ABC): @@ -270,9 +362,7 @@ def fit(self, X, y, sample_weight=None): n_threads = _openmp_effective_n_threads() if isinstance(self.loss, str): - self._loss = self._get_loss( - sample_weight=sample_weight, n_threads=n_threads - ) + self._loss = self._get_loss(sample_weight=sample_weight) elif isinstance(self.loss, BaseLoss): self._loss = self.loss @@ -285,6 +375,7 @@ def fit(self, X, y, sample_weight=None): self._use_validation_data = self.validation_fraction is not None if self.do_early_stopping_ and self._use_validation_data: # stratify for classification + # instead of checking predict_proba, loss.n_classes >= 2 would also work stratify = y if hasattr(self._loss, "predict_proba") else None # Save the state of the RNG for the training and validation split. @@ -366,14 +457,16 @@ def fit(self, X, y, sample_weight=None): # shape (n_trees_per_iteration, n_samples) where # n_trees_per_iterations is n_classes in multiclass classification, # else 1. 
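A sketch of what `_update_leaves_values` above amounts to for a single leaf with an absolute-error style loss, assuming `fit_intercept_only` of the residuals reduces to their (weighted) median as described in the docstring; the weighted branch here is only a rough stand-in for a proper weighted percentile:

```python
import numpy as np

def refit_leaf_value(y_true, raw_prediction, sample_indices, shrinkage,
                     sample_weight=None):
    """Recompute one leaf value as shrinkage * median of the leaf residuals."""
    residual = y_true[sample_indices] - raw_prediction[sample_indices]
    if sample_weight is None:
        update = np.median(residual)
    else:
        # crude weighted median: the residual at 50% of the cumulative weight,
        # assuming strictly positive weights
        order = np.argsort(residual)
        cum_w = np.cumsum(sample_weight[sample_indices][order])
        update = residual[order][np.searchsorted(cum_w, 0.5 * cum_w[-1])]
    return shrinkage * update

y = np.array([1.0, 2.0, 10.0, 3.0])
raw = np.zeros(4)
print(refit_leaf_value(y, raw, sample_indices=np.array([0, 1, 3]), shrinkage=0.1))
```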
- self._baseline_prediction = self._loss.get_baseline_prediction( - y_train, sample_weight_train, self.n_trees_per_iteration_ - ) + self._baseline_prediction = np.atleast_1d( + self._loss.fit_intercept_only( + y_true=y_train, sample_weight=sample_weight_train + ) + ).T raw_predictions = np.zeros( shape=(self.n_trees_per_iteration_, n_samples), dtype=self._baseline_prediction.dtype, ) - raw_predictions += self._baseline_prediction + raw_predictions += self._baseline_prediction[:, None] # predictors is a matrix (list of lists) of TreePredictor objects # with shape (n_iter_, n_trees_per_iteration) @@ -405,15 +498,16 @@ def fit(self, X, y, sample_weight=None): dtype=self._baseline_prediction.dtype, ) - raw_predictions_val += self._baseline_prediction + raw_predictions_val += self._baseline_prediction[:, None] self._check_early_stopping_loss( - raw_predictions, - y_train, - sample_weight_train, - raw_predictions_val, - y_val, - sample_weight_val, + raw_predictions=raw_predictions, + y_train=y_train, + sample_weight_train=sample_weight_train, + raw_predictions_val=raw_predictions_val, + y_val=y_val, + sample_weight_val=sample_weight_val, + n_threads=n_threads, ) else: self._scorer = check_scoring(self, self.scoring) @@ -483,10 +577,10 @@ def fit(self, X, y, sample_weight=None): # initialize gradients and hessians (empty arrays). # shape = (n_trees_per_iteration, n_samples). - gradients, hessians = self._loss.init_gradients_and_hessians( + gradients, hessians = _init_gradients_and_hessians( + constant_hessian=self._loss.constant_hessian, n_samples=n_samples, prediction_dim=self.n_trees_per_iteration_, - sample_weight=sample_weight_train, ) for iteration in range(begin_at_stage, self.max_iter): @@ -498,9 +592,26 @@ def fit(self, X, y, sample_weight=None): ) # Update gradients and hessians, inplace - self._loss.update_gradients_and_hessians( - gradients, hessians, y_train, raw_predictions, sample_weight_train - ) + # Note that self._loss expects shape (n_samples,) for + # n_trees_per_iteration = 1 else shape (n_samples, n_trees_per_iteration). + # T (transpose) returns a view. 
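The shape bookkeeping above, as a small sketch: the baseline prediction is kept as a column that broadcasts over the sample axis, and the transposed arrays handed to the new loss module are views rather than copies:

```python
import numpy as np

n_classes, n_samples = 3, 5
baseline_prediction = np.array([0.1, -0.2, 0.3]).reshape((-1, 1))  # (n_classes, 1)

raw_predictions = np.zeros((n_classes, n_samples))
raw_predictions += baseline_prediction        # broadcasts over samples

# The loss functions expect (n_samples, n_classes); transposing gives that
# layout as a view, so no data is copied.
assert raw_predictions.T.shape == (n_samples, n_classes)
assert raw_predictions.T.base is raw_predictions
```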
+ if self._loss.constant_hessian: + self._loss.gradient( + y_true=y_train, + raw_prediction=raw_predictions.T, + sample_weight=sample_weight_train, + gradient_out=gradients.T, + n_threads=n_threads, + ) + else: + self._loss.gradient_hessian( + y_true=y_train, + raw_prediction=raw_predictions.T, + sample_weight=sample_weight_train, + gradient_out=gradients.T, + hessian_out=hessians.T, + n_threads=n_threads, + ) # Append a list since there may be more than 1 predictor per iter predictors.append([]) @@ -530,8 +641,12 @@ def fit(self, X, y, sample_weight=None): acc_compute_hist_time += grower.total_compute_hist_time if self._loss.need_update_leaves_values: - self._loss.update_leaves_values( - grower, y_train, raw_predictions[k, :], sample_weight_train + _update_leaves_values( + loss=self._loss, + grower=grower, + y_true=y_train, + raw_prediction=raw_predictions[k, :], + sample_weight=sample_weight_train, ) predictor = grower.make_predictor( @@ -559,12 +674,13 @@ def fit(self, X, y, sample_weight=None): ) should_early_stop = self._check_early_stopping_loss( - raw_predictions, - y_train, - sample_weight_train, - raw_predictions_val, - y_val, - sample_weight_val, + raw_predictions=raw_predictions, + y_train=y_train, + sample_weight_train=sample_weight_train, + raw_predictions_val=raw_predictions_val, + y_val=y_val, + sample_weight_val=sample_weight_val, + n_threads=n_threads, ) else: @@ -715,19 +831,29 @@ def _check_early_stopping_loss( raw_predictions_val, y_val, sample_weight_val, + n_threads=1, ): """Check if fitting should be early-stopped based on loss. Scores are computed on validation data or on training data. """ - self.train_score_.append( - -self._loss(y_train, raw_predictions, sample_weight_train) + -self._loss( + y_true=y_train, + raw_prediction=raw_predictions.T, + sample_weight=sample_weight_train, + n_threads=n_threads, + ) ) if self._use_validation_data: self.validation_score_.append( - -self._loss(y_val, raw_predictions_val, sample_weight_val) + -self._loss( + y_true=y_val, + raw_prediction=raw_predictions_val.T, + sample_weight=sample_weight_val, + n_threads=n_threads, + ) ) return self._should_stop(self.validation_score_) else: @@ -855,7 +981,7 @@ def _raw_predict(self, X, n_threads=None): shape=(self.n_trees_per_iteration_, n_samples), dtype=self._baseline_prediction.dtype, ) - raw_predictions += self._baseline_prediction + raw_predictions += self._baseline_prediction[:, None] # We intentionally decouple the number of threads used at prediction # time from the number of threads used at fit time because the model @@ -921,7 +1047,7 @@ def _staged_raw_predict(self, X): shape=(self.n_trees_per_iteration_, n_samples), dtype=self._baseline_prediction.dtype, ) - raw_predictions += self._baseline_prediction + raw_predictions += self._baseline_prediction[:, None] # We intentionally decouple the number of threads used at prediction # time from the number of threads used at fit time because the model @@ -983,7 +1109,7 @@ def _more_tags(self): return {"allow_nan": True} @abstractmethod - def _get_loss(self, sample_weight, n_threads): + def _get_loss(self, sample_weight): pass @abstractmethod @@ -1261,7 +1387,8 @@ def predict(self, X): check_is_fitted(self) # Return inverse link of raw predictions after converting # shape (n_samples, 1) to (n_samples,) - return self._loss.inverse_link_function(self._raw_predict(X).ravel()) + # loss.link.inverse is the inverse link function + return self._loss.link.inverse(self._raw_predict(X).ravel()) def staged_predict(self, X): """Predict regression 
target for each iteration. @@ -1282,7 +1409,7 @@ def staged_predict(self, X): The predicted values of the input samples, for each iteration. """ for raw_predictions in self._staged_raw_predict(X): - yield self._loss.inverse_link_function(raw_predictions.ravel()) + yield self._loss.link.inverse(raw_predictions.ravel()) def _encode_y(self, y): # Just convert y to the expected dtype @@ -1296,7 +1423,7 @@ def _encode_y(self, y): ) return y - def _get_loss(self, sample_weight, n_threads): + def _get_loss(self, sample_weight): # TODO: Remove in v1.2 if self.loss == "least_squares": warnings.warn( @@ -1305,9 +1432,7 @@ def _get_loss(self, sample_weight, n_threads): "equivalent.", FutureWarning, ) - return _LOSSES["squared_error"]( - sample_weight=sample_weight, n_threads=n_threads - ) + return _LOSSES["squared_error"](sample_weight=sample_weight) elif self.loss == "least_absolute_deviation": warnings.warn( "The loss 'least_absolute_deviation' was deprecated in v1.0 " @@ -1315,11 +1440,9 @@ def _get_loss(self, sample_weight, n_threads): "which is equivalent.", FutureWarning, ) - return _LOSSES["absolute_error"]( - sample_weight=sample_weight, n_threads=n_threads - ) + return _LOSSES["absolute_error"](sample_weight=sample_weight) - return _LOSSES[self.loss](sample_weight=sample_weight, n_threads=n_threads) + return _LOSSES[self.loss](sample_weight=sample_weight) class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting): @@ -1606,7 +1729,7 @@ def predict_proba(self, X): The class probabilities of the input samples. """ raw_predictions = self._raw_predict(X) - return self._loss.predict_proba(raw_predictions) + return self._loss.predict_proba(raw_predictions.T) def staged_predict_proba(self, X): """Predict class probabilities at each iteration. @@ -1626,7 +1749,7 @@ def staged_predict_proba(self, X): for each iteration. """ for raw_predictions in self._staged_raw_predict(X): - yield self._loss.predict_proba(raw_predictions) + yield self._loss.predict_proba(raw_predictions.T) def decision_function(self, X): """Compute the decision function of ``X``. @@ -1688,22 +1811,34 @@ def _encode_y(self, y): encoded_y = encoded_y.astype(Y_DTYPE, copy=False) return encoded_y - def _get_loss(self, sample_weight, n_threads): - if self.loss == "categorical_crossentropy" and self.n_trees_per_iteration_ == 1: - raise ValueError( - "'categorical_crossentropy' is not suitable for " - "a binary classification problem. Please use " - "'auto' or 'binary_crossentropy' instead." - ) - + def _get_loss(self, sample_weight): if self.loss == "auto": if self.n_trees_per_iteration_ == 1: - return _LOSSES["binary_crossentropy"]( - sample_weight=sample_weight, n_threads=n_threads - ) + return _LOSSES["binary_crossentropy"](sample_weight=sample_weight) else: return _LOSSES["categorical_crossentropy"]( - sample_weight=sample_weight, n_threads=n_threads + sample_weight=sample_weight, + n_classes=self.n_trees_per_iteration_, ) - return _LOSSES[self.loss](sample_weight=sample_weight, n_threads=n_threads) + if self.loss == "categorical_crossentropy": + if self.n_trees_per_iteration_ == 1: + raise ValueError( + "'categorical_crossentropy' is not suitable for " + "a binary classification problem. Please use " + "'auto' or 'binary_crossentropy' instead." 
+ ) + else: + return _LOSSES[self.loss]( + sample_weight=sample_weight, n_classes=self.n_trees_per_iteration_ + ) + else: + if self.n_trees_per_iteration_ > 1: + raise ValueError( + "loss='binary_crossentropy' is not defined for multiclass" + " classification with n_classes=" + f"{self.n_trees_per_iteration_}, use loss=" + "'categorical_crossentropy' instead" + ) + else: + return _LOSSES[self.loss](sample_weight=sample_weight) diff --git a/sklearn/ensemble/_hist_gradient_boosting/loss.py b/sklearn/ensemble/_hist_gradient_boosting/loss.py deleted file mode 100644 index c5870f97f900e..0000000000000 --- a/sklearn/ensemble/_hist_gradient_boosting/loss.py +++ /dev/null @@ -1,466 +0,0 @@ -""" -This module contains the loss classes. - -Specific losses are used for regression, binary classification or multiclass -classification. -""" -# Author: Nicolas Hug - -from abc import ABC, abstractmethod - -import numpy as np -from scipy.special import expit, logsumexp, xlogy - -from .common import Y_DTYPE -from .common import G_H_DTYPE -from ._loss import _update_gradients_least_squares -from ._loss import _update_gradients_hessians_least_squares -from ._loss import _update_gradients_least_absolute_deviation -from ._loss import _update_gradients_hessians_least_absolute_deviation -from ._loss import _update_gradients_hessians_binary_crossentropy -from ._loss import _update_gradients_hessians_categorical_crossentropy -from ._loss import _update_gradients_hessians_poisson -from ...utils._openmp_helpers import _openmp_effective_n_threads -from ...utils.stats import _weighted_percentile - - -class BaseLoss(ABC): - """Base class for a loss.""" - - def __init__(self, hessians_are_constant, n_threads=None): - self.hessians_are_constant = hessians_are_constant - self.n_threads = _openmp_effective_n_threads(n_threads) - - def __call__(self, y_true, raw_predictions, sample_weight): - """Return the weighted average loss""" - return np.average( - self.pointwise_loss(y_true, raw_predictions), weights=sample_weight - ) - - @abstractmethod - def pointwise_loss(self, y_true, raw_predictions): - """Return loss value for each input""" - - # This variable indicates whether the loss requires the leaves values to - # be updated once the tree has been trained. The trees are trained to - # predict a Newton-Raphson step (see grower._finalize_leaf()). But for - # some losses (e.g. least absolute deviation) we need to adjust the tree - # values to account for the "line search" of the gradient descent - # procedure. See the original paper Greedy Function Approximation: A - # Gradient Boosting Machine by Friedman - # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. - need_update_leaves_values = False - - def init_gradients_and_hessians(self, n_samples, prediction_dim, sample_weight): - """Return initial gradients and hessians. - - Unless hessians are constant, arrays are initialized with undefined - values. - - Parameters - ---------- - n_samples : int - The number of samples passed to `fit()`. - - prediction_dim : int - The dimension of a raw prediction, i.e. the number of trees - built at each iteration. Equals 1 for regression and binary - classification, or K where K is the number of classes for - multiclass classification. - - sample_weight : array-like of shape(n_samples,) default=None - Weights of training data. - - Returns - ------- - gradients : ndarray, shape (prediction_dim, n_samples) - The initial gradients. The array is not initialized. 
- hessians : ndarray, shape (prediction_dim, n_samples) - If hessians are constant (e.g. for `LeastSquares` loss, the - array is initialized to ``1``. Otherwise, the array is allocated - without being initialized. - """ - shape = (prediction_dim, n_samples) - gradients = np.empty(shape=shape, dtype=G_H_DTYPE) - - if self.hessians_are_constant: - # If the hessians are constant, we consider they are equal to 1. - # - This is correct for the half LS loss - # - For LAD loss, hessians are actually 0, but they are always - # ignored anyway. - hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE) - else: - hessians = np.empty(shape=shape, dtype=G_H_DTYPE) - - return gradients, hessians - - @abstractmethod - def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): - """Return initial predictions (before the first iteration). - - Parameters - ---------- - y_train : ndarray, shape (n_samples,) - The target training values. - - sample_weight : array-like of shape(n_samples,) default=None - Weights of training data. - - prediction_dim : int - The dimension of one prediction: 1 for binary classification and - regression, n_classes for multiclass classification. - - Returns - ------- - baseline_prediction : float or ndarray, shape (1, prediction_dim) - The baseline prediction. - """ - - @abstractmethod - def update_gradients_and_hessians( - self, gradients, hessians, y_true, raw_predictions, sample_weight - ): - """Update gradients and hessians arrays, inplace. - - The gradients (resp. hessians) are the first (resp. second) order - derivatives of the loss for each sample with respect to the - predictions of model, evaluated at iteration ``i - 1``. - - Parameters - ---------- - gradients : ndarray, shape (prediction_dim, n_samples) - The gradients (treated as OUT array). - - hessians : ndarray, shape (prediction_dim, n_samples) or \ - (1,) - The hessians (treated as OUT array). - - y_true : ndarray, shape (n_samples,) - The true target values or each training sample. - - raw_predictions : ndarray, shape (prediction_dim, n_samples) - The raw_predictions (i.e. values from the trees) of the tree - ensemble at iteration ``i - 1``. - - sample_weight : array-like of shape(n_samples,) default=None - Weights of training data. - """ - - -class LeastSquares(BaseLoss): - """Least squares loss, for regression. - - For a given sample x_i, least squares loss is defined as:: - - loss(x_i) = 0.5 * (y_true_i - raw_pred_i)**2 - - This actually computes the half least squares loss to simplify - the computation of the gradients and get a unit hessian (and be consistent - with what is done in LightGBM). - """ - - def __init__(self, sample_weight, n_threads=None): - # If sample weights are provided, the hessians and gradients - # are multiplied by sample_weight, which means the hessians are - # equal to sample weights. - super().__init__( - hessians_are_constant=sample_weight is None, n_threads=n_threads - ) - - def pointwise_loss(self, y_true, raw_predictions): - # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to - # return a view. 
- raw_predictions = raw_predictions.reshape(-1) - loss = 0.5 * np.power(y_true - raw_predictions, 2) - return loss - - def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): - return np.average(y_train, weights=sample_weight) - - @staticmethod - def inverse_link_function(raw_predictions): - return raw_predictions - - def update_gradients_and_hessians( - self, gradients, hessians, y_true, raw_predictions, sample_weight - ): - # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to - # return a view. - raw_predictions = raw_predictions.reshape(-1) - gradients = gradients.reshape(-1) - if sample_weight is None: - _update_gradients_least_squares( - gradients, y_true, raw_predictions, self.n_threads - ) - else: - hessians = hessians.reshape(-1) - _update_gradients_hessians_least_squares( - gradients, - hessians, - y_true, - raw_predictions, - sample_weight, - self.n_threads, - ) - - -class LeastAbsoluteDeviation(BaseLoss): - """Least absolute deviation, for regression. - - For a given sample x_i, the loss is defined as:: - - loss(x_i) = |y_true_i - raw_pred_i| - """ - - def __init__(self, sample_weight, n_threads=None): - # If sample weights are provided, the hessians and gradients - # are multiplied by sample_weight, which means the hessians are - # equal to sample weights. - super().__init__( - hessians_are_constant=sample_weight is None, n_threads=n_threads - ) - - # This variable indicates whether the loss requires the leaves values to - # be updated once the tree has been trained. The trees are trained to - # predict a Newton-Raphson step (see grower._finalize_leaf()). But for - # some losses (e.g. least absolute deviation) we need to adjust the tree - # values to account for the "line search" of the gradient descent - # procedure. See the original paper Greedy Function Approximation: A - # Gradient Boosting Machine by Friedman - # (https://statweb.stanford.edu/~jhf/ftp/trebst.pdf) for the theory. - need_update_leaves_values = True - - def pointwise_loss(self, y_true, raw_predictions): - # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to - # return a view. - raw_predictions = raw_predictions.reshape(-1) - loss = np.abs(y_true - raw_predictions) - return loss - - def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): - if sample_weight is None: - return np.median(y_train) - else: - return _weighted_percentile(y_train, sample_weight, 50) - - @staticmethod - def inverse_link_function(raw_predictions): - return raw_predictions - - def update_gradients_and_hessians( - self, gradients, hessians, y_true, raw_predictions, sample_weight - ): - # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to - # return a view. - raw_predictions = raw_predictions.reshape(-1) - gradients = gradients.reshape(-1) - if sample_weight is None: - _update_gradients_least_absolute_deviation( - gradients, - y_true, - raw_predictions, - self.n_threads, - ) - else: - hessians = hessians.reshape(-1) - _update_gradients_hessians_least_absolute_deviation( - gradients, - hessians, - y_true, - raw_predictions, - sample_weight, - self.n_threads, - ) - - def update_leaves_values(self, grower, y_true, raw_predictions, sample_weight): - # Update the values predicted by the tree with - # median(y_true - raw_predictions). - # See note about need_update_leaves_values in BaseLoss. 
- - # TODO: ideally this should be computed in parallel over the leaves - # using something similar to _update_raw_predictions(), but this - # requires a cython version of median() - for leaf in grower.finalized_leaves: - indices = leaf.sample_indices - if sample_weight is None: - median_res = np.median(y_true[indices] - raw_predictions[indices]) - else: - median_res = _weighted_percentile( - y_true[indices] - raw_predictions[indices], - sample_weight=sample_weight[indices], - percentile=50, - ) - leaf.value = grower.shrinkage * median_res - # Note that the regularization is ignored here - - -class Poisson(BaseLoss): - """Poisson deviance loss with log-link, for regression. - - For a given sample x_i, Poisson deviance loss is defined as:: - - loss(x_i) = y_true_i * log(y_true_i/exp(raw_pred_i)) - - y_true_i + exp(raw_pred_i)) - - This actually computes half the Poisson deviance to simplify - the computation of the gradients. - """ - - def __init__(self, sample_weight, n_threads=None): - super().__init__(hessians_are_constant=False, n_threads=n_threads) - - inverse_link_function = staticmethod(np.exp) - - def pointwise_loss(self, y_true, raw_predictions): - # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to - # return a view. - raw_predictions = raw_predictions.reshape(-1) - # TODO: For speed, we could remove the constant xlogy(y_true, y_true) - # Advantage of this form: minimum of zero at raw_predictions = y_true. - loss = ( - xlogy(y_true, y_true) - - y_true * (raw_predictions + 1) - + np.exp(raw_predictions) - ) - return loss - - def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): - y_pred = np.average(y_train, weights=sample_weight) - eps = np.finfo(y_train.dtype).eps - y_pred = np.clip(y_pred, eps, None) - return np.log(y_pred) - - def update_gradients_and_hessians( - self, gradients, hessians, y_true, raw_predictions, sample_weight - ): - # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to - # return a view. - raw_predictions = raw_predictions.reshape(-1) - gradients = gradients.reshape(-1) - hessians = hessians.reshape(-1) - _update_gradients_hessians_poisson( - gradients, - hessians, - y_true, - raw_predictions, - sample_weight, - self.n_threads, - ) - - -class BinaryCrossEntropy(BaseLoss): - """Binary cross-entropy loss, for binary classification. - - For a given sample x_i, the binary cross-entropy loss is defined as the - negative log-likelihood of the model which can be expressed as:: - - loss(x_i) = log(1 + exp(raw_pred_i)) - y_true_i * raw_pred_i - - See The Elements of Statistical Learning, by Hastie, Tibshirani, Friedman, - section 4.4.1 (about logistic regression). - """ - - def __init__(self, sample_weight, n_threads=None): - super().__init__(hessians_are_constant=False, n_threads=n_threads) - - inverse_link_function = staticmethod(expit) - - def pointwise_loss(self, y_true, raw_predictions): - # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to - # return a view. 
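A quick check of the derivatives behind the removed `Poisson` loss above: for the half deviance, the gradient with respect to the raw prediction is exp(raw) - y and the hessian is exp(raw), which is what the corresponding Cython kernel computed; a small finite-difference sketch:

```python
import numpy as np
from scipy.special import xlogy

def half_poisson_deviance(y_true, raw):
    # same pointwise form as the removed Poisson.pointwise_loss
    return xlogy(y_true, y_true) - y_true * (raw + 1) + np.exp(raw)

y, raw, eps = 3.0, 0.7, 1e-6
numeric_gradient = (
    half_poisson_deviance(y, raw + eps) - half_poisson_deviance(y, raw - eps)
) / (2 * eps)
analytic_gradient = np.exp(raw) - y      # gradient used by the removed kernel
analytic_hessian = np.exp(raw)           # hessian used by the removed kernel
assert np.isclose(numeric_gradient, analytic_gradient, rtol=1e-5)
```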
- raw_predictions = raw_predictions.reshape(-1) - # logaddexp(0, x) = log(1 + exp(x)) - loss = np.logaddexp(0, raw_predictions) - y_true * raw_predictions - return loss - - def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): - if prediction_dim > 2: - raise ValueError( - "loss='binary_crossentropy' is not defined for multiclass" - " classification with n_classes=%d, use" - " loss='categorical_crossentropy' instead" % prediction_dim - ) - proba_positive_class = np.average(y_train, weights=sample_weight) - eps = np.finfo(y_train.dtype).eps - proba_positive_class = np.clip(proba_positive_class, eps, 1 - eps) - # log(x / 1 - x) is the anti function of sigmoid, or the link function - # of the Binomial model. - return np.log(proba_positive_class / (1 - proba_positive_class)) - - def update_gradients_and_hessians( - self, gradients, hessians, y_true, raw_predictions, sample_weight - ): - # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to - # return a view. - raw_predictions = raw_predictions.reshape(-1) - gradients = gradients.reshape(-1) - hessians = hessians.reshape(-1) - _update_gradients_hessians_binary_crossentropy( - gradients, hessians, y_true, raw_predictions, sample_weight, self.n_threads - ) - - def predict_proba(self, raw_predictions): - # shape (1, n_samples) --> (n_samples,). reshape(-1) is more likely to - # return a view. - raw_predictions = raw_predictions.reshape(-1) - proba = np.empty((raw_predictions.shape[0], 2), dtype=Y_DTYPE) - proba[:, 1] = expit(raw_predictions) - proba[:, 0] = 1 - proba[:, 1] - return proba - - -class CategoricalCrossEntropy(BaseLoss): - """Categorical cross-entropy loss, for multiclass classification. - - For a given sample x_i, the categorical cross-entropy loss is defined as - the negative log-likelihood of the model and generalizes the binary - cross-entropy to more than 2 classes. 
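For comparison with the removed `BinaryCrossEntropy.predict_proba` above and the classifier's new `self._loss.predict_proba(raw_predictions.T)` call, a sketch of how binary class probabilities follow from raw predictions:

```python
import numpy as np
from scipy.special import expit

def binary_predict_proba(raw_predictions):
    """Class probabilities from raw predictions of shape (1, n_samples)."""
    raw = raw_predictions.reshape(-1)
    proba = np.empty((raw.shape[0], 2))
    proba[:, 1] = expit(raw)          # P(y = 1) = sigmoid(raw_prediction)
    proba[:, 0] = 1.0 - proba[:, 1]
    return proba

proba = binary_predict_proba(np.array([[-1.0, 0.0, 2.5]]))
assert np.allclose(proba.sum(axis=1), 1.0)
```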
- """ - - def __init__(self, sample_weight, n_threads=None): - super().__init__(hessians_are_constant=False, n_threads=n_threads) - - def pointwise_loss(self, y_true, raw_predictions): - one_hot_true = np.zeros_like(raw_predictions) - prediction_dim = raw_predictions.shape[0] - for k in range(prediction_dim): - one_hot_true[k, :] = y_true == k - - loss = logsumexp(raw_predictions, axis=0) - ( - one_hot_true * raw_predictions - ).sum(axis=0) - return loss - - def get_baseline_prediction(self, y_train, sample_weight, prediction_dim): - init_value = np.zeros(shape=(prediction_dim, 1), dtype=Y_DTYPE) - eps = np.finfo(y_train.dtype).eps - for k in range(prediction_dim): - proba_kth_class = np.average(y_train == k, weights=sample_weight) - proba_kth_class = np.clip(proba_kth_class, eps, 1 - eps) - init_value[k, :] += np.log(proba_kth_class) - - return init_value - - def update_gradients_and_hessians( - self, gradients, hessians, y_true, raw_predictions, sample_weight - ): - _update_gradients_hessians_categorical_crossentropy( - gradients, hessians, y_true, raw_predictions, sample_weight, self.n_threads - ) - - def predict_proba(self, raw_predictions): - # TODO: This could be done in parallel - # compute softmax (using exp(log(softmax))) - proba = np.exp( - raw_predictions - logsumexp(raw_predictions, axis=0)[np.newaxis, :] - ) - return proba.T - - -_LOSSES = { - "squared_error": LeastSquares, - "absolute_error": LeastAbsoluteDeviation, - "binary_crossentropy": BinaryCrossEntropy, - "categorical_crossentropy": CategoricalCrossEntropy, - "poisson": Poisson, -} diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 79581525b50bb..1e72a1e3f9903 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -1,6 +1,13 @@ import numpy as np import pytest from numpy.testing import assert_allclose, assert_array_equal +from sklearn._loss.loss import ( + AbsoluteError, + HalfBinomialLoss, + HalfMultinomialLoss, + HalfPoissonLoss, + HalfSquaredError, +) from sklearn.datasets import make_classification, make_regression from sklearn.datasets import make_low_rank_matrix from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, OneHotEncoder @@ -15,16 +22,25 @@ from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.ensemble import HistGradientBoostingClassifier -from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES -from sklearn.ensemble._hist_gradient_boosting.loss import LeastSquares -from sklearn.ensemble._hist_gradient_boosting.loss import BinaryCrossEntropy from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import ( + _init_gradients_and_hessians, +) from sklearn.utils import shuffle from sklearn.utils._openmp_helpers import _openmp_effective_n_threads + n_threads = _openmp_effective_n_threads() +_LOSSES = { + "squared_error": HalfSquaredError, + "absolute_error": AbsoluteError, + "poisson": HalfPoissonLoss, + "binary_crossentropy": HalfBinomialLoss, + "categorical_crossentropy": HalfMultinomialLoss, +} + X_classification, y_classification = make_classification(random_state=0) X_regression, y_regression = make_regression(random_state=0) @@ -681,6 +697,37 @@ def test_sample_weight_effect(problem, 
duplication): assert np.allclose(est_sw._raw_predict(X_dup), est_dup._raw_predict(X_dup)) +@pytest.mark.parametrize("lossclass", _LOSSES.values()) +def test_init_gradient_and_hessians(lossclass): + """Test that _init_gradients_and_hessians works as expected. + + passing sample_weight to a loss correctly influences the + hessians_are_constant attribute, and consequently the shape of the + hessians array. + """ + prediction_dim = 2 + n_samples = 5 + loss = lossclass(sample_weight=None) + _, hessians = _init_gradients_and_hessians( + constant_hessian=loss.constant_hessian, + n_samples=n_samples, + prediction_dim=prediction_dim, + ) + if loss.constant_hessian: + assert hessians.shape == (1, 1) + else: + assert hessians.shape == (prediction_dim, n_samples) + + loss = lossclass(sample_weight=np.ones(n_samples)) + _, hessians = _init_gradients_and_hessians( + constant_hessian=loss.constant_hessian, + n_samples=n_samples, + prediction_dim=prediction_dim, + ) + assert not loss.constant_hessian + assert hessians.shape == (prediction_dim, n_samples) + + @pytest.mark.parametrize("loss_name", ("squared_error", "absolute_error")) def test_sum_hessians_are_sample_weight(loss_name): # For losses with constant hessians, the sum_hessians field of the @@ -696,13 +743,20 @@ def test_sum_hessians_are_sample_weight(loss_name): sample_weight = rng.normal(size=n_samples) - loss = _LOSSES[loss_name](sample_weight=sample_weight, n_threads=n_threads) - gradients, hessians = loss.init_gradients_and_hessians( - n_samples=n_samples, prediction_dim=1, sample_weight=sample_weight + loss = _LOSSES[loss_name](sample_weight=sample_weight) + gradients, hessians = _init_gradients_and_hessians( + constant_hessian=loss.constant_hessian, + n_samples=n_samples, + prediction_dim=1, ) raw_predictions = rng.normal(size=(1, n_samples)) - loss.update_gradients_and_hessians( - gradients, hessians, y, raw_predictions, sample_weight + loss.gradient_hessian( + y_true=y, + raw_prediction=raw_predictions.T, + sample_weight=sample_weight, + gradient_out=gradients.T, + hessian_out=hessians.T, + n_threads=n_threads, ) # build sum_sample_weight which contains the sum of the sample weights at @@ -789,13 +843,13 @@ def test_single_node_trees(Est): [ ( HistGradientBoostingClassifier, - BinaryCrossEntropy(sample_weight=None), + HalfBinomialLoss(sample_weight=None), X_classification, y_classification, ), ( HistGradientBoostingRegressor, - LeastSquares(sample_weight=None), + HalfSquaredError(sample_weight=None), X_regression, y_regression, ), diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py deleted file mode 100644 index 813163802f956..0000000000000 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_loss.py +++ /dev/null @@ -1,348 +0,0 @@ -import numpy as np -from numpy.testing import assert_almost_equal -from numpy.testing import assert_allclose -from scipy.optimize import newton -from scipy.special import logit -from sklearn.utils import assert_all_finite -from sklearn.utils.fixes import sp_version, parse_version -import pytest - -from sklearn.ensemble._hist_gradient_boosting.loss import _LOSSES -from sklearn.ensemble._hist_gradient_boosting.common import Y_DTYPE -from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE -from sklearn.utils._testing import skip_if_32bit -from sklearn.utils._openmp_helpers import _openmp_effective_n_threads - -n_threads = _openmp_effective_n_threads() - - -def get_derivatives_helper(loss): - """Return 
get_gradients() and get_hessians() functions for a given loss.""" - - def get_gradients(y_true, raw_predictions): - # create gradients and hessians array, update inplace, and return - gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE) - hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE) - loss.update_gradients_and_hessians( - gradients, hessians, y_true, raw_predictions, None - ) - return gradients - - def get_hessians(y_true, raw_predictions): - # create gradients and hessians array, update inplace, and return - gradients = np.empty_like(raw_predictions, dtype=G_H_DTYPE) - hessians = np.empty_like(raw_predictions, dtype=G_H_DTYPE) - loss.update_gradients_and_hessians( - gradients, hessians, y_true, raw_predictions, None - ) - - if loss.__class__.__name__ == "LeastSquares": - # hessians aren't updated because they're constant: - # the value is 1 (and not 2) because the loss is actually an half - # least squares loss. - hessians = np.full_like(raw_predictions, fill_value=1) - elif loss.__class__.__name__ == "LeastAbsoluteDeviation": - # hessians aren't updated because they're constant - hessians = np.full_like(raw_predictions, fill_value=0) - - return hessians - - return get_gradients, get_hessians - - -@pytest.mark.parametrize( - "loss, x0, y_true", - [ - ("squared_error", -2.0, 42), - ("squared_error", 117.0, 1.05), - ("squared_error", 0.0, 0.0), - # The argmin of binary_crossentropy for y_true=0 and y_true=1 is resp. -inf - # and +inf due to logit, cf. "complete separation". Therefore, we use - # 0 < y_true < 1. - ("binary_crossentropy", 0.3, 0.1), - ("binary_crossentropy", -12, 0.2), - ("binary_crossentropy", 30, 0.9), - ("poisson", 12.0, 1.0), - ("poisson", 0.0, 2.0), - ("poisson", -22.0, 10.0), - ], -) -@pytest.mark.skipif( - sp_version == parse_version("1.2.0"), - reason="bug in scipy 1.2.0, see scipy issue #9608", -) -@skip_if_32bit -def test_derivatives(loss, x0, y_true): - # Check that gradients are zero when the loss is minimized on a single - # value/sample using Halley's method with the first and second order - # derivatives computed by the Loss instance. - # Note that methods of Loss instances operate on arrays while the newton - # root finder expects a scalar or a one-element array for this purpose. - - loss = _LOSSES[loss](sample_weight=None) - y_true = np.array([y_true], dtype=Y_DTYPE) - x0 = np.array([x0], dtype=Y_DTYPE).reshape(1, 1) - get_gradients, get_hessians = get_derivatives_helper(loss) - - def func(x: np.ndarray) -> np.ndarray: - if isinstance(loss, _LOSSES["binary_crossentropy"]): - # Subtract a constant term such that the binary cross entropy - # has its minimum at zero, which is needed for the newton method. 
- actual_min = loss.pointwise_loss(y_true, logit(y_true)) - return loss.pointwise_loss(y_true, x) - actual_min - else: - return loss.pointwise_loss(y_true, x) - - def fprime(x: np.ndarray) -> np.ndarray: - return get_gradients(y_true, x) - - def fprime2(x: np.ndarray) -> np.ndarray: - return get_hessians(y_true, x) - - optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2, maxiter=70, tol=2e-8) - - # Need to ravel arrays because assert_allclose requires matching dimensions - y_true = y_true.ravel() - optimum = optimum.ravel() - assert_allclose(loss.inverse_link_function(optimum), y_true) - assert_allclose(func(optimum), 0, atol=1e-14) - assert_allclose(get_gradients(y_true, optimum), 0, atol=1e-6) - - -@pytest.mark.parametrize( - "loss, n_classes, prediction_dim", - [ - ("squared_error", 0, 1), - ("absolute_error", 0, 1), - ("binary_crossentropy", 2, 1), - ("categorical_crossentropy", 3, 3), - ("poisson", 0, 1), - ], -) -@pytest.mark.skipif( - Y_DTYPE != np.float64, reason="Need 64 bits float precision for numerical checks" -) -def test_numerical_gradients(loss, n_classes, prediction_dim, seed=0): - # Make sure gradients and hessians computed in the loss are correct, by - # comparing with their approximations computed with finite central - # differences. - # See https://en.wikipedia.org/wiki/Finite_difference. - - rng = np.random.RandomState(seed) - n_samples = 100 - if loss in ("squared_error", "absolute_error"): - y_true = rng.normal(size=n_samples).astype(Y_DTYPE) - elif loss in ("poisson"): - y_true = rng.poisson(size=n_samples).astype(Y_DTYPE) - else: - y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) - raw_predictions = rng.normal(size=(prediction_dim, n_samples)).astype(Y_DTYPE) - loss = _LOSSES[loss](sample_weight=None, n_threads=n_threads) - get_gradients, get_hessians = get_derivatives_helper(loss) - - # only take gradients and hessians of first tree / class. 
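The removed `test_numerical_gradients` (continued below) validates the analytic derivatives against central finite differences; the same idea as a self-contained sketch for the half squared error:

```python
import numpy as np

def numerical_derivatives(pointwise_loss, y_true, raw, eps=1e-4):
    """Central finite differences for the first and second derivative in raw."""
    gradient = (
        pointwise_loss(y_true, raw + eps / 2) - pointwise_loss(y_true, raw - eps / 2)
    ) / eps
    # eps must be large enough because we divide by its square
    hessian = (
        pointwise_loss(y_true, raw + eps)
        + pointwise_loss(y_true, raw - eps)
        - 2 * pointwise_loss(y_true, raw)
    ) / eps**2
    return gradient, hessian

def half_squared_error(y_true, raw):
    return 0.5 * (y_true - raw) ** 2

g, h = numerical_derivatives(half_squared_error, y_true=1.3, raw=0.2)
assert np.isclose(g, 0.2 - 1.3, atol=1e-6)   # gradient = raw - y_true
assert np.isclose(h, 1.0, atol=1e-3)         # unit hessian for the half loss
```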
- gradients = get_gradients(y_true, raw_predictions)[0, :].ravel() - hessians = get_hessians(y_true, raw_predictions)[0, :].ravel() - - # Approximate gradients - # For multiclass loss, we should only change the predictions of one tree - # (here the first), hence the use of offset[0, :] += eps - # As a softmax is computed, offsetting the whole array by a constant would - # have no effect on the probabilities, and thus on the loss - eps = 1e-9 - offset = np.zeros_like(raw_predictions) - offset[0, :] = eps - f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset / 2) - f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset / 2) - numerical_gradients = (f_plus_eps - f_minus_eps) / eps - - # Approximate hessians - eps = 1e-4 # need big enough eps as we divide by its square - offset[0, :] = eps - f_plus_eps = loss.pointwise_loss(y_true, raw_predictions + offset) - f_minus_eps = loss.pointwise_loss(y_true, raw_predictions - offset) - f = loss.pointwise_loss(y_true, raw_predictions) - numerical_hessians = (f_plus_eps + f_minus_eps - 2 * f) / eps ** 2 - - assert_allclose(numerical_gradients, gradients, rtol=1e-4, atol=1e-7) - assert_allclose(numerical_hessians, hessians, rtol=1e-4, atol=1e-7) - - -def test_baseline_least_squares(): - rng = np.random.RandomState(0) - - loss = _LOSSES["squared_error"](sample_weight=None) - y_train = rng.normal(size=100) - baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) - assert baseline_prediction.shape == tuple() # scalar - assert baseline_prediction.dtype == y_train.dtype - # Make sure baseline prediction is the mean of all targets - assert_almost_equal(baseline_prediction, y_train.mean()) - assert np.allclose( - loss.inverse_link_function(baseline_prediction), baseline_prediction - ) - - -def test_baseline_absolute_error(): - rng = np.random.RandomState(0) - - loss = _LOSSES["absolute_error"](sample_weight=None) - y_train = rng.normal(size=100) - baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) - assert baseline_prediction.shape == tuple() # scalar - assert baseline_prediction.dtype == y_train.dtype - # Make sure baseline prediction is the median of all targets - assert np.allclose( - loss.inverse_link_function(baseline_prediction), baseline_prediction - ) - assert baseline_prediction == pytest.approx(np.median(y_train)) - - -def test_baseline_poisson(): - rng = np.random.RandomState(0) - - loss = _LOSSES["poisson"](sample_weight=None) - y_train = rng.poisson(size=100).astype(np.float64) - # Sanity check, make sure at least one sample is non-zero so we don't take - # log(0) - assert y_train.sum() > 0 - baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) - assert np.isscalar(baseline_prediction) - assert baseline_prediction.dtype == y_train.dtype - assert_all_finite(baseline_prediction) - # Make sure baseline prediction produces the log of the mean of all targets - assert_almost_equal(np.log(y_train.mean()), baseline_prediction) - - # Test baseline for y_true = 0 - y_train.fill(0.0) - baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) - assert_all_finite(baseline_prediction) - - -def test_baseline_binary_crossentropy(): - rng = np.random.RandomState(0) - - loss = _LOSSES["binary_crossentropy"](sample_weight=None) - for y_train in (np.zeros(shape=100), np.ones(shape=100)): - y_train = y_train.astype(np.float64) - baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) - assert_all_finite(baseline_prediction) - assert 
np.allclose(loss.inverse_link_function(baseline_prediction), y_train[0]) - - # Make sure baseline prediction is equal to link_function(p), where p - # is the proba of the positive class. We want predict_proba() to return p, - # and by definition - # p = inverse_link_function(raw_prediction) = sigmoid(raw_prediction) - # So we want raw_prediction = link_function(p) = log(p / (1 - p)) - y_train = rng.randint(0, 2, size=100).astype(np.float64) - baseline_prediction = loss.get_baseline_prediction(y_train, None, 1) - assert baseline_prediction.shape == tuple() # scalar - assert baseline_prediction.dtype == y_train.dtype - p = y_train.mean() - assert np.allclose(baseline_prediction, np.log(p / (1 - p))) - - -def test_baseline_categorical_crossentropy(): - rng = np.random.RandomState(0) - - prediction_dim = 4 - loss = _LOSSES["categorical_crossentropy"](sample_weight=None) - for y_train in (np.zeros(shape=100), np.ones(shape=100)): - y_train = y_train.astype(np.float64) - baseline_prediction = loss.get_baseline_prediction( - y_train, None, prediction_dim - ) - assert baseline_prediction.dtype == y_train.dtype - assert_all_finite(baseline_prediction) - - # Same logic as for above test. Here inverse_link_function = softmax and - # link_function = log - y_train = rng.randint(0, prediction_dim + 1, size=100).astype(np.float32) - baseline_prediction = loss.get_baseline_prediction(y_train, None, prediction_dim) - assert baseline_prediction.shape == (prediction_dim, 1) - for k in range(prediction_dim): - p = (y_train == k).mean() - assert np.allclose(baseline_prediction[k, :], np.log(p)) - - -@pytest.mark.parametrize( - "loss, problem", - [ - ("squared_error", "regression"), - ("absolute_error", "regression"), - ("binary_crossentropy", "classification"), - ("categorical_crossentropy", "classification"), - ("poisson", "poisson_regression"), - ], -) -@pytest.mark.parametrize("sample_weight", ["ones", "random"]) -def test_sample_weight_multiplies_gradients(loss, problem, sample_weight): - # Make sure that passing sample weights to the gradient and hessians - # computation methods is equivalent to multiplying by the weights. 
- - rng = np.random.RandomState(42) - n_samples = 1000 - - if loss == "categorical_crossentropy": - n_classes = prediction_dim = 3 - else: - n_classes = prediction_dim = 1 - - if problem == "regression": - y_true = rng.normal(size=n_samples).astype(Y_DTYPE) - elif problem == "poisson_regression": - y_true = rng.poisson(size=n_samples).astype(Y_DTYPE) - else: - y_true = rng.randint(0, n_classes, size=n_samples).astype(Y_DTYPE) - - if sample_weight == "ones": - sample_weight = np.ones(shape=n_samples, dtype=Y_DTYPE) - else: - sample_weight = rng.normal(size=n_samples).astype(Y_DTYPE) - - loss_ = _LOSSES[loss](sample_weight=sample_weight, n_threads=n_threads) - - baseline_prediction = loss_.get_baseline_prediction(y_true, None, prediction_dim) - raw_predictions = np.zeros( - shape=(prediction_dim, n_samples), dtype=baseline_prediction.dtype - ) - raw_predictions += baseline_prediction - - gradients = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) - hessians = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) - loss_.update_gradients_and_hessians( - gradients, hessians, y_true, raw_predictions, None - ) - - gradients_sw = np.empty(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) - hessians_sw = np.ones(shape=(prediction_dim, n_samples), dtype=G_H_DTYPE) - loss_.update_gradients_and_hessians( - gradients_sw, hessians_sw, y_true, raw_predictions, sample_weight - ) - - assert np.allclose(gradients * sample_weight, gradients_sw) - assert np.allclose(hessians * sample_weight, hessians_sw) - - -def test_init_gradient_and_hessians_sample_weight(): - # Make sure that passing sample_weight to a loss correctly influences the - # hessians_are_constant attribute, and consequently the shape of the - # hessians array. - - prediction_dim = 2 - n_samples = 5 - sample_weight = None - loss = _LOSSES["squared_error"](sample_weight=sample_weight) - _, hessians = loss.init_gradients_and_hessians( - n_samples=n_samples, prediction_dim=prediction_dim, sample_weight=None - ) - assert loss.hessians_are_constant - assert hessians.shape == (1, 1) - - sample_weight = np.ones(n_samples) - loss = _LOSSES["squared_error"](sample_weight=sample_weight) - _, hessians = loss.init_gradients_and_hessians( - n_samples=n_samples, prediction_dim=prediction_dim, sample_weight=sample_weight - ) - assert not loss.hessians_are_constant - assert hessians.shape == (prediction_dim, n_samples) diff --git a/sklearn/ensemble/setup.py b/sklearn/ensemble/setup.py index 9f46a7e3cd303..a9594757dbeb2 100644 --- a/sklearn/ensemble/setup.py +++ b/sklearn/ensemble/setup.py @@ -44,12 +44,6 @@ def configuration(parent_package="", top_path=None): include_dirs=[numpy.get_include()], ) - config.add_extension( - "_hist_gradient_boosting._loss", - sources=["_hist_gradient_boosting/_loss.pyx"], - include_dirs=[numpy.get_include()], - ) - config.add_extension( "_hist_gradient_boosting._bitset", sources=["_hist_gradient_boosting/_bitset.pyx"], From 78b322f381850ac96fa4b06e576e574c43aeacbc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 22 Nov 2021 07:27:35 +0100 Subject: [PATCH 126/143] DOC add whatsnew --- doc/whats_new/v1.1.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 6a5b2d226cabe..3bde415d22f4b 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -148,6 +148,9 @@ Changelog and :class:`ensemble.RandomTreesEmbedding` now raise a ``ValueError`` when ``bootstrap=False`` and ``max_samples`` is not ``None``. :pr:`21295` :user:`Haoyin Xu `. 
+- |Enhancement| :class:`~sklearn.ensemble.HistGradientBoostingClassifier` is + faster for multiclass problems thanks to a new private loss function module. + :pr:`20567` and :pr:`20811` by :user:`Christian Lorentzen `. :mod:`sklearn.impute` ..................... From c75ffa293e83d601406823932ff221ef37ea8c62 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Nov 2021 11:56:24 +0100 Subject: [PATCH 127/143] DOC better whatsnew --- doc/whats_new/v1.1.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 3bde415d22f4b..8de69950f61d8 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -142,15 +142,17 @@ Changelog :mod:`sklearn.ensemble` ....................... +- |Enhancement| :class:`~sklearn.ensemble.HistGradientBoostingClassifier` is + a bit faster, for binary as well as for multiclass problems thanks the new + private loss function module. + :pr:`20567` and :pr:`20811` by :user:`Christian Lorentzen `. + - |Fix| :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`, :class:`ensemble.ExtraTreesClassifier`, :class:`ensemble.ExtraTreesRegressor`, and :class:`ensemble.RandomTreesEmbedding` now raise a ``ValueError`` when ``bootstrap=False`` and ``max_samples`` is not ``None``. :pr:`21295` :user:`Haoyin Xu `. -- |Enhancement| :class:`~sklearn.ensemble.HistGradientBoostingClassifier` is - faster for multiclass problems thanks to a new private loss function module. - :pr:`20567` and :pr:`20811` by :user:`Christian Lorentzen `. :mod:`sklearn.impute` ..................... From 2425e12c9604832b2ce783f88e96be09749d5371 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Nov 2021 15:27:49 +0100 Subject: [PATCH 128/143] DOC typo --- doc/whats_new/v1.1.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 8de69950f61d8..ba0e93f84f2d0 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -142,9 +142,9 @@ Changelog :mod:`sklearn.ensemble` ....................... -- |Enhancement| :class:`~sklearn.ensemble.HistGradientBoostingClassifier` is - a bit faster, for binary as well as for multiclass problems thanks the new - private loss function module. +- |Enhancement| :class:`~sklearn.ensemble.HistGradientBoostingClassifier` is a bit + faster, for binary as well as for multiclass problems thanks to the new private loss + function module. :pr:`20567` and :pr:`20811` by :user:`Christian Lorentzen `. - |Fix| :class:`ensemble.RandomForestClassifier`, From 64328031502dfab24f9f82bc9c835825971c1222 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 1 Dec 2021 22:05:04 +0100 Subject: [PATCH 129/143] DOC update whatsnew --- doc/whats_new/v1.1.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index ba0e93f84f2d0..b196759a75c22 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -142,10 +142,11 @@ Changelog :mod:`sklearn.ensemble` ....................... -- |Enhancement| :class:`~sklearn.ensemble.HistGradientBoostingClassifier` is a bit - faster, for binary as well as for multiclass problems thanks to the new private loss +- |Enhancement| :class:`~sklearn.ensemble.HistGradientBoostingClassifier` is faster, + for binary and in particular for multiclass problems thanks to the new private loss function module. - :pr:`20567` and :pr:`20811` by :user:`Christian Lorentzen `. 
+ :pr:`20811`, :pr:`20567` and :pr:`21814` by + :user:`Christian Lorentzen `. - |Fix| :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`, From 11e3697da5f4353722e87b898fa13b9d617ae0ed Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 2 Dec 2021 12:04:27 +0100 Subject: [PATCH 130/143] ENH reshape _baseline_prediction only once --- .../_hist_gradient_boosting/gradient_boosting.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 5d23852334b4f..a5a9fc9096ea2 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -453,20 +453,21 @@ def fit(self, X, y, sample_weight=None): self._clear_state() # initialize raw_predictions: those are the accumulated values - # predicted by the trees for the training data. raw_predictions has + # predicted by the trees for the training data. raw_prediction has # shape (n_trees_per_iteration, n_samples) where # n_trees_per_iterations is n_classes in multiclass classification, # else 1. + # self._baseline_prediction has shape (n_trees_per_iteration, 1) self._baseline_prediction = np.atleast_1d( self._loss.fit_intercept_only( y_true=y_train, sample_weight=sample_weight_train ) - ).T + ).reshape((-1, 1)) raw_predictions = np.zeros( shape=(self.n_trees_per_iteration_, n_samples), dtype=self._baseline_prediction.dtype, ) - raw_predictions += self._baseline_prediction[:, None] + raw_predictions += self._baseline_prediction # predictors is a matrix (list of lists) of TreePredictor objects # with shape (n_iter_, n_trees_per_iteration) @@ -498,7 +499,7 @@ def fit(self, X, y, sample_weight=None): dtype=self._baseline_prediction.dtype, ) - raw_predictions_val += self._baseline_prediction[:, None] + raw_predictions_val += self._baseline_prediction self._check_early_stopping_loss( raw_predictions=raw_predictions, @@ -981,7 +982,7 @@ def _raw_predict(self, X, n_threads=None): shape=(self.n_trees_per_iteration_, n_samples), dtype=self._baseline_prediction.dtype, ) - raw_predictions += self._baseline_prediction[:, None] + raw_predictions += self._baseline_prediction # We intentionally decouple the number of threads used at prediction # time from the number of threads used at fit time because the model @@ -1047,7 +1048,7 @@ def _staged_raw_predict(self, X): shape=(self.n_trees_per_iteration_, n_samples), dtype=self._baseline_prediction.dtype, ) - raw_predictions += self._baseline_prediction[:, None] + raw_predictions += self._baseline_prediction # We intentionally decouple the number of threads used at prediction # time from the number of threads used at fit time because the model From 34dc4144f346fd832c919cda3c388a587563ef18 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 3 Dec 2021 17:24:52 +0100 Subject: [PATCH 131/143] ENH add init_gradient_and_hessian to loss class --- sklearn/_loss/loss.py | 56 +++++++++++++++++++ sklearn/_loss/tests/test_loss.py | 40 +++++++++++++ .../gradient_boosting.py | 30 ++++++---- 3 files changed, 114 insertions(+), 12 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index a394bd9de06c3..2604497344631 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -461,6 +461,62 @@ def constant_to_optimal_zero(self, y_true, sample_weight=None): """ return np.zeros_like(y_true) + def init_gradient_and_hessian(self, 
n_samples, dtype=np.float64, order="F"): + """Initialize arrays for gradients and hessians. + + Unless hessians are constant, arrays are initialized with undefined values. + + Parameters + ---------- + n_samples : int + The number of samples, usually passed to `fit()`. + dtype : {np.float64, np.float32}, default=np.float64 + The dtype of the arrays gradient and hessian. + order : {'C', 'F'}, default='F' + Order of the arrays gradient and hessian. The default 'F' makes the arrays + contiguous along samples. + + Returns + ------- + gradient : C-contiguous array of shape (n_samples,) or array of shape \ + (n_samples, n_classes) + Empty array (allocated but not initialized) to be used as argument + gradient_out. + hessian : C-contiguous array of shape (n_samples,), array of shape + (n_samples, n_classes) or shape (1,) + Empty (allocated but not initialized) array to be used as argument + hessian_out. + If constant_hessian is True (e.g. `HalfSquaredError`), the array is + initialized to ``1``. + """ + supported_dtype = (np.float64, np.float32) + if dtype not in supported_dtype: + raise ValueError( + f"Valid options for 'dtype' are {supported_dtype}. " + f"Got dtype={dtype} instead." + ) + if order not in ("C", "F"): + raise ValueError( + f"Valid options for 'order' are {order}. Got dtype={dtype} instead." + ) + + if self.is_multiclass: + shape = (n_samples, self.n_classes) + else: + shape = (n_samples,) + gradient = np.empty(shape=shape, dtype=dtype, order=order) + + if self.constant_hessian: + # If the hessians are constant, we consider them equal to 1. + # - This is correct for HalfSquaredError + # - For AbsoluteError, hessians are actually 0, but they are + # always ignored anyway. + hessian = np.ones(shape=(1,), dtype=dtype) + else: + hessian = np.empty(shape=shape, dtype=dtype, order=order) + + return gradient, hessian + # Note: Naturally, we would inherit in the following order # class HalfSquaredError(IdentityLink, CyHalfSquaredError, BaseLoss) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 2ad5633037c4a..94fae460c772e 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -991,6 +991,46 @@ def test_predict_proba(loss): ) +@pytest.mark.parametrize("loss", ALL_LOSSES, ids=loss_instance_name) +@pytest.mark.parametrize("sample_weight", [None, "range"]) +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +@pytest.mark.parametrize("order", ("C", "F")) +def test_init_gradient_and_hessians(loss, sample_weight, dtype, order): + """Test that init_gradient_and_hessian works as expected. + + passing sample_weight to a loss correctly influences the constant_hessian + attribute, and consequently the shape of the hessian array. 
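A usage sketch of the `init_gradient_and_hessian` method added above; `sklearn._loss` is a private module, so the exact API may still change:

```python
import numpy as np
from sklearn._loss.loss import HalfMultinomialLoss, HalfSquaredError

# Constant-hessian loss: the hessian placeholder has shape (1,) and value 1.
loss = HalfSquaredError(sample_weight=None)
gradient, hessian = loss.init_gradient_and_hessian(n_samples=10, dtype=np.float32)
assert gradient.shape == (10,)
assert hessian.shape == (1,) and hessian[0] == 1.0
assert gradient.dtype == np.float32

# Multiclass loss: both arrays have shape (n_samples, n_classes).
loss = HalfMultinomialLoss(sample_weight=None, n_classes=3)
gradient, hessian = loss.init_gradient_and_hessian(n_samples=10, order="F")
assert gradient.shape == (10, 3)
assert hessian.shape == (10, 3)
```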
+ """ + n_samples = 5 + if sample_weight == "range": + sample_weight = np.ones(n_samples) + loss = loss(sample_weight=sample_weight) + gradient, hessian = loss.init_gradient_and_hessian( + n_samples=n_samples, + dtype=dtype, + order=order, + ) + if loss.constant_hessian: + assert gradient.shape == (n_samples,) + assert hessian.shape == (1,) + elif loss.is_multiclass: + assert gradient.shape == (n_samples, loss.n_classes) + assert hessian.shape == (n_samples, loss.n_classes) + else: + assert hessian.shape == (n_samples,) + assert hessian.shape == (n_samples,) + + assert gradient.dtype == dtype + assert hessian.dtype == dtype + + if order == "C": + assert gradient.flags.c_contiguous + assert hessian.flags.c_contiguous + else: + assert gradient.flags.f_contiguous + assert hessian.flags.f_contiguous + + @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) def test_loss_pickle(loss): """Test that losses can be pickled.""" diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index a5a9fc9096ea2..ac50e3050e1d5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -577,11 +577,9 @@ def fit(self, X, y, sample_weight=None): begin_at_stage = self.n_iter_ # initialize gradients and hessians (empty arrays). - # shape = (n_trees_per_iteration, n_samples). - gradients, hessians = _init_gradients_and_hessians( - constant_hessian=self._loss.constant_hessian, - n_samples=n_samples, - prediction_dim=self.n_trees_per_iteration_, + # shape = (n_samples, n_trees_per_iteration). + gradient, hessian = self._loss.init_gradient_and_hessian( + n_samples=n_samples, dtype=G_H_DTYPE, order="F" ) for iteration in range(begin_at_stage, self.max_iter): @@ -595,13 +593,12 @@ def fit(self, X, y, sample_weight=None): # Update gradients and hessians, inplace # Note that self._loss expects shape (n_samples,) for # n_trees_per_iteration = 1 else shape (n_samples, n_trees_per_iteration). - # T (transpose) returns a view. if self._loss.constant_hessian: self._loss.gradient( y_true=y_train, raw_prediction=raw_predictions.T, sample_weight=sample_weight_train, - gradient_out=gradients.T, + gradient_out=gradient, n_threads=n_threads, ) else: @@ -609,20 +606,29 @@ def fit(self, X, y, sample_weight=None): y_true=y_train, raw_prediction=raw_predictions.T, sample_weight=sample_weight_train, - gradient_out=gradients.T, - hessian_out=hessians.T, + gradient_out=gradient, + hessian_out=hessian, n_threads=n_threads, ) # Append a list since there may be more than 1 predictor per iter predictors.append([]) + # 2-d views of shape (n_samples, n_trees_per_iteration_) or (n_samples, 1) + # on gradient and hessian to simplify the loop over n_trees_per_iteration_. + if gradient.ndim == 1: + g_view = gradient.reshape((-1, 1)) + h_view = hessian.reshape((-1, 1)) + else: + g_view = gradient + h_view = hessian + # Build `n_trees_per_iteration` trees. 
for k in range(self.n_trees_per_iteration_): grower = TreeGrower( - X_binned_train, - gradients[k, :], - hessians[k, :], + X_binned=X_binned_train, + gradients=g_view[:, k], + hessians=h_view[:, k], n_bins=n_bins, n_bins_non_missing=self._bin_mapper.n_bins_non_missing_, has_missing_values=has_missing_values, From 409a99f6ad01787524f7888456d37eea7995b880 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 3 Dec 2021 17:27:56 +0100 Subject: [PATCH 132/143] MNT remove _init_gradients_and_hessians --- .../gradient_boosting.py | 41 ------------------ .../tests/test_gradient_boosting.py | 42 ++----------------- 2 files changed, 3 insertions(+), 80 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index ac50e3050e1d5..104833764aa00 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -48,47 +48,6 @@ ) -def _init_gradients_and_hessians(constant_hessian, n_samples, prediction_dim): - """Return initial gradients and hessians. - - Unless hessians are constant, arrays are initialized with undefined values. - - Parameters - ---------- - constant_hessian : bool - Usual input is loss.constant_hessian. - n_samples : int - The number of samples passed to `fit()`. - prediction_dim : int - The dimension of a raw prediction, i.e. the number of trees - built at each iteration. Equals 1 for regression and binary - classification, or K where K is the number of classes for - multiclass classification. - - Returns - ------- - gradients : ndarray, shape (prediction_dim, n_samples) - The initial gradients. The array is not initialized. - hessians : ndarray, shape (prediction_dim, n_samples) - If hessians are constant (e.g. for `LeastSquares` loss, the - array is initialized to ``1``. Otherwise, the array is allocated - without being initialized. - """ - shape = (prediction_dim, n_samples) - gradients = np.empty(shape=shape, dtype=G_H_DTYPE) - - if constant_hessian: - # If the hessians are constant, we consider they are equal to 1. - # - This is correct for the half LS loss - # - For the Absolute Error, hessians are actually 0, but they are - # always ignored anyway. - hessians = np.ones(shape=(1, 1), dtype=G_H_DTYPE) - else: - hessians = np.empty(shape=shape, dtype=G_H_DTYPE) - - return gradients, hessians - - def _update_leaves_values(loss, grower, y_true, raw_prediction, sample_weight): """Update the leaf values to be predicted by the tree. 
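Editor's note: for reference, a minimal sketch of the allocation pattern that replaces the removed helper, assuming the private `HalfSquaredError` loss defined in `sklearn/_loss/loss.py` by this series (the sample count is arbitrary):

    import numpy as np
    from sklearn._loss.loss import HalfSquaredError  # private API of this series

    loss = HalfSquaredError()  # constant_hessian is True without sample_weight
    gradient, hessian = loss.init_gradient_and_hessian(
        n_samples=5, dtype=np.float32, order="F"
    )
    assert gradient.shape == (5,)                        # allocated, not initialized
    assert hessian.shape == (1,) and hessian[0] == 1.0   # constant hessians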
diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 1e72a1e3f9903..876c65310f966 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -24,9 +24,6 @@ from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.grower import TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.ensemble._hist_gradient_boosting.gradient_boosting import ( - _init_gradients_and_hessians, -) from sklearn.utils import shuffle from sklearn.utils._openmp_helpers import _openmp_effective_n_threads @@ -697,37 +694,6 @@ def test_sample_weight_effect(problem, duplication): assert np.allclose(est_sw._raw_predict(X_dup), est_dup._raw_predict(X_dup)) -@pytest.mark.parametrize("lossclass", _LOSSES.values()) -def test_init_gradient_and_hessians(lossclass): - """Test that _init_gradients_and_hessians works as expected. - - passing sample_weight to a loss correctly influences the - hessians_are_constant attribute, and consequently the shape of the - hessians array. - """ - prediction_dim = 2 - n_samples = 5 - loss = lossclass(sample_weight=None) - _, hessians = _init_gradients_and_hessians( - constant_hessian=loss.constant_hessian, - n_samples=n_samples, - prediction_dim=prediction_dim, - ) - if loss.constant_hessian: - assert hessians.shape == (1, 1) - else: - assert hessians.shape == (prediction_dim, n_samples) - - loss = lossclass(sample_weight=np.ones(n_samples)) - _, hessians = _init_gradients_and_hessians( - constant_hessian=loss.constant_hessian, - n_samples=n_samples, - prediction_dim=prediction_dim, - ) - assert not loss.constant_hessian - assert hessians.shape == (prediction_dim, n_samples) - - @pytest.mark.parametrize("loss_name", ("squared_error", "absolute_error")) def test_sum_hessians_are_sample_weight(loss_name): # For losses with constant hessians, the sum_hessians field of the @@ -744,18 +710,16 @@ def test_sum_hessians_are_sample_weight(loss_name): sample_weight = rng.normal(size=n_samples) loss = _LOSSES[loss_name](sample_weight=sample_weight) - gradients, hessians = _init_gradients_and_hessians( - constant_hessian=loss.constant_hessian, + gradients, hessians = loss.init_gradient_and_hessian( n_samples=n_samples, - prediction_dim=1, ) raw_predictions = rng.normal(size=(1, n_samples)) loss.gradient_hessian( y_true=y, raw_prediction=raw_predictions.T, sample_weight=sample_weight, - gradient_out=gradients.T, - hessian_out=hessians.T, + gradient_out=gradients, + hessian_out=hessians, n_threads=n_threads, ) From cddecb234aa14dada9e162a12ab46032985440f1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 4 Dec 2021 14:10:09 +0100 Subject: [PATCH 133/143] FIX test_sum_hessians_are_sample_weight --- .../tests/test_gradient_boosting.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 876c65310f966..da730c3c02e9f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -24,6 +24,7 @@ from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.ensemble._hist_gradient_boosting.grower import 
TreeGrower from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.common import G_H_DTYPE from sklearn.utils import shuffle from sklearn.utils._openmp_helpers import _openmp_effective_n_threads @@ -707,12 +708,14 @@ def test_sum_hessians_are_sample_weight(loss_name): bin_mapper = _BinMapper() X_binned = bin_mapper.fit_transform(X) + # While sample weights are supposed to be positive, this still works. sample_weight = rng.normal(size=n_samples) loss = _LOSSES[loss_name](sample_weight=sample_weight) gradients, hessians = loss.init_gradient_and_hessian( - n_samples=n_samples, + n_samples=n_samples, dtype=G_H_DTYPE ) + gradients, hessians = gradients.reshape((-1, 1)), hessians.reshape((-1, 1)) raw_predictions = rng.normal(size=(1, n_samples)) loss.gradient_hessian( y_true=y, @@ -734,7 +737,9 @@ def test_sum_hessians_are_sample_weight(loss_name): ] # Build histogram - grower = TreeGrower(X_binned, gradients[0], hessians[0], n_bins=bin_mapper.n_bins) + grower = TreeGrower( + X_binned, gradients[:, 0], hessians[:, 0], n_bins=bin_mapper.n_bins + ) histograms = grower.histogram_builder.compute_histograms_brute( grower.root.sample_indices ) From da016e6dc76d50dd466e30d62150361d2e687fdd Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 5 Dec 2021 10:30:41 +0100 Subject: [PATCH 134/143] TST that init_gradient_and_hessian raises exceptions --- sklearn/_loss/loss.py | 9 ++------- sklearn/_loss/tests/test_loss.py | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 2604497344631..d95bb7dc23376 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -489,16 +489,11 @@ def init_gradient_and_hessian(self, n_samples, dtype=np.float64, order="F"): If constant_hessian is True (e.g. `HalfSquaredError`), the array is initialized to ``1``. """ - supported_dtype = (np.float64, np.float32) - if dtype not in supported_dtype: + if dtype not in (np.float32, np.float64): raise ValueError( - f"Valid options for 'dtype' are {supported_dtype}. " + "Valid options for 'dtype' are np.float32 and np.flaot64. " f"Got dtype={dtype} instead." ) - if order not in ("C", "F"): - raise ValueError( - f"Valid options for 'order' are {order}. Got dtype={dtype} instead." 
- ) if self.is_multiclass: shape = (n_samples, self.n_classes) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index 94fae460c772e..e7ca10d2468b9 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -991,7 +991,7 @@ def test_predict_proba(loss): ) -@pytest.mark.parametrize("loss", ALL_LOSSES, ids=loss_instance_name) +@pytest.mark.parametrize("loss", ALL_LOSSES) @pytest.mark.parametrize("sample_weight", [None, "range"]) @pytest.mark.parametrize("dtype", (np.float32, np.float64)) @pytest.mark.parametrize("order", ("C", "F")) @@ -1031,6 +1031,24 @@ def test_init_gradient_and_hessians(loss, sample_weight, dtype, order): assert hessian.flags.f_contiguous +@pytest.mark.parametrize("loss", ALL_LOSSES) +@pytest.mark.parametrize( + "params, err_msg", + [ + ( + {"dtype": np.int64}, + f"Valid options for 'dtype' are .* Got dtype={np.int64} instead.", + ), + ({"order": "nonsense"}, "order must be one of 'C', 'F'"), + ], +) +def test_init_gradient_and_hessian_raises(loss, params, err_msg): + """Test that init_gradient_and_hessian raises errors for invalid input.""" + loss = loss() + with pytest.raises(ValueError, match=err_msg): + gradient, hessian = loss.init_gradient_and_hessian(n_samples=5, **params) + + @pytest.mark.parametrize("loss", LOSS_INSTANCES, ids=loss_instance_name) def test_loss_pickle(loss): """Test that losses can be pickled.""" From 9e7d52776a841a60eb98da1f2294d124b2c6f59a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 5 Dec 2021 11:39:41 +0100 Subject: [PATCH 135/143] CLN remove tested error message for order --- sklearn/_loss/tests/test_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index e7ca10d2468b9..a1b7032ec324e 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -1039,7 +1039,7 @@ def test_init_gradient_and_hessians(loss, sample_weight, dtype, order): {"dtype": np.int64}, f"Valid options for 'dtype' are .* Got dtype={np.int64} instead.", ), - ({"order": "nonsense"}, "order must be one of 'C', 'F'"), + ({"order": "nonsense"}, ""), ], ) def test_init_gradient_and_hessian_raises(loss, params, err_msg): From 357423f59f3b661c1006a55d70e66a185bd01134 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 5 Dec 2021 11:42:56 +0100 Subject: [PATCH 136/143] CLN add TypeError to pytest.raises --- sklearn/_loss/tests/test_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index a1b7032ec324e..ecd55beeaba61 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -1045,7 +1045,7 @@ def test_init_gradient_and_hessians(loss, sample_weight, dtype, order): def test_init_gradient_and_hessian_raises(loss, params, err_msg): """Test that init_gradient_and_hessian raises errors for invalid input.""" loss = loss() - with pytest.raises(ValueError, match=err_msg): + with pytest.raises((ValueError, TypeError), match=err_msg): gradient, hessian = loss.init_gradient_and_hessian(n_samples=5, **params) From 907cb8aa719943c52b78b18da5fea1694b8337ee Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 6 Dec 2021 19:14:21 +0100 Subject: [PATCH 137/143] CLN whatsnew --- doc/whats_new/v1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 4195ad88dddb8..2db5c0b248069 100644 --- 
a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -147,7 +147,7 @@ Changelog :mod:`sklearn.ensemble` ....................... -- |Enhancement| :class:`~sklearn.ensemble.HistGradientBoostingClassifier` is faster, +- |Enhancement| :class:`ensemble.HistGradientBoostingClassifier` is faster, for binary and in particular for multiclass problems thanks to the new private loss function module. :pr:`20811`, :pr:`20567` and :pr:`21814` by From 131988b67af95455384742b75c60f9facd9183c6 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 18 Dec 2021 16:35:05 +0100 Subject: [PATCH 138/143] TST do not test invalid order argument --- sklearn/_loss/tests/test_loss.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/_loss/tests/test_loss.py b/sklearn/_loss/tests/test_loss.py index ecd55beeaba61..5426ed296f01a 100644 --- a/sklearn/_loss/tests/test_loss.py +++ b/sklearn/_loss/tests/test_loss.py @@ -1039,7 +1039,6 @@ def test_init_gradient_and_hessians(loss, sample_weight, dtype, order): {"dtype": np.int64}, f"Valid options for 'dtype' are .* Got dtype={np.int64} instead.", ), - ({"order": "nonsense"}, ""), ], ) def test_init_gradient_and_hessian_raises(loss, params, err_msg): From 1985c947a97aff7130965c977796347be11399d0 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 18 Dec 2021 17:06:38 +0100 Subject: [PATCH 139/143] CLN use numpy scalar and mention it in docstring --- sklearn/_loss/loss.py | 2 +- .../ensemble/_hist_gradient_boosting/gradient_boosting.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index 087c7427a1864..c77d2f927c370 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -427,7 +427,7 @@ def fit_intercept_only(self, y_true, sample_weight=None): Returns ------- - raw_prediction : float or (n_classes,) + raw_prediction : numpy scalar or array of shape (n_classes,) Raw predictions of an intercept-only model. """ # As default, take weighted average of the target over the samples diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 104833764aa00..16157439f89a3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -417,10 +417,8 @@ def fit(self, X, y, sample_weight=None): # n_trees_per_iterations is n_classes in multiclass classification, # else 1. 
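# --- Editor's note: illustrative sketch, not part of the patch. --------------
# Rationale for dropping np.atleast_1d below: fit_intercept_only now documents
# its return as a numpy scalar (single output) or an ndarray (multiclass), and
# numpy scalars already support .reshape. A standalone check of that assumption:
import numpy as np
assert np.float64(0.5).reshape((-1, 1)).shape == (1, 1)   # scalar intercept
assert np.zeros(3).reshape((-1, 1)).shape == (3, 1)       # multiclass intercept
# ------------------------------------------------------------------------------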
# self._baseline_prediction has shape (n_trees_per_iteration, 1) - self._baseline_prediction = np.atleast_1d( - self._loss.fit_intercept_only( - y_true=y_train, sample_weight=sample_weight_train - ) + self._baseline_prediction = self._loss.fit_intercept_only( + y_true=y_train, sample_weight=sample_weight_train ).reshape((-1, 1)) raw_predictions = np.zeros( shape=(self.n_trees_per_iteration_, n_samples), From 27e818ffd37a04d70d682ff24294917179c1a446 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 18 Dec 2021 17:27:16 +0100 Subject: [PATCH 140/143] CLN always make raw_predictions.shape=(n_samples, n_trees_per_iteration) --- .../gradient_boosting.py | 50 ++++++++++--------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 16157439f89a3..788b9ad949e7e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -413,16 +413,17 @@ def fit(self, X, y, sample_weight=None): # initialize raw_predictions: those are the accumulated values # predicted by the trees for the training data. raw_prediction has - # shape (n_trees_per_iteration, n_samples) where + # shape (n_samples, n_trees_per_iteration) where # n_trees_per_iterations is n_classes in multiclass classification, # else 1. - # self._baseline_prediction has shape (n_trees_per_iteration, 1) + # self._baseline_prediction has shape (1, n_trees_per_iteration) self._baseline_prediction = self._loss.fit_intercept_only( y_true=y_train, sample_weight=sample_weight_train - ).reshape((-1, 1)) + ).reshape((1, -1)) raw_predictions = np.zeros( - shape=(self.n_trees_per_iteration_, n_samples), + shape=(n_samples, self.n_trees_per_iteration_), dtype=self._baseline_prediction.dtype, + order="F", ) raw_predictions += self._baseline_prediction @@ -452,8 +453,9 @@ def fit(self, X, y, sample_weight=None): if self._use_validation_data: raw_predictions_val = np.zeros( - shape=(self.n_trees_per_iteration_, X_binned_val.shape[0]), + shape=(X_binned_val.shape[0], self.n_trees_per_iteration_), dtype=self._baseline_prediction.dtype, + order="F", ) raw_predictions_val += self._baseline_prediction @@ -553,7 +555,7 @@ def fit(self, X, y, sample_weight=None): if self._loss.constant_hessian: self._loss.gradient( y_true=y_train, - raw_prediction=raw_predictions.T, + raw_prediction=raw_predictions, sample_weight=sample_weight_train, gradient_out=gradient, n_threads=n_threads, @@ -561,7 +563,7 @@ def fit(self, X, y, sample_weight=None): else: self._loss.gradient_hessian( y_true=y_train, - raw_prediction=raw_predictions.T, + raw_prediction=raw_predictions, sample_weight=sample_weight_train, gradient_out=gradient, hessian_out=hessian, @@ -609,7 +611,7 @@ def fit(self, X, y, sample_weight=None): loss=self._loss, grower=grower, y_true=y_train, - raw_prediction=raw_predictions[k, :], + raw_prediction=raw_predictions[:, k], sample_weight=sample_weight_train, ) @@ -621,7 +623,7 @@ def fit(self, X, y, sample_weight=None): # Update raw_predictions with the predictions of the newly # created tree. 
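# --- Editor's note: illustrative sketch, not part of the patch. --------------
# raw_predictions is now allocated with order="F", so raw_predictions[:, k]
# handed to _update_raw_predictions below is a contiguous view; updating it in
# place is reflected in the parent array. A standalone numpy check:
import numpy as np
raw = np.zeros((4, 2), order="F")
col = raw[:, 1]                       # basic slicing returns a view
assert col.flags.c_contiguous         # column of an F-ordered array
col += 1.0                            # in-place update ...
assert raw[:, 1].sum() == 4.0         # ... is visible in the parent array
# ------------------------------------------------------------------------------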
tic_pred = time() - _update_raw_predictions(raw_predictions[k, :], grower, n_threads) + _update_raw_predictions(raw_predictions[:, k], grower, n_threads) toc_pred = time() acc_prediction_time += toc_pred - tic_pred @@ -631,7 +633,7 @@ def fit(self, X, y, sample_weight=None): # Update raw_predictions_val with the newest tree(s) if self._use_validation_data: for k, pred in enumerate(self._predictors[-1]): - raw_predictions_val[k, :] += pred.predict_binned( + raw_predictions_val[:, k] += pred.predict_binned( X_binned_val, self._bin_mapper.missing_values_bin_idx_, n_threads, @@ -804,7 +806,7 @@ def _check_early_stopping_loss( self.train_score_.append( -self._loss( y_true=y_train, - raw_prediction=raw_predictions.T, + raw_prediction=raw_predictions, sample_weight=sample_weight_train, n_threads=n_threads, ) @@ -814,7 +816,7 @@ def _check_early_stopping_loss( self.validation_score_.append( -self._loss( y_true=y_val, - raw_prediction=raw_predictions_val.T, + raw_prediction=raw_predictions_val, sample_weight=sample_weight_val, n_threads=n_threads, ) @@ -928,7 +930,7 @@ def _raw_predict(self, X, n_threads=None): Returns ------- - raw_predictions : array, shape (n_trees_per_iteration, n_samples) + raw_predictions : array, shape (n_samples, n_trees_per_iteration) The raw predicted values. """ is_binned = getattr(self, "_in_fit", False) @@ -942,8 +944,9 @@ def _raw_predict(self, X, n_threads=None): ) n_samples = X.shape[0] raw_predictions = np.zeros( - shape=(self.n_trees_per_iteration_, n_samples), + shape=(n_samples, self.n_trees_per_iteration_), dtype=self._baseline_prediction.dtype, + order="F", ) raw_predictions += self._baseline_prediction @@ -979,7 +982,7 @@ def _predict_iterations(self, X, predictors, raw_predictions, is_binned, n_threa f_idx_map=f_idx_map, n_threads=n_threads, ) - raw_predictions[k, :] += predict(X) + raw_predictions[:, k] += predict(X) def _staged_raw_predict(self, X): """Compute raw predictions of ``X`` for each iteration. @@ -995,7 +998,7 @@ def _staged_raw_predict(self, X): Yields ------- raw_predictions : generator of ndarray of shape \ - (n_trees_per_iteration, n_samples) + (n_samples, n_trees_per_iteration) The raw predictions of the input samples. The order of the classes corresponds to that in the attribute :term:`classes_`. """ @@ -1008,8 +1011,9 @@ def _staged_raw_predict(self, X): ) n_samples = X.shape[0] raw_predictions = np.zeros( - shape=(self.n_trees_per_iteration_, n_samples), + shape=(n_samples, self.n_trees_per_iteration_), dtype=self._baseline_prediction.dtype, + order="F", ) raw_predictions += self._baseline_prediction @@ -1693,7 +1697,7 @@ def predict_proba(self, X): The class probabilities of the input samples. """ raw_predictions = self._raw_predict(X) - return self._loss.predict_proba(raw_predictions.T) + return self._loss.predict_proba(raw_predictions) def staged_predict_proba(self, X): """Predict class probabilities at each iteration. @@ -1713,7 +1717,7 @@ def staged_predict_proba(self, X): for each iteration. """ for raw_predictions in self._staged_raw_predict(X): - yield self._loss.predict_proba(raw_predictions.T) + yield self._loss.predict_proba(raw_predictions) def decision_function(self, X): """Compute the decision function of ``X``. @@ -1732,9 +1736,9 @@ def decision_function(self, X): classes in multiclass classification. 
""" decision = self._raw_predict(X) - if decision.shape[0] == 1: + if decision.shape[1] == 1: decision = decision.ravel() - return decision.T + return decision def staged_decision_function(self, X): """Compute decision function of ``X`` for each iteration. @@ -1756,9 +1760,9 @@ def staged_decision_function(self, X): classes corresponds to that in the attribute :term:`classes_`. """ for staged_decision in self._staged_raw_predict(X): - if staged_decision.shape[0] == 1: + if staged_decision.shape[1] == 1: staged_decision = staged_decision.ravel() - yield staged_decision.T + yield staged_decision def _encode_y(self, y): # encode classes into 0 ... n_classes - 1 and sets attributes classes_ From 4df17828e439ad09a7784e99cc3d8d956eb50fe0 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 20 Dec 2021 00:34:37 +0100 Subject: [PATCH 141/143] CLN raw_prediction.shape=(n_sample,1) in test --- .../_hist_gradient_boosting/tests/test_gradient_boosting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index da730c3c02e9f..b607cd865830b 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -716,10 +716,10 @@ def test_sum_hessians_are_sample_weight(loss_name): n_samples=n_samples, dtype=G_H_DTYPE ) gradients, hessians = gradients.reshape((-1, 1)), hessians.reshape((-1, 1)) - raw_predictions = rng.normal(size=(1, n_samples)) + raw_predictions = rng.normal(size=(n_samples, 1)) loss.gradient_hessian( y_true=y, - raw_prediction=raw_predictions.T, + raw_prediction=raw_predictions, sample_weight=sample_weight, gradient_out=gradients, hessian_out=hessians, From a0af877994c194293b5ab5f9e43e85b6cdf796ae Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 10 Jan 2022 17:08:19 +0100 Subject: [PATCH 142/143] CLN address review comments --- sklearn/_loss/loss.py | 2 +- .../_hist_gradient_boosting/gradient_boosting.py | 15 +++++++-------- .../tests/test_gradient_boosting.py | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/sklearn/_loss/loss.py b/sklearn/_loss/loss.py index c77d2f927c370..1a2353d18df3b 100644 --- a/sklearn/_loss/loss.py +++ b/sklearn/_loss/loss.py @@ -491,7 +491,7 @@ def init_gradient_and_hessian(self, n_samples, dtype=np.float64, order="F"): """ if dtype not in (np.float32, np.float64): raise ValueError( - "Valid options for 'dtype' are np.float32 and np.flaot64. " + "Valid options for 'dtype' are np.float32 and np.float64. " f"Got dtype={dtype} instead." ) diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 788b9ad949e7e..e7388f62568c8 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -52,14 +52,14 @@ def _update_leaves_values(loss, grower, y_true, raw_prediction, sample_weight): """Update the leaf values to be predicted by the tree. Update equals: - loss.fit_intercept_only(y_true - raw_predictions) + loss.fit_intercept_only(y_true - raw_prediction) This is only applied if loss.need_update_leaves_values is True. Note: It only works, if the loss is a function of the residual, as is the case for AbsoluteError and PinballLoss. 
Otherwise, one would need to get the minimum of loss(y_true, raw_prediction + x) in x. A few examples: - - AbsoluteError: median(y_true - raw_predictions). - - PinballLoss: quantile(y_true - raw_predictions). + - AbsoluteError: median(y_true - raw_prediction). + - PinballLoss: quantile(y_true - raw_prediction). See also notes about need_update_leaves_values in BaseLoss. """ # TODO: Ideally this should be computed in parallel over the leaves using something @@ -412,7 +412,7 @@ def fit(self, X, y, sample_weight=None): self._clear_state() # initialize raw_predictions: those are the accumulated values - # predicted by the trees for the training data. raw_prediction has + # predicted by the trees for the training data. raw_predictions has # shape (n_samples, n_trees_per_iteration) where # n_trees_per_iterations is n_classes in multiclass classification, # else 1. @@ -1355,7 +1355,6 @@ def predict(self, X): check_is_fitted(self) # Return inverse link of raw predictions after converting # shape (n_samples, 1) to (n_samples,) - # loss.link.inverse is the inverse link function return self._loss.link.inverse(self._raw_predict(X).ravel()) def staged_predict(self, X): @@ -1792,9 +1791,9 @@ def _get_loss(self, sample_weight): if self.loss == "categorical_crossentropy": if self.n_trees_per_iteration_ == 1: raise ValueError( - "'categorical_crossentropy' is not suitable for " + "loss='categorical_crossentropy' is not suitable for " "a binary classification problem. Please use " - "'auto' or 'binary_crossentropy' instead." + "loss='auto' or loss='binary_crossentropy' instead." ) else: return _LOSSES[self.loss]( @@ -1806,7 +1805,7 @@ def _get_loss(self, sample_weight): "loss='binary_crossentropy' is not defined for multiclass" " classification with n_classes=" f"{self.n_trees_per_iteration_}, use loss=" - "'categorical_crossentropy' instead" + "'categorical_crossentropy' instead." ) else: return _LOSSES[self.loss](sample_weight=sample_weight) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index b607cd865830b..c3c4816044d3f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -586,7 +586,7 @@ def test_crossentropy_binary_problem(): y = [0, 1] gbrt = HistGradientBoostingClassifier(loss="categorical_crossentropy") with pytest.raises( - ValueError, match="'categorical_crossentropy' is not suitable for" + ValueError, match="loss='categorical_crossentropy' is not suitable for" ): gbrt.fit(X, y) From 502d3b6b5dbcd225cfc264b7ff7a86727ba93934 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 11 Jan 2022 18:06:08 +0100 Subject: [PATCH 143/143] CLN remove old whatsnew entry --- doc/whats_new/v1.1.rst | 7 ------- 1 file changed, 7 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 83659f6687323..eb7133b510c81 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -216,13 +216,6 @@ Changelog :pr:`20811`, :pr:`20567` and :pr:`21814` by :user:`Christian Lorentzen `. -- |Fix| :class:`ensemble.RandomForestClassifier`, - :class:`ensemble.RandomForestRegressor`, - :class:`ensemble.ExtraTreesClassifier`, :class:`ensemble.ExtraTreesRegressor`, - and :class:`ensemble.RandomTreesEmbedding` now raise a ``ValueError`` when - ``bootstrap=False`` and ``max_samples`` is not ``None``. - :pr:`21295` :user:`Haoyin Xu `. 
- - |API| Changed the default of :func:`max_features` to 1.0 for :class:`ensemble.RandomForestRegressor` and to `"sqrt"` for :class:`ensemble.RandomForestClassifier`. Note that these give the same fit