"""
Tests for LinearModelLoss

Note that correctness of losses is already well covered in the _loss module.
"""
import pytest
import numpy as np
from numpy.testing import assert_allclose
from scipy import linalg, optimize, sparse

from sklearn._loss.loss import (
    HalfBinomialLoss,
    HalfMultinomialLoss,
    HalfPoissonLoss,
)
from sklearn.datasets import make_low_rank_matrix
from sklearn.linear_model._linear_loss import LinearModelLoss
from sklearn.utils.extmath import squared_norm


# We do not need to test all losses, just what LinearModelLoss does on top of the
# base losses.
LOSSES = [HalfBinomialLoss, HalfMultinomialLoss, HalfPoissonLoss]


def random_X_y_coef(
    linear_model_loss, n_samples, n_features, coef_bound=(-2, 2), seed=42
):
    """Randomly generate y, X and coef in a valid range."""
    rng = np.random.RandomState(seed)
    n_dof = n_features + linear_model_loss.fit_intercept
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        random_state=rng,
    )

    if linear_model_loss._loss.is_multiclass:
        n_classes = linear_model_loss._loss.n_classes
        coef = np.empty((n_classes, n_dof))
        coef.flat[:] = rng.uniform(
            low=coef_bound[0],
            high=coef_bound[1],
            size=n_classes * n_dof,
        )
        if linear_model_loss.fit_intercept:
            raw_prediction = X @ coef[:, :-1].T + coef[:, -1]
        else:
            raw_prediction = X @ coef.T
        proba = linear_model_loss._loss.link.inverse(raw_prediction)
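        # The inverse link turns the raw predictions into per-sample class
        # probabilities (softmax for the multinomial loss), so each row of
        # proba sums to 1.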

        # y = rng.choice(np.arange(n_classes), p=proba) does not work because
        # rng.choice expects a 1d probability vector, see
        # https://stackoverflow.com/a/34190035/16761084
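        # Workaround: invert each row's cumulative distribution with a single
        # uniform draw per sample.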
        def choice_vectorized(items, p):
            s = p.cumsum(axis=1)
            r = rng.uniform(size=p.shape[0])[:, None]  # one draw per sample from the seeded rng
            k = (s < r).sum(axis=1)
            return items[k]

        y = choice_vectorized(np.arange(n_classes), p=proba).astype(np.float64)
    else:
        coef = np.empty((n_dof,))
        coef.flat[:] = rng.uniform(
            low=coef_bound[0],
            high=coef_bound[1],
            size=n_dof,
        )
        if linear_model_loss.fit_intercept:
            raw_prediction = X @ coef[:-1] + coef[-1]
        else:
            raw_prediction = X @ coef
        y = linear_model_loss._loss.link.inverse(
            raw_prediction + rng.uniform(low=-1, high=1, size=n_samples)
        )

    return X, y, coef


@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
def test_loss_gradients_are_the_same(
    base_loss, fit_intercept, sample_weight, l2_reg_strength
):
    """Test that loss and gradient are the same across different functions."""
    loss = LinearModelLoss(loss=base_loss(), fit_intercept=fit_intercept)
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=10, n_features=5, seed=42
    )

    if sample_weight == "range":
        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])

    l1 = loss.loss(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    g1 = loss.gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    l2, g2 = loss.loss_gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    g3, h3 = loss.gradient_hessp(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )

    assert_allclose(l1, l2)
    assert_allclose(g1, g2)
    assert_allclose(g1, g3)

    # same for sparse X
    X = sparse.csr_matrix(X)
    l1_sp = loss.loss(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    g1_sp = loss.gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    l2_sp, g2_sp = loss.loss_gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    g3_sp, h3_sp = loss.gradient_hessp(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )

    assert_allclose(l1, l1_sp)
    assert_allclose(l1, l2_sp)
    assert_allclose(g1, g1_sp)
    assert_allclose(g1, g2_sp)
    assert_allclose(g1, g3_sp)


@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
@pytest.mark.parametrize("X_sparse", [False, True])
def test_loss_gradients_hessp_intercept(
    base_loss, sample_weight, l2_reg_strength, X_sparse
):
    """Test that loss and gradient handle intercept correctly."""
    loss = LinearModelLoss(loss=base_loss(), fit_intercept=False)
    loss_inter = LinearModelLoss(loss=base_loss(), fit_intercept=True)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
    )

    X[:, -1] = 1  # set the last column to all ones to mimic an intercept term
    # Exclude that constant column; loss_inter adds the intercept automatically.
    X_inter = X[:, :-1]

    if X_sparse:
        X = sparse.csr_matrix(X)

    if sample_weight == "range":
        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])

    l, g = loss.loss_gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    _, hessp = loss.gradient_hessp(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    l_inter, g_inter = loss_inter.loss_gradient(
        coef, X_inter, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    _, hessp_inter = loss_inter.gradient_hessp(
        coef, X_inter, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )

    # Note that the intercept gets no L2 penalty.
    assert l == pytest.approx(
        l_inter + 0.5 * l2_reg_strength * squared_norm(coef.T[-1])
    )

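    # With fit_intercept=False, the constant column is penalized like any other
    # feature, whereas loss_inter does not penalize the intercept. Add the
    # missing L2 terms to g_inter (and below to the hessp_inter result) before
    # comparing.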
    g_inter_corrected = g_inter
    g_inter_corrected.T[-1] += l2_reg_strength * coef.T[-1]
    assert_allclose(g, g_inter_corrected)

    s = np.random.RandomState(42).randn(*coef.shape)
    h = hessp(s)
    h_inter = hessp_inter(s)
    h_inter_corrected = h_inter
    h_inter_corrected.T[-1] += l2_reg_strength * s.T[-1]
    assert_allclose(h, h_inter_corrected)


@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
def test_gradients_hessians_numerically(
    base_loss, fit_intercept, sample_weight, l2_reg_strength
):
    """Test gradients and hessians with numerical derivatives.

    Gradient should equal the numerical derivatives of the loss function.
    Hessians should equal the numerical derivatives of gradients.
    """
    loss = LinearModelLoss(loss=base_loss(), fit_intercept=fit_intercept)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
    )
    coef = coef.ravel(order="F")  # this is important only for multinomial loss

    if sample_weight == "range":
        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])

    # 1. Check gradients numerically
    eps = 1e-6
    g, hessp = loss.gradient_hessp(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    # Use a trick to get a central finite difference of accuracy 4 (five-point
    # stencil):
    # https://en.wikipedia.org/wiki/Numerical_differentiation
    # https://en.wikipedia.org/wiki/Finite_difference_coefficient
    approx_g1 = optimize.approx_fprime(
        coef,
        lambda coef: loss.loss(
            coef - eps,
            X,
            y,
            sample_weight=sample_weight,
            l2_reg_strength=l2_reg_strength,
        ),
        2 * eps,
    )  # (f(x + eps) - f(x - eps)) / (2*eps)
    approx_g2 = optimize.approx_fprime(
        coef,
        lambda coef: loss.loss(
            coef - 2 * eps,
            X,
            y,
            sample_weight=sample_weight,
            l2_reg_strength=l2_reg_strength,
        ),
        4 * eps,
    )  # (f(x + 2*eps) - f(x - 2*eps)) / (4*eps)
    approx_g = 4 / 3 * approx_g1 - 1 / 3 * approx_g2
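    # This Richardson extrapolation, 4/3 * approx_g1 - 1/3 * approx_g2
    # = (-f(x + 2*eps) + 8*f(x + eps) - 8*f(x - eps) + f(x - 2*eps)) / (12*eps),
    # cancels the O(eps**2) error of each single central difference.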
    assert_allclose(g, approx_g, rtol=1e-2, atol=1e-8)

    # 2. Check hessp numerically along the second coordinate direction
    vector = np.zeros_like(g)
    vector[1] = 1
    hess_col = hessp(vector)
    # Computation of the Hessian is particularly fragile to numerical errors when
    # doing simple finite differences. Here we compute the gradient along a path in
    # the direction of the vector and then use a least-squares regression to
    # estimate the slope.
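    # For small t, gradient(coef + t * vector) is approximately
    # gradient(coef) + t * H @ vector, so the fitted slope of the centered
    # gradients with respect to t estimates H @ vector, i.e. hess_col.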
    eps = 1e-3
    d_x = np.linspace(-eps, eps, 30)
    d_grad = np.array(
        [
            loss.gradient(
                coef + t * vector,
                X,
                y,
                sample_weight=sample_weight,
                l2_reg_strength=l2_reg_strength,
            )
            for t in d_x
        ]
    )
    d_grad -= d_grad.mean(axis=0)
    approx_hess_col = linalg.lstsq(d_x[:, np.newaxis], d_grad)[0].ravel()
    assert_allclose(approx_hess_col, hess_col, rtol=1e-3)


@pytest.mark.parametrize("fit_intercept", [False, True])
def test_multinomial_coef_shape(fit_intercept):
    """Test that multinomial LinearModelLoss respects shape of coef."""
    loss = LinearModelLoss(loss=HalfMultinomialLoss(), fit_intercept=fit_intercept)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
    )
    s = np.random.RandomState(42).randn(*coef.shape)

    l, g = loss.loss_gradient(coef, X, y)
    g1 = loss.gradient(coef, X, y)
    g2, hessp = loss.gradient_hessp(coef, X, y)
    h = hessp(s)
    assert g.shape == coef.shape
    assert h.shape == coef.shape
    assert_allclose(g, g1)
    assert_allclose(g, g2)

    coef_r = coef.ravel(order="F")
    s_r = s.ravel(order="F")
    l_r, g_r = loss.loss_gradient(coef_r, X, y)
    g1_r = loss.gradient(coef_r, X, y)
    g2_r, hessp_r = loss.gradient_hessp(coef_r, X, y)
    h_r = hessp_r(s_r)
    assert g_r.shape == coef_r.shape
    assert h_r.shape == coef_r.shape
    assert_allclose(g_r, g1_r)
    assert_allclose(g_r, g2_r)

    assert_allclose(g, g_r.reshape(loss._loss.n_classes, -1, order="F"))
    assert_allclose(h, h_r.reshape(loss._loss.n_classes, -1, order="F"))