"""
Tests for LinearModelLoss

Note that correctness of losses is already well covered in the _loss module.
"""
import pytest
import numpy as np
from numpy.testing import assert_allclose
from scipy import linalg, optimize, sparse

from sklearn._loss.loss import (
    HalfBinomialLoss,
    HalfMultinomialLoss,
    HalfPoissonLoss,
)
from sklearn.datasets import make_low_rank_matrix
from sklearn.linear_model._linear_loss import LinearModelLoss
from sklearn.utils.extmath import squared_norm


# We do not need to test all losses, just what LinearModelLoss does on top of the
# base losses.
LOSSES = [HalfBinomialLoss, HalfMultinomialLoss, HalfPoissonLoss]


def random_X_y_coef(
    linear_model_loss, n_samples, n_features, coef_bound=(-2, 2), seed=42
):
    """Randomly generate y, X and coef in a valid range."""
    rng = np.random.RandomState(seed)
    n_dof = n_features + linear_model_loss.fit_intercept
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        random_state=rng,
    )

    if linear_model_loss._loss.is_multiclass:
        n_classes = linear_model_loss._loss.n_classes
        coef = np.empty((n_classes, n_dof))
        coef.flat[:] = rng.uniform(
            low=coef_bound[0],
            high=coef_bound[1],
            size=n_classes * n_dof,
        )
        if linear_model_loss.fit_intercept:
            raw_prediction = X @ coef[:, :-1].T + coef[:, -1]
        else:
            raw_prediction = X @ coef.T
        proba = linear_model_loss._loss.link.inverse(raw_prediction)
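        # The inverse link turns the raw predictions into per-sample class
        # probabilities (softmax for the multinomial loss), so each row of
        # proba sums to 1.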

        # y = rng.choice(np.arange(n_classes), p=proba) does not work because
        # rng.choice expects a 1d probability vector, see
        # https://stackoverflow.com/a/34190035/16761084
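        # Workaround: invert each row's cumulative distribution with a single
        # uniform draw per sample.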
        def choice_vectorized(items, p):
            s = p.cumsum(axis=1)
            r = rng.uniform(size=p.shape[0])[:, None]  # one draw per sample from the seeded rng
            k = (s < r).sum(axis=1)
            return items[k]

        y = choice_vectorized(np.arange(n_classes), p=proba).astype(np.float64)
    else:
        coef = np.empty((n_dof,))
        coef.flat[:] = rng.uniform(
            low=coef_bound[0],
            high=coef_bound[1],
            size=n_dof,
        )
        if linear_model_loss.fit_intercept:
            raw_prediction = X @ coef[:-1] + coef[-1]
        else:
            raw_prediction = X @ coef
        y = linear_model_loss._loss.link.inverse(
            raw_prediction + rng.uniform(low=-1, high=1, size=n_samples)
        )

    return X, y, coef


@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
def test_loss_gradients_are_the_same(
    base_loss, fit_intercept, sample_weight, l2_reg_strength
):
    """Test that loss and gradient are the same across different functions."""
    loss = LinearModelLoss(loss=base_loss(), fit_intercept=fit_intercept)
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=10, n_features=5, seed=42
    )

    if sample_weight == "range":
        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])

    l1 = loss.loss(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    g1 = loss.gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    l2, g2 = loss.loss_gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    g3, h3 = loss.gradient_hessp(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )

    assert_allclose(l1, l2)
    assert_allclose(g1, g2)
    assert_allclose(g1, g3)

    # same for sparse X
    X = sparse.csr_matrix(X)
    l1_sp = loss.loss(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    g1_sp = loss.gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    l2_sp, g2_sp = loss.loss_gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    g3_sp, h3_sp = loss.gradient_hessp(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )

    assert_allclose(l1, l1_sp)
    assert_allclose(l1, l2_sp)
    assert_allclose(g1, g1_sp)
    assert_allclose(g1, g2_sp)
    assert_allclose(g1, g3_sp)


@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
@pytest.mark.parametrize("X_sparse", [False, True])
def test_loss_gradients_hessp_intercept(
    base_loss, sample_weight, l2_reg_strength, X_sparse
):
    """Test that loss and gradient handle intercept correctly."""
    loss = LinearModelLoss(loss=base_loss(), fit_intercept=False)
    loss_inter = LinearModelLoss(loss=base_loss(), fit_intercept=True)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
    )

    X[:, -1] = 1  # set the last column to all ones to mimic an intercept term
    # Exclude that constant column; loss_inter adds the intercept automatically.
    X_inter = X[:, :-1]

    if X_sparse:
        X = sparse.csr_matrix(X)

    if sample_weight == "range":
        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])

    l, g = loss.loss_gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    _, hessp = loss.gradient_hessp(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    l_inter, g_inter = loss_inter.loss_gradient(
        coef, X_inter, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    _, hessp_inter = loss_inter.gradient_hessp(
        coef, X_inter, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )

    # Note that the intercept gets no L2 penalty.
    assert l == pytest.approx(
        l_inter + 0.5 * l2_reg_strength * squared_norm(coef.T[-1])
    )

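    # With fit_intercept=False, the constant column is penalized like any other
    # feature, whereas loss_inter does not penalize the intercept. Add the
    # missing L2 terms to g_inter (and below to the hessp_inter result) before
    # comparing.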
    g_inter_corrected = g_inter
    g_inter_corrected.T[-1] += l2_reg_strength * coef.T[-1]
    assert_allclose(g, g_inter_corrected)

    s = np.random.RandomState(42).randn(*coef.shape)
    h = hessp(s)
    h_inter = hessp_inter(s)
    h_inter_corrected = h_inter
    h_inter_corrected.T[-1] += l2_reg_strength * s.T[-1]
    assert_allclose(h, h_inter_corrected)


@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
def test_gradients_hessians_numerically(
    base_loss, fit_intercept, sample_weight, l2_reg_strength
):
    """Test gradients and hessians with numerical derivatives.

    Gradient should equal the numerical derivatives of the loss function.
    Hessians should equal the numerical derivatives of gradients.
    """
    loss = LinearModelLoss(loss=base_loss(), fit_intercept=fit_intercept)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
    )
    coef = coef.ravel(order="F")  # this is important only for multinomial loss

    if sample_weight == "range":
        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])

    # 1. Check gradients numerically
    eps = 1e-6
    g, hessp = loss.gradient_hessp(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    # Use a trick to get a central finite difference of accuracy 4 (five-point
    # stencil):
    # https://en.wikipedia.org/wiki/Numerical_differentiation
    # https://en.wikipedia.org/wiki/Finite_difference_coefficient
    approx_g1 = optimize.approx_fprime(
        coef,
        lambda coef: loss.loss(
            coef - eps,
            X,
            y,
            sample_weight=sample_weight,
            l2_reg_strength=l2_reg_strength,
        ),
        2 * eps,
    )  # (f(x + eps) - f(x - eps)) / (2*eps)
    approx_g2 = optimize.approx_fprime(
        coef,
        lambda coef: loss.loss(
            coef - 2 * eps,
            X,
            y,
            sample_weight=sample_weight,
            l2_reg_strength=l2_reg_strength,
        ),
        4 * eps,
    )  # (f(x + 2*eps) - f(x - 2*eps)) / (4*eps)
    approx_g = 4 / 3 * approx_g1 - 1 / 3 * approx_g2
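    # This Richardson extrapolation, 4/3 * approx_g1 - 1/3 * approx_g2
    # = (-f(x + 2*eps) + 8*f(x + eps) - 8*f(x - eps) + f(x - 2*eps)) / (12*eps),
    # cancels the O(eps**2) error of each single central difference.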
    assert_allclose(g, approx_g, rtol=1e-2, atol=1e-8)

    # 2. Check hessp numerically along the second coordinate direction
    vector = np.zeros_like(g)
    vector[1] = 1
    hess_col = hessp(vector)
    # Computation of the Hessian is particularly fragile to numerical errors when
    # doing simple finite differences. Here we compute the gradient along a path in
    # the direction of the vector and then use a least-squares regression to
    # estimate the slope.
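    # For small t, gradient(coef + t * vector) is approximately
    # gradient(coef) + t * H @ vector, so the fitted slope of the centered
    # gradients with respect to t estimates H @ vector, i.e. hess_col.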
    eps = 1e-3
    d_x = np.linspace(-eps, eps, 30)
    d_grad = np.array(
        [
            loss.gradient(
                coef + t * vector,
                X,
                y,
                sample_weight=sample_weight,
                l2_reg_strength=l2_reg_strength,
            )
            for t in d_x
        ]
    )
    d_grad -= d_grad.mean(axis=0)
    approx_hess_col = linalg.lstsq(d_x[:, np.newaxis], d_grad)[0].ravel()
    assert_allclose(approx_hess_col, hess_col, rtol=1e-3)


@pytest.mark.parametrize("fit_intercept", [False, True])
def test_multinomial_coef_shape(fit_intercept):
    """Test that multinomial LinearModelLoss respects shape of coef."""
    loss = LinearModelLoss(loss=HalfMultinomialLoss(), fit_intercept=fit_intercept)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
    )
    s = np.random.RandomState(42).randn(*coef.shape)

    l, g = loss.loss_gradient(coef, X, y)
    g1 = loss.gradient(coef, X, y)
    g2, hessp = loss.gradient_hessp(coef, X, y)
    h = hessp(s)
    assert g.shape == coef.shape
    assert h.shape == coef.shape
    assert_allclose(g, g1)
    assert_allclose(g, g2)

    coef_r = coef.ravel(order="F")
    s_r = s.ravel(order="F")
    l_r, g_r = loss.loss_gradient(coef_r, X, y)
    g1_r = loss.gradient(coef_r, X, y)
    g2_r, hessp_r = loss.gradient_hessp(coef_r, X, y)
    h_r = hessp_r(s_r)
    assert g_r.shape == coef_r.shape
    assert h_r.shape == coef_r.shape
    assert_allclose(g_r, g1_r)
    assert_allclose(g_r, g2_r)

    assert_allclose(g, g_r.reshape(loss._loss.n_classes, -1, order="F"))
    assert_allclose(h, h_r.reshape(loss._loss.n_classes, -1, order="F"))