From d5e88108a71c6c256b67948515f20955cc96fabf Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Tue, 18 Jul 2017 21:50:10 +0200
Subject: [PATCH 001/269] [WIP] Add Generalized Linear Model, issue #5975, initial commit

---
 sklearn/linear_model/__init__.py       |   7 +
 sklearn/linear_model/glm.py            | 872 +++++++++++++++++++++++++
 sklearn/linear_model/tests/test_glm.py |  73 +++
 3 files changed, 952 insertions(+)
 create mode 100644 sklearn/linear_model/glm.py
 create mode 100644 sklearn/linear_model/tests/test_glm.py

diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py
index 2e01990ccce8c..5acc51e9dc87f 100644
--- a/sklearn/linear_model/__init__.py
+++ b/sklearn/linear_model/__init__.py
@@ -18,6 +18,12 @@
                                  lasso_path, enet_path, MultiTaskLasso,
                                  MultiTaskElasticNet, MultiTaskElasticNetCV,
                                  MultiTaskLassoCV)
+from .glm import (Link, IdentityLink, LogLink,
+                  ExponentialDispersionModel, TweedieDistribution,
+                  NormalDistribution, GaussianDistribution,
+                  PoissonDistribution, GammaDistribution,
+                  InverseGaussianDistribution, GeneralizedHyperbolicSecand,
+                  GeneralizedLinearModel)
 from .huber import HuberRegressor
 from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber
 from .stochastic_gradient import SGDClassifier, SGDRegressor
@@ -38,6 +44,7 @@
            'BayesianRidge',
            'ElasticNet',
            'ElasticNetCV',
+           'GeneralizedLinearModel',
            'Hinge',
            'Huber',
            'HuberRegressor',
diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py
new file mode 100644
index 0000000000000..8b6eb8f3bf16c
--- /dev/null
+++ b/sklearn/linear_model/glm.py
@@ -0,0 +1,872 @@
+"""
+Generalized Linear Models with Exponential Dispersion Family
+"""
+
+# Author: Christian Lorentzen
+# License: BSD 3 clause
+
+# TODO: Which name? GeneralizedLinearModel vs GeneralizedLinearRegression.
+#       So far, it is GeneralizedLinearModel, since it could very easily
+#       extended by Bernoulli/Binomial distribution.
+# TODO: Which name/symbol for coefficients and weights in docu?
+#       sklearn.linear_models uses w for coefficients.
+#       So far, coefficients=beta and weight=w (as standard literature)
+# TODO: Add l2-penalty
+# TODO: Add l1-penalty (elastic net)
+# TODO: Add cross validation
+# TODO: Write docu and examples
+
+# Design Decisions:
+# - The link funtion (instance of class Link) is necessary for the evaluation
+#   of deviance, score, Fisher and Hessian matrix as functions of the
+#   coefficients, which is needed by optimizers.
+#   Solution: link as argument in those functions
+
+from __future__ import division
+from abc import ABCMeta, abstractmethod, abstractproperty
+import numbers
+import numpy as np
+from scipy import linalg, optimize, sparse
+import warnings
+from .base import LinearModel, LinearRegression
+from ..base import RegressorMixin
+from ..utils import check_X_y
+from ..utils.extmath import safe_sparse_dot
+from ..utils.optimize import newton_cg
+from ..utils.validation import check_is_fitted
+
+
+
+class Link(metaclass=ABCMeta):
+    """Abstract base class for Link funtions
+    """
+
+    @abstractmethod
+    def link(self, mu):
+        """The link function g(mu) with argument mu=E[Y] returns the
+        linear predictor.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def derivative(self, mu):
+        """Derivative of the link g'(mu).
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def inverse(self, lin_pred):
+        """The inverse link function h(lin_pred) with the linear predictor as
+        argument returns mu=E[Y].
+ """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Derivative of the inverse link function h'(lin_pred). + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative2(self, lin_pred): + """Second derivative of the inverse link function h''(lin_pred). + """ + raise NotImplementedError + +class IdentityLink(Link): + """The identity link function g(x)=x. + """ + + def link(self, mu): + return mu + + def derivative(self, mu): + return np.ones_like(mu) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.zeros_like(lin_pred) + + +class LogLink(Link): + """The log link function g(x)=log(x). + """ + + def link(self, mu): + return np.log(mu) + + def derivative(self, mu): + return 1./mu + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.exp(lin_pred) + + +class ExponentialDispersionModel(metaclass=ABCMeta): + """Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. + + Attributes + ---------- + lower_bound + upper_bound + + Methods + ------- + in_y_range + unit_variance + unit_variance_derivative + variance + variance_derivative + unit_deviance + unit_deviance_derivative + deviance + deviance_derivative + starting_mu + + _score + _fisher_matrix + _observed_information + _deviance + _deviance_derivative + _deviance_hessian + + References + ---------- + See https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + + @abstractproperty + def lower_bound(self): + """The lower bound of values of Y~EDM. + """ + raise NotImplementedError() + + @abstractproperty + def upper_bound(self): + """The upper bound of values of Y~EDM. + """ + raise NotImplementedError() + + @abstractmethod + def in_y_range(self, x): + """Returns true if x is in the valid range of Y~EDM. + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance(self, mu): + """The unit variance :math:`v(mu)` determines the variance as + a function of the mean mu by + :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as + + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance_derivative(self, mu): + """The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + """ + raise NotImplementedError() + + def variance(self, mu, phi=1, weight=1): + """The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is + :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, + with unit variance v(mu). + """ + return phi/weight * self.unit_variance(mu) + + def variance_derivative(self, mu, phi=1, weight=1): + """The derivative of the variance w.r.t. mu, + :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] + =phi/w_i*v'(\mu_i)`, with unit variance v(mu). 
+ """ + return phi/weight * self.unit_variance_derivative(mu) + + @abstractmethod + def unit_deviance(self, y, mu): + """The unit_deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is given by + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + """ + raise NotImplementedError() + + def unit_deviance_derivative(self, y, mu): + """The derivative w.r.t. mu of the unit_deviance + :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. + + Returns + ------- + derivative: array, shape = (n_samples,) + """ + return -2*(y-mu)/self.unit_variance(mu) + + def deviance(self, y, mu, weight=1): + """The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) + with weight :math:`w_i` and unit_deviance :math:`d(y,mu)`. + In terms of the likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{w}) + - loglike(y,y,\frac{phi}{w})\right).` + """ + return np.sum(weight*self.unit_deviance(y,mu)) + + def _deviance(self, coef, X, y, weight, link): + """The deviance as a function of the coefficients ``coef`` + (:math:`beta`). + """ + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + return self.deviance(y, mu, weight) + + def deviance_derivative(self, y, mu, weight=1): + """The derivative w.r.t. mu of the deviance.` + """ + return weight*self.unit_deviance_derivative(y,mu) + + def _score(self, coef, phi, X, y, weight, link): + """The score function :math:`s` is the derivative of the + log-likelihood w.r.t. the ``coef`` (:math:`\beta`). + It is given by + + .. math: + + \mathbf{s}(\boldsymbol{\beta}) = \mathbf{X}^T \mathbf{D} + \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, + + with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}(y_1),\ldots)`. + """ + n_samples = X.shape[0] + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + d = link.inverse_derivative(lin_pred) + d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) + score = safe_sparse_dot(X.T, temp, dense_output=False) + return score + + def _fisher_matrix(self, coef, phi, X, y, weight, link): + """The Fisher information matrix, also known as expected + information matrix. It is given by + + .. math: + + \mathbf{F}(\boldsymbol{\beta}) = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} + \partial\boldsymbol{\beta}^T}\right] + = \mathbf{X}^T W \mathbf{X} \,, + + with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, + see score function. + """ + n_samples = X.shape[0] + lin_pred = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + d2 = link.inverse_derivative(lin_pred)**2 + d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) + fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) + return fisher_matrix + + def _observed_information(self, coef, phi, X, y, weight, link): + """The observed information matrix, also known as the negative of + the Hessian matrix of the log-likelihood. It is given by + + .. 
math:
+
+            \mathbf{H}(\boldsymbol{\beta}) =
+            -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta}
+            \partial\boldsymbol{\beta}^T}
+            = \mathbf{X}^T \left[
+            - \mathbf{D}' \mathbf{R}
+            + \mathbf{D}^2 \mathbf{V} \mathbf{R}
+            + \mathbf{D}^2
+            \right] \boldsymbol{\Sigma}^{-1} \mathbf{X} \,,
+
+        with :math:`\mathbf{R} = \mathrm{diag}(y_i - \mu_i)`,
+        :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{
+            v(\mu_i)}
+            \right)`,
+        see score function and Fisher matrix.
+        """
+        n_samples = X.shape[0]
+        lin_pred = safe_sparse_dot(X, coef, dense_output=True)
+        mu = link.inverse(lin_pred)
+        sigma_inv = 1/self.variance(mu, phi=phi, weight=weight)
+        dp = link.inverse_derivative2(lin_pred)
+        d2 = link.inverse_derivative(lin_pred)**2
+        v = self.unit_variance_derivative(mu)/self.unit_variance(mu)
+        r = y - mu
+        temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0),
+            shape=(n_samples, n_samples))
+        temp = safe_sparse_dot(temp, X, dense_output=False)
+        observed_information = safe_sparse_dot(X.T, temp, dense_output=False)
+        return observed_information
+
+    def _deviance_derivative(self, coef, X, y, weight, link):
+        """The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a
+        function of the coefficients ``coef``.
+        This is equivalent to :math:`-2\phi` times the score function
+        :math:`s` (derivative of the log-likelihood).
+        """
+        score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight,
+            link=link)
+        return -2*score
+
+    def _deviance_hessian(self, coef, X, y, weight, link):
+        """The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance
+        as a function of the coefficients ``coef``.
+        This is equivalent to :math:`+2\phi` times the observed information
+        matrix.
+        """
+        info_matrix = self._observed_information(coef=coef, phi=1,
+            X=X, y=y, weight=weight, link=link)
+        return 2*info_matrix
+
+    def starting_mu(self, y, weight=1):
+        """Starting values for the mean mu_i in IRLS."""
+        return (weight*y+np.mean(weight*y))/(2.*np.sum(np.ones_like(y)*weight))
+
+
+class TweedieDistribution(ExponentialDispersionModel):
+    """A class for the Tweedie distribution.
+    They have mu=E[X] and Var[X] \propto mu**power.
+
+    Attributes
+    ----------
+    power : float
+            The variance power of the unit_variance
+            :math:`v(mu) = mu^{power}`.
+ """ + def __init__(self, power=0): + self.power = power + self._upper_bound = np.Inf + self._upper_compare = lambda x: np.less(x, self.upper_bound) + if power < 0: + #Extreme Stable + self._lower_bound = -np.Inf + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power == 0: + #GaussianDistribution + self._lower_bound = -np.Inf + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif (power > 0) and (power < 1): + raise ValueError('For 0 1) and (power < 2): + #Compound Poisson + self._lower_bound = 0 + self._lower_compare = ( + lambda x: np.greater_equal(x, self.lower_bound)) + elif power == 2: + #GammaDistribution + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif (power > 2) and (power < 3): + #Positive Stable + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power == 3: + #InverseGaussianDistribution + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + elif power > 3: + #Positive Stable + self._lower_bound = 0 + self._lower_compare = lambda x: np.greater(x, self.lower_bound) + + @property + def power(self): + return self._power + + @power.setter + def power(self, power): + if not isinstance(power, numbers.Real): + raise TypeError('power must be a real number, input was {0}' + .format(power)) + self._power = power + + @property + def lower_bound(self): + return self._lower_bound + + @property + def upper_bound(self): + return self._upper_bound + + def in_y_range(self, x): + return np.logical_and(self._lower_compare(x), self._upper_compare(x)) + + def unit_variance(self, mu): + """The unit variance of a Tweedie distribution is v(mu)=mu**power. + """ + return np.power(mu, self.power) + + def unit_variance_derivative(self, mu): + """The derivative of the unit variance of a Tweedie distribution is + v(mu)=power*mu**(power-1). + """ + return self.power*np.power(mu, self.power-1) + + def unit_deviance(self, y, mu): + p = self.power + if p == 0: + #NormalDistribution + return (y-mu)**2 + if p == 1: + #PoissonDistribution + return 2 * (np.where(y==0,0,y*np.log(y/mu))-y+mu) + elif p == 2: + #GammaDistribution + return 2 * (np.log(mu/y)+y/mu-1) + else: + #return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) + return 2 * (np.power(np.maximum(y,0), 2-p)/((1-p)*(2-p)) + - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + + def likelihood(self, y, X, beta, phi, weight=1): + raise NotImplementedError('This function is not (yet) implemented.') + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super(NormalDistribution, self).__init__(power=0) + +GaussianDistribution = NormalDistribution + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def __init__(self): + super(PoissonDistribution, self).__init__(power=1) + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super(GammaDistribution, self).__init__(power=2) + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super(InverseGaussianDistribution, self).__init__(power=3) + +class GeneralizedHyperbolicSecand(ExponentialDispersionModel): + """A class for the von Generalized Hyperbolic Secand (GHS) distribution. 
+ + The GHS distribution is for data y in (-inf, inf). + """ + def __init__(self): + self._lower_bound = -np.Inf + self._upper_bound = np.Inf + + @property + def lower_bound(self): + return self._lower_bound + + @property + def upper_bound(self): + return self._upper_bound + + def in_y_range(self, x): + np.logical_and( + np.greater(x, self.lower_bound), + np.less(x, self.lower_bound) + ) + + def unit_variance(self, mu): + return 1 + mu**2 + + def unit_variance_derivative(self, mu): + return 2*mu + + def unit_deviance(self, y, mu): + return (2*y*(np.arctan(y) - np.arctan(mu)) + + np.log((1+mu**2)/(1+y**2))) + + + +class GeneralizedLinearModel(LinearModel, RegressorMixin): + """ + Class to fit a Generalized Linear Model (GLM) based on reproductive + Exponential Dispersion Models (EDM). + + Assumptions: + + - The target values y_i are realizations of random variables + :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion + parameter :math:`\phi` and weights :math:`w_i`. + - The expectation of :math:`Y_i` is :math:`mu_i=\mathrm{E}[Y]=h(\eta_i)` + whith the linear predictor :math:`\eta=X*\beta`, inverse link function + :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` + to be estimated. + + Note that the first assumption implies + :math:`\mathrm{Var}[Y_i]=\frac{\phi}{w_i} v(\mu_i)` with uni variance + function :math:`v(\mu)`. + + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments :math:`E[Y_i]=\mu_i=h(\eta_i)` and + :math:`Var[Y_i]=\frac{\phi}{w_i} v(\mu_i)` + + The parameters :math:`\beta` are estimated by maximum likelihood which is + equivalent to minimizing the deviance. + + TODO: Estimation of the dispersion parameter phi. + + TODO: Notes on 'scaled' Poisson and weights + + Parameters + ---------- + fit_intercept : boolean, optional, default True + whether to calculate the intercept for this model. If set + to False, no intercept will be used in calculations + (e.g. data is expected to be already centered). + + family : ExponentialDispersionModel, optional, default NormalDistribution() + the distributional assumption of the GLM + + link : Link, optional, default IdentityLink() + the link function (class) of the GLM + + fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' + method for estimation of the dispersion parameter phi. Whether to use + the chi squared statisic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'irls', 'newton-cg', 'lbfgs'}, defaul 'irls' + Algorithm to use in the optimization problem. + + - 'irls' is iterated reweighted least squares. It is the standard + algorithm for GLMs. + + - 'newton-cg', 'lbfgs' + + max_iter : int, default 100 + TODO + + tol : float + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative of + the deviance). + + start_params : {array shape (n_features, ), 'ols'}, default None + sets the start values for coef_ in the fit. + If None, default values are taken. + If 'ols' the result of an ordinary least squares in the link space + (linear predictor) is taken. + If an array is given, these values are taken as coef_ to start with. + If fit_intercept is true, the first value is assumed to be the start + value for the intercept_. + + verbose : int, default: 0 + For the lbfgs solver set verbose to any positive + number for verbosity. 
+ + Attributes + ---------- + coef_ : array, shape (1, n_features) + Estimated coefficients for the linear predictor (X*coef_) in the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\phi` if fit_dispersion is set. + + n_iter_ : int + Actual number of iterations of the solver. + + Notes + ----- + + References + ---------- + TODO + """ + + def __init__(self, fit_intercept=True, family=NormalDistribution(), + link=IdentityLink(), fit_dispersion='chisqr', solver='irls', max_iter=100, + tol=1e-4, start_params=None, verbose=0): + self.fit_intercept = fit_intercept + self.family = family + self.link = link + self.fit_dispersion = fit_dispersion + self.solver = solver + self.max_iter = 100 + self.tol = tol + self.start_params = start_params + self.verbose = verbose + + def fit(self, X, y, weight=None): + """ + Fit a generalized linear model. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Training data + + y : numpy array of shape [n_samples] + Target values + + weight : numpy array of shape [n_samples] + Individual weights for each sample. + Var[Y_i]=phi/weight_i * v(mu) + If Y_i ~ EDM(mu, phi/w_i) then + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)) + + Returns + ------- + self : returns an instance of self. + """ + if not isinstance(self.family, ExponentialDispersionModel): + raise ValueError("The argument family must be an instance of class" + "ExponentialDispersionModel.") + if not isinstance(self.fit_intercept, bool): + raise ValueError("The argument fit_intercept must be bool," + " got {0}".format(self.fit_intercept)) + if not self.solver in ['irls', 'lbfgs', 'newton-cg']: + raise ValueError("GLM Regression supports only irls, lbfgs and" + "newton-cg solvers, got {0}".format(self.solver)) + if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: + raise ValueError("Maximum number of iteration must be positive;" + " got (max_iter={0!r})".format(self.max_iter)) + if not isinstance(self.tol, numbers.Number) or self.tol < 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol={0!r})".format(self.tol)) + start_params = self.start_params + if start_params is not None and start_params is not 'ols': + start_params = np.atleast_1d(start_params) + if start_params.shape[0] != X.shape[1] + self.fit_intercept: + raise ValueError("Start values for parameters must have the" + "right length; required length {0}, got {1}".format( + X.shape[1] + self.fit_intercept, start_params.shape[0])) + + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + y_numeric=True, multi_output=False) + y = y.astype(np.float64) + + if not np.all(self.family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}".format(self.family.__class__.__name__)) + + if weight is None: + weight = np.ones_like(y) + elif np.isscalar(weight): + weight = weight*np.ones_like(y) + else: + weight = np.atleast_1d(weight) + if weight.ndim > 1: + raise ValueError("Weights must be 1D array or scalar") + elif weight.shape[0] != y.shape[0]: + raise ValueError("Weights must have the same length as y") + + + if self.fit_intercept: + #intercept is first column <=> coef[0] is for intecept + if sparse.issparse(X): + Xnew = sparse.hstack([np.ones([X.shape[0],1]), X]) + else: + Xnew = np.concatenate((np.ones((X.shape[0],1)), X), axis=1) + else: + Xnew = X + + n_samples, n_features = Xnew.shape + + #Note: Since phi does not enter 
the estimation of mu_i=E[y_i] + # set it to 1 where convenient. + + #set start values for coef + coef = None + if start_params is None: + #Use mu_start and apply one irls step to calculate coef + mu = self.family.starting_mu(y, weight) + #linear predictor + eta = self.link.link(mu) + #h'(eta) + hp = self.link.inverse_derivative(eta) + #working weights w, in principle a diagonal matrix + #therefore here just as 1d array + w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + wroot = np.sqrt(w) + #working observations + yw = eta + (y-mu)/hp + #least squares rescaled with wroot + wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) + X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) + yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) + coef = linalg.lstsq(X_rescale, yw_rescale)[0] + elif start_params is 'ols': + reg = LinearRegression(copy_X=False, + fit_intercept=False) + reg.fit(Xnew, self.link.link(y)) + coef = reg.coef_ + else: + coef = start_params + + #algorithms for optimiation + #TODO: Parallelize it + self.n_iter_ = 0 + converged = False + if self.solver == 'irls': + #linear predictor + eta = safe_sparse_dot(Xnew, coef, dense_output=True) + mu = self.link.inverse(eta) + while self.n_iter_ < self.max_iter: + self.n_iter_ += 1 + #coef_old not used so far. + #coef_old = coef + #h'(eta) + hp = self.link.inverse_derivative(eta) + #working weights w, in principle a diagonal matrix + #therefore here just as 1d array + w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + wroot = np.sqrt(w) + #working observations + yw = eta + (y-mu)/hp + #least squares rescaled with wroot + wroot = sparse.dia_matrix((wroot, 0), + shape=(n_samples, n_samples)) + X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) + yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) + coef, residues, rank, singular_ = ( + linalg.lstsq(X_rescale, yw_rescale)) + + #updated linear predictor + #do it here for updated values for tolerance + eta = safe_sparse_dot(Xnew, coef, dense_output=True) + mu = self.link.inverse(eta) + + #which tolerace? |coef - coef_old| or gradient? + #use gradient for compliance with newton-cg and lbfgs + #TODO: faster computation of gradient, use mu and eta directly + gradient = self.family._deviance_derivative(coef=coef, + X=Xnew, y=y, weight=weight, link=self.link) + if (np.max(np.abs(gradient)) <= self.tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})".format(self.max_iter)) + + #TODO: performance: make one function return both deviance and gradient + elif self.solver == 'lbfgs': + func = self.family._deviance + fprime = self.family._deviance_derivative + args = (Xnew, y, weight, self.link) + coef, loss, info = optimize.fmin_l_bfgs_b( + func, coef, fprime=fprime, + args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." 
+ " Increase the number of iterations.") + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}".format( + info["task"])) + self.n_iter_ = info['nit'] + elif self.solver == 'newton-cg': + func = self.family._deviance + grad = self.family._deviance_derivative + def grad_hess(coef, X, y, weight, link): + grad = (self.family + ._deviance_derivative(coef, X, y, weight, link)) + hessian = (self.family + ._deviance_hessian(coef, X, y, weight,link)) + def Hs(s): + ret = np.dot(hessian, s) + return ret + return grad, Hs + hess = grad_hess + args = (Xnew, y, weight, self.link) + coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, + maxiter=self.max_iter, tol=self.tol) + self.coef_ = coef + + if self.fit_intercept is True: + self.intercept_ = coef[0] + self.coef_ = coef[1:] + else: + self.coef_ = coef + + if self.fit_dispersion in ['chisqr', 'deviance']: + self.dispersion_ = self.estimate_phi(y, X, weight) + + return self + + def predict(self, X, weight=1): + check_is_fitted(self, "coef_") + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + return mu*weight + + def estimate_phi(self, y, X, weight): + n_samples, n_features = X.shape + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + if self.fit_dispersion == 'chisqr': + chisq = np.sum(weight*(y-mu)**2/self.family.unit_variance(mu)) + return chisq/(n_samples - n_features) + elif self.fit_dispersion == 'deviance': + dev = self.family.deviance(y, mu, weight) + return dev/(n_samples - n_features) + + def score(self, X, y, weight=1): + """The natural score for a GLM is -deviance. + Returns the weight averaged negitive deviance (the better the score, + the better the fit). Maximum score is therefore 0. + """ + #RegressorMixin has R^2 score. 
+ #TODO: Make it more compatible with the score function in + # sklearn.metrics.regression.py + eta = safe_sparse_dot(X, self.coef_, dense_output=True) + if self.fit_intercept is True: + eta += self.intercept_ + mu = self.link.inverse(eta) + output_errors = self.family.unit_deviance(y,mu) + weight = weight * np.ones_like(y) + return np.average(output_errors, weights=weight) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py new file mode 100644 index 0000000000000..2a88a7ec899d9 --- /dev/null +++ b/sklearn/linear_model/tests/test_glm.py @@ -0,0 +1,73 @@ +import numpy as np + +from sklearn.linear_model.glm import (Link, IdentityLink, LogLink, + ExponentialDispersionModel, TweedieDistribution, + NormalDistribution, GaussianDistribution, + PoissonDistribution, GammaDistribution, + InverseGaussianDistribution, GeneralizedHyperbolicSecand, + GeneralizedLinearModel) + +from sklearn.utils.testing import (assert_equal, assert_array_equal, + assert_array_almost_equal) + +def test_family_bounds(): + """Test the valid range of distributions + """ + family = NormalDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [True, True, True]) + + family = PoissonDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, True, True]) + + family = TweedieDistribution(power=1.5) + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, True, True]) + + family = GammaDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + + family = InverseGaussianDistribution() + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + + family = TweedieDistribution(power=4.5) + result = family.in_y_range([-1,0,1]) + assert_array_equal(result, [False, False, True]) + +def test_glm_identiy_regression(): + """Test linear regression on a simple dataset + """ + coef = [1,2] + X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + y = np.dot(X, coef) + for solver in ['irls', 'lbfgs', 'newton-cg']: + for family in (GaussianDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)): + glm = GeneralizedLinearModel(family=family, + fit_intercept=False, solver=solver) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + +def test_glm_log_regression(): + """Test linear regression on a simple dataset + """ + coef = [1,2] + X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + y = np.exp(np.dot(X, coef)) + #for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls']: + #for family in [GaussianDistribution(), PoissonDistribution(), + # GammaDistribution(), InverseGaussianDistribution(), + # TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]: + for family in [GaussianDistribution()]: + glm = GeneralizedLinearModel(family=family, + link=LogLink(), + fit_intercept=False, solver=solver, start_params='ols') + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + +#TODO: Test compatibility with R's glm, glmnet From 2fc189d8351c9710c1329750545539afe3e6e40c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 19 Jul 2017 17:33:04 +0200 Subject: [PATCH 002/269] [WIP] Add Generalized Linear Models (#9405) * Fixed pep8 * Fixed flake8 * Rename GeneralizedLinearModel as GeneralizedLinearRegressor * Use of six.with_metaclass * PEP257: summary should be on same line as quotes * Docstring of class 
GeneralizedLinearRegressor: \ before mu * Arguments family and link accept strings * Use of ConvergenceWarning --- sklearn/linear_model/__init__.py | 13 +- sklearn/linear_model/glm.py | 231 ++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 75 ++++---- 3 files changed, 180 insertions(+), 139 deletions(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 5acc51e9dc87f..0c5840f343a3a 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,12 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from .glm import (Link, IdentityLink, LogLink, - ExponentialDispersionModel, TweedieDistribution, - NormalDistribution, GaussianDistribution, - PoissonDistribution, GammaDistribution, - InverseGaussianDistribution, GeneralizedHyperbolicSecand, - GeneralizedLinearModel) +from .glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -44,7 +40,6 @@ 'BayesianRidge', 'ElasticNet', 'ElasticNetCV', - 'GeneralizedLinearModel', 'Hinge', 'Huber', 'HuberRegressor', @@ -84,4 +79,6 @@ 'orthogonal_mp', 'orthogonal_mp_gram', 'ridge_regression', - 'RANSACRegressor'] + 'RANSACRegressor', + 'GeneralizedLinearRegressor', + 'TweedieDistribution'] diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 8b6eb8f3bf16c..cf91a64fafc12 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -5,9 +5,6 @@ # Author: Christian Lorentzen # License: BSD 3 clause -# TODO: Which name? GeneralizedLinearModel vs GeneralizedLinearRegression. -# So far, it is GeneralizedLinearModel, since it could very easily -# extended by Bernoulli/Binomial distribution. # TODO: Which name/symbol for coefficients and weights in docu? # sklearn.linear_models uses w for coefficients. # So far, coefficients=beta and weight=w (as standard literature) @@ -17,6 +14,10 @@ # TODO: Write docu and examples # Design Decisions: +# - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. +# So far, it is GeneralizedLinearModel, since it could very easily +# extended by Bernoulli/Binomial distribution. +# Solution: GeneralizedLinearRegressor # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. @@ -28,16 +29,17 @@ import numpy as np from scipy import linalg, optimize, sparse import warnings -from .base import LinearModel, LinearRegression -from ..base import RegressorMixin +from .base import LinearRegression +from ..base import BaseEstimator, RegressorMixin +from ..exceptions import ConvergenceWarning +from ..externals import six from ..utils import check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted - -class Link(metaclass=ABCMeta): +class Link(six.with_metaclass(ABCMeta)): """Abstract base class for Link funtions """ @@ -73,6 +75,7 @@ def inverse_derivative2(self, lin_pred): """ raise NotImplementedError + class IdentityLink(Link): """The identity link function g(x)=x. 
""" @@ -113,7 +116,7 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) -class ExponentialDispersionModel(metaclass=ABCMeta): +class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): """Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -235,7 +238,7 @@ def deviance(self, y, mu, weight=1): \left(loglike(y,\mu,\frac{phi}{w}) - loglike(y,y,\frac{phi}{w})\right).` """ - return np.sum(weight*self.unit_deviance(y,mu)) + return np.sum(weight*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weight, link): """The deviance as a function of the coefficients ``coef`` @@ -248,7 +251,7 @@ def _deviance(self, coef, X, y, weight, link): def deviance_derivative(self, y, mu, weight=1): """The derivative w.r.t. mu of the deviance.` """ - return weight*self.unit_deviance_derivative(y,mu) + return weight*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weight, link): """The score function :math:`s` is the derivative of the @@ -269,7 +272,7 @@ def _score(self, coef, phi, X, y, weight, link): sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) score = safe_sparse_dot(X.T, temp, dense_output=False) return score @@ -294,7 +297,7 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) return fisher_matrix @@ -329,7 +332,7 @@ def _observed_information(self, coef, phi, X, y, weight, link): v = self.unit_variance_derivative(mu)/self.unit_variance(mu) r = y - mu temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) temp = safe_sparse_dot(temp, X, dense_output=False) observed_information = safe_sparse_dot(X.T, temp, dense_output=False) return observed_information @@ -341,7 +344,7 @@ def _deviance_derivative(self, coef, X, y, weight, link): :math:`s` (derivative of the log-likelihood). """ score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight, - link=link) + link=link) return -2*score def _deviance_hessian(self, coef, X, y, weight, link): @@ -350,8 +353,8 @@ def _deviance_hessian(self, coef, X, y, weight, link): This is equivalent to :math:`+2\phi` times the observed information matrix. 
""" - info_matrix = self._observed_information(coef=coef, phi=1, - X=X, y=y, weight=weight, link=link) + info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, + weight=weight, link=link) return 2*info_matrix def starting_mu(self, y, weight=1): @@ -374,39 +377,39 @@ def __init__(self, power=0): self._upper_bound = np.Inf self._upper_compare = lambda x: np.less(x, self.upper_bound) if power < 0: - #Extreme Stable + # Extreme Stable self._lower_bound = -np.Inf self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power == 0: - #GaussianDistribution + # GaussianDistribution self._lower_bound = -np.Inf self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): - #Compound Poisson + # Compound Poisson self._lower_bound = 0 self._lower_compare = ( lambda x: np.greater_equal(x, self.lower_bound)) elif power == 2: - #GammaDistribution + # GammaDistribution self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif (power > 2) and (power < 3): - #Positive Stable + # Positive Stable self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power == 3: - #InverseGaussianDistribution + # InverseGaussianDistribution self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) elif power > 3: - #Positive Stable + # Positive Stable self._lower_bound = 0 self._lower_compare = lambda x: np.greater(x, self.lower_bound) @@ -418,7 +421,7 @@ def power(self): def power(self, power): if not isinstance(power, numbers.Real): raise TypeError('power must be a real number, input was {0}' - .format(power)) + .format(power)) self._power = power @property @@ -446,19 +449,19 @@ def unit_variance_derivative(self, mu): def unit_deviance(self, y, mu): p = self.power if p == 0: - #NormalDistribution + # NormalDistribution return (y-mu)**2 if p == 1: - #PoissonDistribution - return 2 * (np.where(y==0,0,y*np.log(y/mu))-y+mu) + # PoissonDistribution + return 2 * (np.where(y == 0, 0, y*np.log(y/mu))-y+mu) elif p == 2: - #GammaDistribution + # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) else: - #return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) - return 2 * (np.power(np.maximum(y,0), 2-p)/((1-p)*(2-p)) - - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - + y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) def likelihood(self, y, X, beta, phi, weight=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -469,23 +472,25 @@ class NormalDistribution(TweedieDistribution): def __init__(self): super(NormalDistribution, self).__init__(power=0) -GaussianDistribution = NormalDistribution class PoissonDistribution(TweedieDistribution): """Class for the scaled Poisson distribution""" def __init__(self): super(PoissonDistribution, self).__init__(power=1) + class GammaDistribution(TweedieDistribution): """Class for the Gamma distribution""" def __init__(self): super(GammaDistribution, self).__init__(power=2) + class InverseGaussianDistribution(TweedieDistribution): """Class for the scaled InverseGaussianDistribution distribution""" def __init__(self): super(InverseGaussianDistribution, self).__init__(power=3) + class GeneralizedHyperbolicSecand(ExponentialDispersionModel): """A class for the von Generalized Hyperbolic Secand (GHS) distribution. 
@@ -516,12 +521,11 @@ def unit_variance_derivative(self, mu): return 2*mu def unit_deviance(self, y, mu): - return (2*y*(np.arctan(y) - np.arctan(mu)) - + np.log((1+mu**2)/(1+y**2))) - + return (2*y*(np.arctan(y) - np.arctan(mu)) + + np.log((1+mu**2)/(1+y**2))) -class GeneralizedLinearModel(LinearModel, RegressorMixin): +class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """ Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). @@ -531,7 +535,7 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): - The target values y_i are realizations of random variables :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion parameter :math:`\phi` and weights :math:`w_i`. - - The expectation of :math:`Y_i` is :math:`mu_i=\mathrm{E}[Y]=h(\eta_i)` + - The expectation of :math:`Y_i` is :math:`\mu_i=\mathrm{E}[Y]=h(\eta_i)` whith the linear predictor :math:`\eta=X*\beta`, inverse link function :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` to be estimated. @@ -549,7 +553,9 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): TODO: Estimation of the dispersion parameter phi. - TODO: Notes on 'scaled' Poisson and weights + TODO: Notes on weights and 'scaled' Poisson, e.g. fit y = x/w with + with x=counts and w=exposure (time, money, persons, ...) => y is a + ratio with weights w. Parameters ---------- @@ -558,10 +564,12 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): to False, no intercept will be used in calculations (e.g. data is expected to be already centered). - family : ExponentialDispersionModel, optional, default NormalDistribution() - the distributional assumption of the GLM + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance + of a subclass of ExponentialDispersionModel, optional, default 'normal' + the distributional assumption of the GLM. - link : Link, optional, default IdentityLink() + link : {'identity', 'log'} or an instance of a subclass of Link, + optional, default IdentityLink() the link function (class) of the GLM fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' @@ -622,8 +630,8 @@ class GeneralizedLinearModel(LinearModel, RegressorMixin): """ def __init__(self, fit_intercept=True, family=NormalDistribution(), - link=IdentityLink(), fit_dispersion='chisqr', solver='irls', max_iter=100, - tol=1e-4, start_params=None, verbose=0): + link=IdentityLink(), fit_dispersion='chisqr', solver='irls', + max_iter=100, tol=1e-4, start_params=None, verbose=0): self.fit_intercept = fit_intercept self.family = family self.link = link @@ -635,8 +643,7 @@ def __init__(self, fit_intercept=True, family=NormalDistribution(), self.verbose = verbose def fit(self, X, y, weight=None): - """ - Fit a generalized linear model. + """Fit a generalized linear model. Parameters ---------- @@ -657,12 +664,32 @@ def fit(self, X, y, weight=None): self : returns an instance of self. 
""" if not isinstance(self.family, ExponentialDispersionModel): - raise ValueError("The argument family must be an instance of class" - "ExponentialDispersionModel.") + if self.family == 'normal': + self.family = NormalDistribution() + elif self.family == 'poisson': + self.family = PoissonDistribution() + elif self.family == 'gamma': + self.family = GammaDistribution() + elif self.family == 'inverse.gaussian': + self.family = InverseGaussianDistribution() + else: + raise ValueError( + "The argument family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") + if not isinstance(self.link, Link): + if self.link == 'identity': + self.link = IdentityLink() + if self.link == 'log': + self.link = LogLink() + else: + raise ValueError( + "The argument link must be an instance of class Link or" + " an element of ['identity', 'log'].") if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool," " got {0}".format(self.fit_intercept)) - if not self.solver in ['irls', 'lbfgs', 'newton-cg']: + if self.solver not in ['irls', 'lbfgs', 'newton-cg']: raise ValueError("GLM Regression supports only irls, lbfgs and" "newton-cg solvers, got {0}".format(self.solver)) if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: @@ -676,8 +703,9 @@ def fit(self, X, y, weight=None): start_params = np.atleast_1d(start_params) if start_params.shape[0] != X.shape[1] + self.fit_intercept: raise ValueError("Start values for parameters must have the" - "right length; required length {0}, got {1}".format( - X.shape[1] + self.fit_intercept, start_params.shape[0])) + "right length; required length {0}, got {1}" + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0])) X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], y_numeric=True, multi_output=False) @@ -685,7 +713,8 @@ def fit(self, X, y, weight=None): if not np.all(self.family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " - "range for family {0}".format(self.family.__class__.__name__)) + "range for family {0}" + .format(self.family.__class__.__name__)) if weight is None: weight = np.ones_like(y) @@ -698,96 +727,96 @@ def fit(self, X, y, weight=None): elif weight.shape[0] != y.shape[0]: raise ValueError("Weights must have the same length as y") - if self.fit_intercept: - #intercept is first column <=> coef[0] is for intecept + # intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0],1]), X]) + Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: - Xnew = np.concatenate((np.ones((X.shape[0],1)), X), axis=1) + Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) else: Xnew = X n_samples, n_features = Xnew.shape - #Note: Since phi does not enter the estimation of mu_i=E[y_i] - # set it to 1 where convenient. + # Note: Since dispersion_ alias phi does not enter the estimation + # of mu_i=E[y_i] set it to 1 where convenient. 
- #set start values for coef + # set start values for coef coef = None if start_params is None: - #Use mu_start and apply one irls step to calculate coef + # Use mu_start and apply one irls step to calculate coef mu = self.family.starting_mu(y, weight) - #linear predictor + # linear predictor eta = self.link.link(mu) - #h'(eta) + # h'(eta) hp = self.link.inverse_derivative(eta) - #working weights w, in principle a diagonal matrix - #therefore here just as 1d array + # working weights w, in principle a diagonal matrix + # therefore here just as 1d array w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) - #working observations + # working observations yw = eta + (y-mu)/hp - #least squares rescaled with wroot + # least squares rescaled with wroot wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) coef = linalg.lstsq(X_rescale, yw_rescale)[0] elif start_params is 'ols': - reg = LinearRegression(copy_X=False, - fit_intercept=False) + reg = LinearRegression(copy_X=False, fit_intercept=False) reg.fit(Xnew, self.link.link(y)) coef = reg.coef_ else: coef = start_params - #algorithms for optimiation - #TODO: Parallelize it + # algorithms for optimiation + # TODO: Parallelize it self.n_iter_ = 0 converged = False if self.solver == 'irls': - #linear predictor + # linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = self.link.inverse(eta) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 - #coef_old not used so far. - #coef_old = coef - #h'(eta) + # coef_old not used so far. + # coef_old = coef + # h'(eta) hp = self.link.inverse_derivative(eta) - #working weights w, in principle a diagonal matrix - #therefore here just as 1d array + # working weights w, in principle a diagonal matrix + # therefore here just as 1d array w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) - #working observations + # working observations yw = eta + (y-mu)/hp - #least squares rescaled with wroot + # least squares rescaled with wroot wroot = sparse.dia_matrix((wroot, 0), - shape=(n_samples, n_samples)) + shape=(n_samples, n_samples)) X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) - coef, residues, rank, singular_ = ( + coef, residues, rank, singular_ = ( linalg.lstsq(X_rescale, yw_rescale)) - #updated linear predictor - #do it here for updated values for tolerance + # updated linear predictor + # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = self.link.inverse(eta) - #which tolerace? |coef - coef_old| or gradient? - #use gradient for compliance with newton-cg and lbfgs - #TODO: faster computation of gradient, use mu and eta directly - gradient = self.family._deviance_derivative(coef=coef, - X=Xnew, y=y, weight=weight, link=self.link) + # which tolerace? |coef - coef_old| or gradient? + # use gradient for compliance with newton-cg and lbfgs + # TODO: faster computation of gradient, use mu and eta directly + gradient = self.family._deviance_derivative( + coef=coef, X=Xnew, y=y, weight=weight, link=self.link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break if not converged: warnings.warn("irls failed to converge. 
Increase the number " - "of iterations (currently {0})".format(self.max_iter)) + "of iterations (currently {0})" + .format(self.max_iter), ConvergenceWarning) - #TODO: performance: make one function return both deviance and gradient + # TODO: performance: make one function return both deviance and + # gradient of deviance elif self.solver == 'lbfgs': func = self.family._deviance fprime = self.family._deviance_derivative @@ -800,7 +829,8 @@ def fit(self, X, y, weight=None): if self.verbose > 0: if info["warnflag"] == 1: warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.") + " Increase the number of iterations.", + ConvergenceWarning) elif info["warnflag"] == 2: warnings.warn("lbfgs failed for the reason: {0}".format( info["task"])) @@ -808,11 +838,13 @@ def fit(self, X, y, weight=None): elif self.solver == 'newton-cg': func = self.family._deviance grad = self.family._deviance_derivative + def grad_hess(coef, X, y, weight, link): - grad = (self.family - ._deviance_derivative(coef, X, y, weight, link)) - hessian = (self.family - ._deviance_hessian(coef, X, y, weight,link)) + grad = (self.family._deviance_derivative( + coef, X, y, weight, link)) + hessian = (self.family._deviance_hessian( + coef, X, y, weight, link)) + def Hs(s): ret = np.dot(hessian, s) return ret @@ -820,7 +852,7 @@ def Hs(s): hess = grad_hess args = (Xnew, y, weight, self.link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, - maxiter=self.max_iter, tol=self.tol) + maxiter=self.max_iter, tol=self.tol) self.coef_ = coef if self.fit_intercept is True: @@ -835,6 +867,9 @@ def Hs(s): return self def predict(self, X, weight=1): + """Prediction with features X. + If weights are given, returns prediction*weights. + """ check_is_fitted(self, "coef_") eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: @@ -860,13 +895,13 @@ def score(self, X, y, weight=1): Returns the weight averaged negitive deviance (the better the score, the better the fit). Maximum score is therefore 0. """ - #RegressorMixin has R^2 score. - #TODO: Make it more compatible with the score function in + # RegressorMixin has R^2 score. 
+ # TODO: Make it more compatible with the score function in # sklearn.metrics.regression.py eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ mu = self.link.inverse(eta) - output_errors = self.family.unit_deviance(y,mu) + output_errors = self.family.unit_deviance(y, mu) weight = weight * np.ones_like(y) return np.average(output_errors, weights=weight) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 2a88a7ec899d9..a4d4ea8650860 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,73 +1,82 @@ import numpy as np -from sklearn.linear_model.glm import (Link, IdentityLink, LogLink, - ExponentialDispersionModel, TweedieDistribution, - NormalDistribution, GaussianDistribution, - PoissonDistribution, GammaDistribution, - InverseGaussianDistribution, GeneralizedHyperbolicSecand, - GeneralizedLinearModel) +from sklearn.linear_model.glm import ( + # Link, IdentityLink, + LogLink, + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, + # GeneralizedHyperbolicSecand, + GeneralizedLinearRegressor) + +from sklearn.utils.testing import ( + # assert_equal, + assert_array_equal, assert_array_almost_equal) -from sklearn.utils.testing import (assert_equal, assert_array_equal, - assert_array_almost_equal) def test_family_bounds(): """Test the valid range of distributions """ family = NormalDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [True, True, True]) family = PoissonDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, True, True]) family = TweedieDistribution(power=1.5) - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, True, True]) family = GammaDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) family = InverseGaussianDistribution() - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) family = TweedieDistribution(power=4.5) - result = family.in_y_range([-1,0,1]) + result = family.in_y_range([-1, 0, 1]) assert_array_equal(result, [False, False, True]) + def test_glm_identiy_regression(): """Test linear regression on a simple dataset """ - coef = [1,2] - X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + coef = [1, 2] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) + families = ( + NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) for solver in ['irls', 'lbfgs', 'newton-cg']: - for family in (GaussianDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)): - glm = GeneralizedLinearModel(family=family, - fit_intercept=False, solver=solver) + for family in families: + glm = GeneralizedLinearRegressor( + family=family, fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) + def test_glm_log_regression(): """Test linear regression on a simple dataset """ - coef = [1,2] - X = np.array([[1,1,1,1,1],[0,1,2,3,4]]).T + coef = [1, 2] + X = 
np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) - #for solver in ['irls', 'lbfgs', 'newton-cg']: - for solver in ['irls']: - #for family in [GaussianDistribution(), PoissonDistribution(), - # GammaDistribution(), InverseGaussianDistribution(), - # TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]: - for family in [GaussianDistribution()]: - glm = GeneralizedLinearModel(family=family, - link=LogLink(), - fit_intercept=False, solver=solver, start_params='ols') + families = ( + NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + for solver in ['irls', 'lbfgs', 'newton-cg']: + for family in families: + glm = GeneralizedLinearRegressor( + family=family, link=LogLink(), fit_intercept=False, + solver=solver, start_params='ols') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) -#TODO: Test compatibility with R's glm, glmnet + +# TODO: Test compatibility with R's glm, glmnet From a6137d85401ef72976327c211f44d721d9f81e00 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 19 Jul 2017 17:41:49 +0200 Subject: [PATCH 003/269] [WIP] Add Generalized Linear Models (#9405) * GeneralizedLinearRegressor added to doc/modules/classes.rst --- doc/modules/classes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index e09ca0422d8a7..2d451b6758eb1 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -720,6 +720,7 @@ Kernels: linear_model.BayesianRidge linear_model.ElasticNet linear_model.ElasticNetCV + linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars linear_model.LarsCV From b0be167080588a35dc1f4b762d961edb897b1019 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 9 Aug 2017 13:38:49 +0200 Subject: [PATCH 004/269] [WIP] Add Generalized Linear Models (#9405) * fixed bug: init parameter max_iter * fix API for family and link: default parameter changed to string non public variables self._family_instance and self._link_instance * fixed bug in score, minus sign forgotten * added check_is_fitted to estimate_phi and score * added check_array(X) in predict * replaced lambda functions in TweedieDistribution * some documentation --- sklearn/linear_model/glm.py | 196 ++++++++++++++++++++++-------------- 1 file changed, 122 insertions(+), 74 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index cf91a64fafc12..0ee1564049329 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -15,9 +15,10 @@ # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. -# So far, it is GeneralizedLinearModel, since it could very easily -# extended by Bernoulli/Binomial distribution. -# Solution: GeneralizedLinearRegressor +# Estimators in sklearn are either regressors or classifiers. A Generalized +# Linear Model does both depending on the chosen distribution, e.g. Normal => +# regressor, Bernoulli/Binomial => classifier. +# Solution: GeneralizedLinearRegressor since this is the focus. # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. 
@@ -33,7 +34,7 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six -from ..utils import check_X_y +from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted @@ -372,46 +373,67 @@ class TweedieDistribution(ExponentialDispersionModel): The variance power of the unit_variance :math:`v(mu) = mu^{power}`. """ + def _less_upper_bound(self, x): + return np.less(x, self.upper_bound) + + def _less_equal_upper_bound(self, x): + return np.less_equal(x, self.upper_bound) + + def _greater_lower_bound(self, x): + return np.greater(x, self.lower_bound) + + def _greater_equal_lower_bound(self, x): + return np.greater_equal(x, self.lower_bound) + def __init__(self, power=0): self.power = power self._upper_bound = np.Inf - self._upper_compare = lambda x: np.less(x, self.upper_bound) + # self._upper_compare = lambda x: np.less(x, self.upper_bound) + self._upper_compare = self._less_upper_bound if power < 0: # Extreme Stable self._lower_bound = -np.Inf - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power == 0: - # GaussianDistribution + # NormalDistribution self._lower_bound = -np.Inf - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): # Compound Poisson self._lower_bound = 0 - self._lower_compare = ( - lambda x: np.greater_equal(x, self.lower_bound)) + # self._lower_compare = ( + # lambda x: np.greater_equal(x, self.lower_bound)) + self._lower_compare = self._greater_equal_lower_bound elif power == 2: # GammaDistribution self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif (power > 2) and (power < 3): # Positive Stable self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power == 3: # InverseGaussianDistribution self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound elif power > 3: # Positive Stable self._lower_bound = 0 - self._lower_compare = lambda x: np.greater(x, self.lower_bound) + # self._lower_compare = lambda x: np.greater(x, self.lower_bound) + self._lower_compare = self._greater_lower_bound @property def power(self): @@ -530,6 +552,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). + #TODO: This belongs to User Guide Assumptions: - The target values y_i are realizations of random variables @@ -559,25 +582,26 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Parameters ---------- - fit_intercept : boolean, optional, default True - whether to calculate the intercept for this model. If set - to False, no intercept will be used in calculations - (e.g. 
data is expected to be already centered). + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance - of a subclass of ExponentialDispersionModel, optional, default 'normal' + of a subclass of ExponentialDispersionModel, optional + (default='normal') the distributional assumption of the GLM. link : {'identity', 'log'} or an instance of a subclass of Link, - optional, default IdentityLink() - the link function (class) of the GLM + optional (default='identity') + the link function of the GLM, i.e. mapping from linear predictor + (X*coef) to expectation (mu). - fit_dispersion : {None, 'chisqr', 'deviance'}, defaul 'chisqr' + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul='chisqr') method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'newton-cg', 'lbfgs'}, defaul 'irls' + solver : {'irls', 'newton-cg', 'lbfgs'}, optional (defaul='irls') Algorithm to use in the optimization problem. - 'irls' is iterated reweighted least squares. It is the standard @@ -585,16 +609,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'newton-cg', 'lbfgs' - max_iter : int, default 100 + max_iter : int, optional (default=100) TODO - tol : float + tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative of the deviance). - start_params : {array shape (n_features, ), 'ols'}, default None + start_params : {array shape (n_features, ), 'ols'}, optional (default=None) sets the start values for coef_ in the fit. If None, default values are taken. If 'ols' the result of an ordinary least squares in the link space @@ -603,9 +627,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If fit_intercept is true, the first value is assumed to be the start value for the intercept_. - verbose : int, default: 0 - For the lbfgs solver set verbose to any positive - number for verbosity. + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. Attributes ---------- @@ -629,15 +652,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO """ - def __init__(self, fit_intercept=True, family=NormalDistribution(), - link=IdentityLink(), fit_dispersion='chisqr', solver='irls', + def __init__(self, fit_intercept=True, family='normal', + link='identity', fit_dispersion='chisqr', solver='irls', max_iter=100, tol=1e-4, start_params=None, verbose=0): self.fit_intercept = fit_intercept self.family = family self.link = link self.fit_dispersion = fit_dispersion self.solver = solver - self.max_iter = 100 + self.max_iter = max_iter self.tol = tol self.start_params = start_params self.verbose = verbose @@ -663,29 +686,38 @@ def fit(self, X, y, weight=None): ------- self : returns an instance of self. 
""" - if not isinstance(self.family, ExponentialDispersionModel): + # Garantee that self._family_instance is an instance of class + # ExponentialDispersionModel + if isinstance(self.family, ExponentialDispersionModel): + self._family_instance = self.family + else: if self.family == 'normal': - self.family = NormalDistribution() + self._family_instance = NormalDistribution() elif self.family == 'poisson': - self.family = PoissonDistribution() + self._family_instance = PoissonDistribution() elif self.family == 'gamma': - self.family = GammaDistribution() + self._family_instance = GammaDistribution() elif self.family == 'inverse.gaussian': - self.family = InverseGaussianDistribution() + self._family_instance = InverseGaussianDistribution() else: raise ValueError( - "The argument family must be an instance of class" + "The family must be an instance of class" " ExponentialDispersionModel or an element of" " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") - if not isinstance(self.link, Link): + + # Garantee that self._link_instance is set to an instance of class Link + if isinstance(self.link, Link): + self._link_instance = self.link + else: if self.link == 'identity': - self.link = IdentityLink() - if self.link == 'log': - self.link = LogLink() + self._link_instance = IdentityLink() + elif self.link == 'log': + self._link_instance = LogLink() else: raise ValueError( - "The argument link must be an instance of class Link or" + "The link must be an instance of class Link or" " an element of ['identity', 'log'].") + if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool," " got {0}".format(self.fit_intercept)) @@ -711,10 +743,13 @@ def fit(self, X, y, weight=None): y_numeric=True, multi_output=False) y = y.astype(np.float64) - if not np.all(self.family.in_y_range(y)): + family = self._family_instance + link = self._link_instance + + if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" - .format(self.family.__class__.__name__)) + .format(family.__class__.__name__)) if weight is None: weight = np.ones_like(y) @@ -745,14 +780,14 @@ def fit(self, X, y, weight=None): coef = None if start_params is None: # Use mu_start and apply one irls step to calculate coef - mu = self.family.starting_mu(y, weight) + mu = family.starting_mu(y, weight) # linear predictor - eta = self.link.link(mu) + eta = link.link(mu) # h'(eta) - hp = self.link.inverse_derivative(eta) + hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -763,7 +798,7 @@ def fit(self, X, y, weight=None): coef = linalg.lstsq(X_rescale, yw_rescale)[0] elif start_params is 'ols': reg = LinearRegression(copy_X=False, fit_intercept=False) - reg.fit(Xnew, self.link.link(y)) + reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: coef = start_params @@ -775,16 +810,16 @@ def fit(self, X, y, weight=None): if self.solver == 'irls': # linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = self.link.inverse(eta) + mu = link.inverse(eta) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 # coef_old not used so far. 
# coef_old = coef # h'(eta) - hp = self.link.inverse_derivative(eta) + hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / self.family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weight=weight)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -799,13 +834,13 @@ def fit(self, X, y, weight=None): # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = self.link.inverse(eta) + mu = link.inverse(eta) # which tolerace? |coef - coef_old| or gradient? # use gradient for compliance with newton-cg and lbfgs # TODO: faster computation of gradient, use mu and eta directly - gradient = self.family._deviance_derivative( - coef=coef, X=Xnew, y=y, weight=weight, link=self.link) + gradient = family._deviance_derivative( + coef=coef, X=Xnew, y=y, weight=weight, link=link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -818,9 +853,9 @@ def fit(self, X, y, weight=None): # TODO: performance: make one function return both deviance and # gradient of deviance elif self.solver == 'lbfgs': - func = self.family._deviance - fprime = self.family._deviance_derivative - args = (Xnew, y, weight, self.link) + func = family._deviance + fprime = family._deviance_derivative + args = (Xnew, y, weight, link) coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, @@ -836,13 +871,13 @@ def fit(self, X, y, weight=None): info["task"])) self.n_iter_ = info['nit'] elif self.solver == 'newton-cg': - func = self.family._deviance - grad = self.family._deviance_derivative + func = family._deviance + grad = family._deviance_derivative def grad_hess(coef, X, y, weight, link): - grad = (self.family._deviance_derivative( + grad = (family._deviance_derivative( coef, X, y, weight, link)) - hessian = (self.family._deviance_hessian( + hessian = (family._deviance_hessian( coef, X, y, weight, link)) def Hs(s): @@ -850,7 +885,7 @@ def Hs(s): return ret return grad, Hs hess = grad_hess - args = (Xnew, y, weight, self.link) + args = (Xnew, y, weight, link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) self.coef_ = coef @@ -871,37 +906,50 @@ def predict(self, X, weight=1): If weights are given, returns prediction*weights. """ check_is_fitted(self, "coef_") + X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) + # TODO: validation of weight eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) + mu = self._link_instance.inverse(eta) return mu*weight def estimate_phi(self, y, X, weight): + """Estimation of the dispersion parameter. + Returns the estimate. 
+ """ + check_is_fitted(self, "coef_") n_samples, n_features = X.shape eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) + mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(weight*(y-mu)**2/self.family.unit_variance(mu)) + chisq = np.sum(weight*(y-mu)**2 / + self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self.family.deviance(y, mu, weight) + dev = self._family_instance.deviance(y, mu, weight) return dev/(n_samples - n_features) +# TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" +# in check_estimator for score +# from sklearn.utils.estimator_checks import check_estimator +# from sklearn.linear_model import GeneralizedLinearRegressor +# check_estimator(GeneralizedLinearRegressor) def score(self, X, y, weight=1): """The natural score for a GLM is -deviance. - Returns the weight averaged negitive deviance (the better the score, + Returns the weight averaged negative deviance (the better the score, the better the fit). Maximum score is therefore 0. """ # RegressorMixin has R^2 score. # TODO: Make it more compatible with the score function in # sklearn.metrics.regression.py + check_is_fitted(self, "coef_") eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ - mu = self.link.inverse(eta) - output_errors = self.family.unit_deviance(y, mu) + mu = self._link_instance.inverse(eta) + output_errors = self._family_instance.unit_deviance(y, mu) weight = weight * np.ones_like(y) - return np.average(output_errors, weights=weight) + return -np.average(output_errors, weights=weight) From 85c52ec9c6adb3b1f75650cfa7fe0b770393d24e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 13 Aug 2017 01:46:16 +0200 Subject: [PATCH 005/269] [WIP] Add Generalized Linear Models (#9405) * make raw docstrings where appropriate * make ExponentialDispersionModel (i.e. TweedieDistribution) pickable: ExponentialDispersionModel has new properties include_lower_bound, method in_y_range is not abstract anymore. * set self.intercept_=0 if fit_intercept=False, such that it is always defined. * set score to D2, a generalized R2 with deviance instead of squared error, as does glmnet. This also solves issues with check_regressors_train(GeneralizedLinearRegressor), which assumes R2 score. * change of names: weight to weights in ExponentialDispersionModel and to sample_weight in GeneralizedLinearRegressor * add class method linear_predictor --- sklearn/linear_model/glm.py | 330 ++++++++++++++++++++++-------------- 1 file changed, 199 insertions(+), 131 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 0ee1564049329..b80842f817f4d 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -5,13 +5,16 @@ # Author: Christian Lorentzen # License: BSD 3 clause +# TODO: Write more tests # TODO: Which name/symbol for coefficients and weights in docu? # sklearn.linear_models uses w for coefficients. 
-# So far, coefficients=beta and weight=w (as standard literature) -# TODO: Add l2-penalty +# So far, coefficients=beta and weights=w (as standard literature) +# TODO: Add l2-penalty (maybe more general w.P.w with P penalty matrix) # TODO: Add l1-penalty (elastic net) # TODO: Add cross validation # TODO: Write docu and examples +# TODO: Make it as much consistent to other estimators in linear_model as +# possible # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -118,7 +121,7 @@ def inverse_derivative2(self, lin_pred): class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): - """Base class for reproductive Exponential Dispersion Models (EDM). + r"""Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -136,6 +139,8 @@ class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): ---------- lower_bound upper_bound + include_lower_bound + include_upper_bound Methods ------- @@ -174,15 +179,39 @@ def upper_bound(self): """ raise NotImplementedError() - @abstractmethod + @abstractproperty + def include_lower_bound(self): + """If True, values of y may equal lower bound: y >= lower_bound. + """ + raise NotImplementedError() + + @abstractproperty + def include_upper_bound(self): + """If True, values of y may equal upper bound: y <= upper_bound. + """ + raise NotImplementedError() + def in_y_range(self, x): """Returns true if x is in the valid range of Y~EDM. """ - raise NotImplementedError() + if self.include_lower_bound: + if self.include_upper_bound: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less(x, self.upper_bound)) + else: + if self.include_upper_bound: + return np.logical_and(np.greater(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater(x, self.lower_bound), + np.less(x, self.upper_bound)) @abstractmethod def unit_variance(self, mu): - """The unit variance :math:`v(mu)` determines the variance as + r"""The unit variance :math:`v(mu)` determines the variance as a function of the mean mu by :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. It can also be derived from the unit deviance :math:`d(y,\mu)` as @@ -194,27 +223,27 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - """The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + r"""The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. """ raise NotImplementedError() - def variance(self, mu, phi=1, weight=1): - """The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is + def variance(self, mu, phi=1, weights=1): + r"""The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, with unit variance v(mu). """ - return phi/weight * self.unit_variance(mu) + return phi/weights * self.unit_variance(mu) - def variance_derivative(self, mu, phi=1, weight=1): - """The derivative of the variance w.r.t. mu, + def variance_derivative(self, mu, phi=1, weights=1): + r"""The derivative of the variance w.r.t. mu, :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/w_i*v'(\mu_i)`, with unit variance v(mu). """ - return phi/weight * self.unit_variance_derivative(mu) + return phi/weights * self.unit_variance_derivative(mu) @abstractmethod def unit_deviance(self, y, mu): - """The unit_deviance :math:`d(y,\mu)`. + r"""The unit_deviance :math:`d(y,\mu)`. 
In terms of the log-likelihood it is given by :math:`d(y,\mu) = -2\phi\cdot \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` @@ -222,7 +251,7 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - """The derivative w.r.t. mu of the unit_deviance + r"""The derivative w.r.t. mu of the unit_deviance :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. @@ -232,30 +261,30 @@ def unit_deviance_derivative(self, y, mu): """ return -2*(y-mu)/self.unit_variance(mu) - def deviance(self, y, mu, weight=1): - """The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) - with weight :math:`w_i` and unit_deviance :math:`d(y,mu)`. + def deviance(self, y, mu, weights=1): + r"""The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) + with weights :math:`w_i` and unit_deviance :math:`d(y,mu)`. In terms of the likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{w}) - loglike(y,y,\frac{phi}{w})\right).` """ - return np.sum(weight*self.unit_deviance(y, mu)) + return np.sum(weights*self.unit_deviance(y, mu)) - def _deviance(self, coef, X, y, weight, link): + def _deviance(self, coef, X, y, weights, link): """The deviance as a function of the coefficients ``coef`` (:math:`beta`). """ lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - return self.deviance(y, mu, weight) + return self.deviance(y, mu, weights) - def deviance_derivative(self, y, mu, weight=1): + def deviance_derivative(self, y, mu, weights=1): """The derivative w.r.t. mu of the deviance.` """ - return weight*self.unit_deviance_derivative(y, mu) + return weights*self.unit_deviance_derivative(y, mu) - def _score(self, coef, phi, X, y, weight, link): - """The score function :math:`s` is the derivative of the + def _score(self, coef, phi, X, y, weights, link): + r"""The score function :math:`s` is the derivative of the log-likelihood w.r.t. the ``coef`` (:math:`\beta`). It is given by @@ -270,7 +299,7 @@ def _score(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), shape=(n_samples, n_samples)) @@ -278,8 +307,8 @@ def _score(self, coef, phi, X, y, weight, link): score = safe_sparse_dot(X.T, temp, dense_output=False) return score - def _fisher_matrix(self, coef, phi, X, y, weight, link): - """The Fisher information matrix, also known as expected + def _fisher_matrix(self, coef, phi, X, y, weights, link): + r"""The Fisher information matrix, also known as expected information matrix. It is given by .. 
math: @@ -295,7 +324,7 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), shape=(n_samples, n_samples)) @@ -303,8 +332,8 @@ def _fisher_matrix(self, coef, phi, X, y, weight, link): fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) return fisher_matrix - def _observed_information(self, coef, phi, X, y, weight, link): - """The observed information matrix, also known as the negative of + def _observed_information(self, coef, phi, X, y, weights, link): + r"""The observed information matrix, also known as the negative of the Hessian matrix of the log-likelihood. It is given by .. math: @@ -327,7 +356,7 @@ def _observed_information(self, coef, phi, X, y, weight, link): n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weight=weight) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) d2 = link.inverse_derivative(lin_pred)**2 v = self.unit_variance_derivative(mu)/self.unit_variance(mu) @@ -338,33 +367,34 @@ def _observed_information(self, coef, phi, X, y, weight, link): observed_information = safe_sparse_dot(X.T, temp, dense_output=False) return observed_information - def _deviance_derivative(self, coef, X, y, weight, link): - """The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a + def _deviance_derivative(self, coef, X, y, weights, link): + r"""The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a function of the coefficients ``coef``. This is equivalent to :math:`-2\phi` times the score function :math:`s` (derivative of the log-likelihood). """ - score = self._score(coef=coef, phi=1, X=X, y=y, weight=weight, + score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) return -2*score - def _deviance_hessian(self, coef, X, y, weight, link): - """The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance + def _deviance_hessian(self, coef, X, y, weights, link): + r"""The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance as a function of the coefficients ``coef``. This is equivalent to :math:`+2\phi` times the observed information matrix. """ info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, - weight=weight, link=link) + weights=weights, link=link) return 2*info_matrix - def starting_mu(self, y, weight=1): + def starting_mu(self, y, weights=1): """Starting values for the mean mu_i in IRLS.""" - return (weight*y+np.mean(weight*y))/(2.*np.sum(np.ones_like(y)*weight)) + return ((weights*y+np.mean(weights*y)) + / (2.*np.sum(np.ones_like(y)*weights))) class TweedieDistribution(ExponentialDispersionModel): - """A class for the Tweedie distribution. + r"""A class for the Tweedie distribution. They have mu=E[X] and Var[X] \propto mu**power. Attributes @@ -373,67 +403,44 @@ class TweedieDistribution(ExponentialDispersionModel): The variance power of the unit_variance :math:`v(mu) = mu^{power}`. 
""" - def _less_upper_bound(self, x): - return np.less(x, self.upper_bound) - - def _less_equal_upper_bound(self, x): - return np.less_equal(x, self.upper_bound) - - def _greater_lower_bound(self, x): - return np.greater(x, self.lower_bound) - - def _greater_equal_lower_bound(self, x): - return np.greater_equal(x, self.lower_bound) - def __init__(self, power=0): self.power = power self._upper_bound = np.Inf - # self._upper_compare = lambda x: np.less(x, self.upper_bound) - self._upper_compare = self._less_upper_bound + self._include_upper_bound = False if power < 0: # Extreme Stable self._lower_bound = -np.Inf - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power == 0: # NormalDistribution self._lower_bound = -np.Inf - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif (power > 0) and (power < 1): raise ValueError('For 0 1) and (power < 2): # Compound Poisson self._lower_bound = 0 - # self._lower_compare = ( - # lambda x: np.greater_equal(x, self.lower_bound)) - self._lower_compare = self._greater_equal_lower_bound + self._include_lower_bound = True elif power == 2: # GammaDistribution self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif (power > 2) and (power < 3): # Positive Stable self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power == 3: # InverseGaussianDistribution self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False elif power > 3: # Positive Stable self._lower_bound = 0 - # self._lower_compare = lambda x: np.greater(x, self.lower_bound) - self._lower_compare = self._greater_lower_bound + self._include_lower_bound = False @property def power(self): @@ -454,8 +461,13 @@ def lower_bound(self): def upper_bound(self): return self._upper_bound - def in_y_range(self, x): - return np.logical_and(self._lower_compare(x), self._upper_compare(x)) + @property + def include_lower_bound(self): + return self._include_lower_bound + + @property + def include_upper_bound(self): + return self._include_upper_bound def unit_variance(self, mu): """The unit variance of a Tweedie distribution is v(mu)=mu**power. 
@@ -485,7 +497,7 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, beta, phi, weight=1): + def likelihood(self, y, X, beta, phi, weights=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -521,6 +533,8 @@ class GeneralizedHyperbolicSecand(ExponentialDispersionModel): def __init__(self): self._lower_bound = -np.Inf self._upper_bound = np.Inf + self._include_lower_bound = False + self._include_upper_bound = False @property def lower_bound(self): @@ -530,11 +544,13 @@ def lower_bound(self): def upper_bound(self): return self._upper_bound - def in_y_range(self, x): - np.logical_and( - np.greater(x, self.lower_bound), - np.less(x, self.lower_bound) - ) + @property + def include_lower_bound(self): + return self._include_lower_bound + + @property + def include_upper_bound(self): + return self._include_upper_bound def unit_variance(self, mu): return 1 + mu**2 @@ -548,7 +564,7 @@ def unit_deviance(self, y, mu): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - """ + r""" Class to fit a Generalized Linear Model (GLM) based on reproductive Exponential Dispersion Models (EDM). @@ -665,7 +681,7 @@ def __init__(self, fit_intercept=True, family='normal', self.start_params = start_params self.verbose = verbose - def fit(self, X, y, weight=None): + def fit(self, X, y, sample_weight=None): """Fit a generalized linear model. Parameters @@ -676,11 +692,12 @@ def fit(self, X, y, weight=None): y : numpy array of shape [n_samples] Target values - weight : numpy array of shape [n_samples] + sample_weight : numpy array of shape [n_samples] Individual weights for each sample. Var[Y_i]=phi/weight_i * v(mu) If Y_i ~ EDM(mu, phi/w_i) then - sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)) + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + weighted average with weights=sample_weight. 
Returns ------- @@ -751,16 +768,17 @@ def fit(self, X, y, weight=None): "range for family {0}" .format(family.__class__.__name__)) - if weight is None: - weight = np.ones_like(y) - elif np.isscalar(weight): - weight = weight*np.ones_like(y) + if sample_weight is None: + weights = np.ones_like(y) + elif np.isscalar(sample_weight): + weights = sample_weight*np.ones_like(y) else: - weight = np.atleast_1d(weight) - if weight.ndim > 1: - raise ValueError("Weights must be 1D array or scalar") - elif weight.shape[0] != y.shape[0]: - raise ValueError("Weights must have the same length as y") + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != y.shape[0]: + raise ValueError("Sample weights must have the same length as" + " y") if self.fit_intercept: # intercept is first column <=> coef[0] is for intecept @@ -780,14 +798,14 @@ def fit(self, X, y, weight=None): coef = None if start_params is None: # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weight) + mu = family.starting_mu(y, weights) # linear predictor eta = link.link(mu) # h'(eta) hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weights=weights)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -819,7 +837,7 @@ def fit(self, X, y, weight=None): hp = link.inverse_derivative(eta) # working weights w, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weight=weight)) + w = (hp**2 / family.variance(mu, phi=1, weights=weights)) wroot = np.sqrt(w) # working observations yw = eta + (y-mu)/hp @@ -840,7 +858,7 @@ def fit(self, X, y, weight=None): # use gradient for compliance with newton-cg and lbfgs # TODO: faster computation of gradient, use mu and eta directly gradient = family._deviance_derivative( - coef=coef, X=Xnew, y=y, weight=weight, link=link) + coef=coef, X=Xnew, y=y, weights=weights, link=link) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -855,7 +873,7 @@ def fit(self, X, y, weight=None): elif self.solver == 'lbfgs': func = family._deviance fprime = family._deviance_derivative - args = (Xnew, y, weight, link) + args = (Xnew, y, weights, link) coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, @@ -874,47 +892,72 @@ def fit(self, X, y, weight=None): func = family._deviance grad = family._deviance_derivative - def grad_hess(coef, X, y, weight, link): + def grad_hess(coef, X, y, weights, link): grad = (family._deviance_derivative( - coef, X, y, weight, link)) + coef, X, y, weights, link)) hessian = (family._deviance_hessian( - coef, X, y, weight, link)) + coef, X, y, weights, link)) def Hs(s): ret = np.dot(hessian, s) return ret return grad, Hs hess = grad_hess - args = (Xnew, y, weight, link) + args = (Xnew, y, weights, link) coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) self.coef_ = coef - if self.fit_intercept is True: + if self.fit_intercept: self.intercept_ = coef[0] self.coef_ = coef[1:] else: + self.intercept_ = 0. 
self.coef_ = coef if self.fit_dispersion in ['chisqr', 'deviance']: - self.dispersion_ = self.estimate_phi(y, X, weight) + self.dispersion_ = self.estimate_phi(y, X, weights) return self - def predict(self, X, weight=1): - """Prediction with features X. - If weights are given, returns prediction*weights. + def linear_predictor(self, X): + """The linear_predictor X*coef_ + intercept_. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Samples. + + Returns + ------- + C : array, shape = (n_samples) + Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) - # TODO: validation of weight - eta = safe_sparse_dot(X, self.coef_, dense_output=True) - if self.fit_intercept is True: - eta += self.intercept_ + return safe_sparse_dot(X, self.coef_, + dense_output=True) + self.intercept_ + + def predict(self, X, sample_weight=1): + """Predict uing GLM with feature matrix X. + If sample_weight is given, returns prediction*sample_weight. + + Parameters + ---------- + X : numpy array or sparse matrix of shape [n_samples,n_features] + Samples. + + Returns + ------- + C : array, shape = (n_samples) + Returns predicted values times sample_weight. + """ + # TODO: validation of sample_weight + eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) - return mu*weight + return mu*sample_weight - def estimate_phi(self, y, X, weight): + def estimate_phi(self, y, X, sample_weight): """Estimation of the dispersion parameter. Returns the estimate. """ @@ -925,11 +968,11 @@ def estimate_phi(self, y, X, weight): eta += self.intercept_ mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(weight*(y-mu)**2 / + chisq = np.sum(sample_weight*(y-mu)**2 / self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self._family_instance.deviance(y, mu, weight) + dev = self._family_instance.deviance(y, mu, sample_weight) return dev/(n_samples - n_features) # TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" @@ -937,19 +980,44 @@ def estimate_phi(self, y, X, weight): # from sklearn.utils.estimator_checks import check_estimator # from sklearn.linear_model import GeneralizedLinearRegressor # check_estimator(GeneralizedLinearRegressor) - def score(self, X, y, weight=1): - """The natural score for a GLM is -deviance. - Returns the weight averaged negative deviance (the better the score, - the better the fit). Maximum score is therefore 0. + def score(self, X, y, sample_weight=None): + r"""Returns D^2, a generalization of the coefficient of determination + R^2, which uses deviance instead of squared error. + + D^2 is defined as + :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` + is the null deviance, i.e. the deviance of a model with intercept + alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean + :math:`\bar{y}` is average by sample_weight. In the case of a Normal + distribution, this D^2 equals R^2. + Best possible score is 1.0 and it can be negative (because the + model can be arbitrarily worse). + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Test samples + + y : array-like of shape = (n_samples) + True valeus for X. + + sample_weight : array-like, shape = (n_samples), optional + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) wrt. y. """ - # RegressorMixin has R^2 score. 
- # TODO: Make it more compatible with the score function in - # sklearn.metrics.regression.py - check_is_fitted(self, "coef_") - eta = safe_sparse_dot(X, self.coef_, dense_output=True) - if self.fit_intercept is True: - eta += self.intercept_ - mu = self._link_instance.inverse(eta) - output_errors = self._family_instance.unit_deviance(y, mu) - weight = weight * np.ones_like(y) - return -np.average(output_errors, weights=weight) + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + if sample_weight is None: + weights = np.ones_like(y) + else: + weights = np.atleast_1d(sample_weight) + mu = self.predict(X) + dev = self._family_instance.deviance(y, mu, weights=weights) + y_mean = np.average(y, weights=weights) + dev_null = self._family_instance.deviance(y, y_mean, weights=weights) + return 1. - dev / dev_null From 0f4bdb3a8c5c45e80786b8156398da93bfc597e8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 18 Sep 2017 23:41:19 +0200 Subject: [PATCH 006/269] [WIP] Add Generalized Linear Models (#9405) * added L2 penalty * api change: alpha, l1_ratio, P1, P2, warm_start, check_input, copy_X * added entry in user guide * improved docstrings * helper function _irls_step --- doc/modules/linear_model.rst | 113 +++- sklearn/linear_model/glm.py | 702 ++++++++++++++++++------- sklearn/linear_model/tests/test_glm.py | 115 +++- 3 files changed, 721 insertions(+), 209 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 75b95f6c7a44f..51b3821fa6207 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -786,7 +786,7 @@ non-smooth `penalty="l1"`. This is therefore the solver of choice for sparse multinomial logistic regression. It is also the only solver that supports `penalty="elasticnet"`. -The "lbfgs" is an optimization algorithm that approximates the +The "lbfgs" is an optimization algorithm that approximates the Broyden–Fletcher–Goldfarb–Shanno algorithm [8]_, which belongs to quasi-Newton methods. The "lbfgs" solver is recommended for use for small data-sets but for larger datasets its performance suffers. [9]_ @@ -874,6 +874,117 @@ to warm-starting (see :term:`Glossary `). .. [9] `"Performance Evaluation of Lbfgs vs other solvers" `_ +.. _Generalized_linear_regression: + +Generalized linear regression +============================= + +:class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two +ways [1]_. First, the predicted values :math:`\hat{y}` are linked to a linear +combination of the input variables :math:`X` via an inverse link function +:math:`h` as + +.. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). + +Secondly, the squared loss function is replaced by the deviance :math:`D` of an +exponential dispersion model (EDM) [2]_. The objective function beeing minimized +becomes + +.. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 + +\frac{\alpha(1-\rho)}{2} w^T P_2 w + +with sample weights :math:`s`. +:math:`P_1` can be used to exclude some of the coefficients in the L1 +penalty, :math:`P_2` (must be positive semi-definite) allows for a more +versatile L2 penalty. + +Use cases, where a loss different from the squared loss might be appropriate, +are the following: + + * If the target values :math:`y` are counts (integer valued) or frequencies, you might try a Poisson deviance. 
+ + * If the target values are positive valued and skewed, you might try a Gamma deviance. + + * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). + +Since the linear predictor :math:`Xw` can be negative and +Poisson, Gamma and Inverse Gaussian distributions don't have negative values, +it is convenient to apply a link function different from the identity link +:math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=\exp(Xw)`. + +Note that the feature matrix `X` should be standardized before fitting. This +ensures that the penalty treats features equally. + + >>> from sklearn import linear_model + >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0) + >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + >>> reg.coef_ + array([ 0.24630255, 0.43373521]) + >>> reg.intercept_ + -0.76383575123143277 + +Mathematical formulation +------------------------ + +In the unpenalized case, the assumptions are the folowing: + + * The target values :math:`y_i` are realizations of random variables + :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` + with expectation :math:`\mu_i=\mathrm{E}[Y]`, dispersion parameter + :math:`\phi` and sample weights :math:`s_i`. + * The aim is to predict the expectation :math:`\mu_i` with + :math:`\hat{y_i} = h(\eta_i)`, linear predictor + :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. + +Note that the first assumption implies +:math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance +function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the +same as specifying a unit variance function (they are one-to-one). + +Including penalties helps to avoid overfitting or, in case of L1 penalty, to +obtain sparse solutions. But there are also other motivations to include them, +e.g. accounting fo dependence structure of :math:`y`. + +The objective function, which is independent of :math:`\phi`, is minimized with +respect to the coefficients :math:`w`. + +The deviance is defined by + +.. math:: D(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\frac{\phi}{s}) + - loglike(y,y,\frac{\phi}{s})\right) + +===================================== ================================= +Distribution Variance Function :math:`v(\mu)` +===================================== ================================= +Normal ("normal") :math:`1` +Poisson ("poisson") :math:`\mu` +Gamma ("gamma") :math:`\mu^2` +Inverse Gaussian ("inverse.gaussian") :math:`\mu^3` +===================================== ================================= + +Two remarks: + +* The deviances for at least Normal, Poisson and Gamma distributions are + strictly consistent scoring functions for the mean :math:`\mu`, see Eq. + (19)-(20) in [3]_. + +* If you want to model a frequency, i.e. counts per exposure (time, volume, ...) + you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together + with :math:`s=\mathrm{exposure}` as sample weights. + + +.. topic:: References: + + .. [1] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. [2] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. 
+ See also `Exponential dispersion model. `_ + + .. [3] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b80842f817f4d..2db3c56d5e1c1 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,15 +6,15 @@ # License: BSD 3 clause # TODO: Write more tests -# TODO: Which name/symbol for coefficients and weights in docu? -# sklearn.linear_models uses w for coefficients. -# So far, coefficients=beta and weights=w (as standard literature) -# TODO: Add l2-penalty (maybe more general w.P.w with P penalty matrix) # TODO: Add l1-penalty (elastic net) +# TODO: deal with option self.copy_X +# TODO: Should the option `normalize` be included (like other linear models)? +# So far, it is not included. User must pass a normalized X. # TODO: Add cross validation -# TODO: Write docu and examples +# TODO: Write examples and more docu # TODO: Make it as much consistent to other estimators in linear_model as # possible +# TODO: options P1 and P2 in fit() or in __init__()??? # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -22,10 +22,21 @@ # Linear Model does both depending on the chosen distribution, e.g. Normal => # regressor, Bernoulli/Binomial => classifier. # Solution: GeneralizedLinearRegressor since this is the focus. +# - Allow for finer control of penalty terms: +# L1: ||P1*w||_1 with P1*w a componentwise product, this allows to exclude +# factors from the L1 penalty. +# L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be +# a 1st or 2nd order difference matrix (compare B-spline penalties and +# Tikhonov regularization). # - The link funtion (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. # Solution: link as argument in those functions +# - Which name/symbol for sample_weight in docu? +# sklearn.linear_models uses w for coefficients, standard literature on +# GLMs use beta for coefficients and w for (sample) weights. +# So far, coefficients=w and sample weights=s. + from __future__ import division from abc import ABCMeta, abstractmethod, abstractproperty @@ -34,6 +45,8 @@ from scipy import linalg, optimize, sparse import warnings from .base import LinearRegression +from .coordinate_descent import ElasticNet +from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six @@ -164,7 +177,8 @@ class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): References ---------- - See https://en.wikipedia.org/wiki/Exponential_dispersion_model. + + https://en.wikipedia.org/wiki/Exponential_dispersion_model. """ @abstractproperty @@ -192,7 +206,7 @@ def include_upper_bound(self): raise NotImplementedError() def in_y_range(self, x): - """Returns true if x is in the valid range of Y~EDM. + """Returns true if `x` is in the valid range of Y~EDM. """ if self.include_lower_bound: if self.include_upper_bound: @@ -211,33 +225,36 @@ def in_y_range(self, x): @abstractmethod def unit_variance(self, mu): - r"""The unit variance :math:`v(mu)` determines the variance as - a function of the mean mu by - :math:`\mathrm{Var}[Y_i] = \phi/w_i*v(\mu_i)`. 
+ r"""The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. It can also be derived from the unit deviance :math:`d(y,\mu)` as .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. """ raise NotImplementedError() @abstractmethod def unit_variance_derivative(self, mu): - r"""The derivative of the unit variance w.r.t. mu, :math:`v'(\mu)`. + r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. """ raise NotImplementedError() def variance(self, mu, phi=1, weights=1): - r"""The variance of :math:`Y \sim \mathrm{EDM}(\mu,\phi)` is - :math:`\mathrm{Var}[Y_i]=\phi/w_i*v(\mu_i)`, - with unit variance v(mu). + r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, + with unit variance :math:`v(\mu)` and weights :math:`s_i`. """ return phi/weights * self.unit_variance(mu) def variance_derivative(self, mu, phi=1, weights=1): - r"""The derivative of the variance w.r.t. mu, + r"""The derivative of the variance w.r.t. `mu`, :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] - =phi/w_i*v'(\mu_i)`, with unit variance v(mu). + =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` + and weights :math:`s_i`. """ return phi/weights * self.unit_variance_derivative(mu) @@ -251,8 +268,8 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - r"""The derivative w.r.t. mu of the unit_deviance - :math:`\frac{d}{d\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + r"""The derivative w.r.t. `mu` of the unit deviance + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. Returns @@ -262,39 +279,39 @@ def unit_deviance_derivative(self, y, mu): return -2*(y-mu)/self.unit_variance(mu) def deviance(self, y, mu, weights=1): - r"""The deviance is given by :math:`D = \sum_i w_i \cdot d(y, \mu) - with weights :math:`w_i` and unit_deviance :math:`d(y,mu)`. + r"""The deviance is given by :math:`D = \sum_i s_i \cdot d(y, \mu) + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. In terms of the likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,\mu,\frac{phi}{w}) - - loglike(y,y,\frac{phi}{w})\right).` + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. """ return np.sum(weights*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weights, link): - """The deviance as a function of the coefficients ``coef`` - (:math:`beta`). + """The deviance as a function of the coefficients `coef` + (:math:`w`). """ lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) return self.deviance(y, mu, weights) def deviance_derivative(self, y, mu, weights=1): - """The derivative w.r.t. mu of the deviance.` + """The derivative w.r.t. `mu` of the deviance. """ return weights*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weights, link): - r"""The score function :math:`s` is the derivative of the - log-likelihood w.r.t. the ``coef`` (:math:`\beta`). + r"""The score function is the derivative of the + log-likelihood w.r.t. `coef` (:math:`w`). It is given by .. 
math: - \mathbf{s}(\boldsymbol{\beta}) = \mathbf{X}^T \mathbf{D} + \mathbf{score}(\boldsymbol{w}) = \mathbf{X}^T \mathbf{D} \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and - :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}(y_1),\ldots)`. + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -303,23 +320,27 @@ def _score(self, coef, phi, X, y, weights, link): d = link.inverse_derivative(lin_pred) d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=False) - score = safe_sparse_dot(X.T, temp, dense_output=False) + temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=True) + score = safe_sparse_dot(X.T, temp, dense_output=True) return score def _fisher_matrix(self, coef, phi, X, y, weights, link): - r"""The Fisher information matrix, also known as expected - information matrix. It is given by + r"""The Fisher information matrix. + The Fisher information matrix, also known as expected information + matrix is given by .. math: - \mathbf{F}(\boldsymbol{\beta}) = \mathrm{E}\left[ - -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} - \partial\boldsymbol{\beta}^T}\right] + \mathbf{F}(\boldsymbol{w}) = + \mathrm{E}\left[-\frac{\partial\mathbf{score}}{\partial + \boldsymbol{w}} \right] + = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T}\right] = \mathbf{X}^T W \mathbf{X} \,, with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, - see score function. + see func:`score_function`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -333,14 +354,15 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): - r"""The observed information matrix, also known as the negative of + r"""The observed information matrix. + The observed information matrix, also known as the negative of the Hessian matrix of the log-likelihood. It is given by .. math: - \mathbf{H}(\boldsymbol{\beta}) = - -\frac{\partial^2 loglike}{\partial\boldsymbol{\beta} - \partial\boldsymbol{\beta}^T} + \mathbf{H}(\boldsymbol{w}) = + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T} = \mathbf{X}^T \legt[ - \mathbf{D}' \mathbf{R} + \mathbf{D}^2 \mathbf{V} \mathbf{R} @@ -351,7 +373,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ v(\mu_i)} \right)`, - see score function and Fisher matrix. + see :func:`score_` function and :func:`_fisher_matrix`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -368,18 +390,18 @@ def _observed_information(self, coef, phi, X, y, weights, link): return observed_information def _deviance_derivative(self, coef, X, y, weights, link): - r"""The derivative w.r.t. ``coef`` (:math:`\beta`) of the deviance as a - function of the coefficients ``coef``. + r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a + function of the coefficients `coef`. This is equivalent to :math:`-2\phi` times the score function - :math:`s` (derivative of the log-likelihood). + :func:`score_function` (derivative of the log-likelihood). 
""" score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) return -2*score def _deviance_hessian(self, coef, X, y, weights, link): - r"""The hessian matrix w.r.t. ``coef`` (:math:`\beta`) of the deviance - as a function of the coefficients ``coef``. + r"""The hessian matrix w.r.t. `coef` (:math:`w`) of the deviance + as a function of the coefficients `coef`. This is equivalent to :math:`+2\phi` times the observed information matrix. """ @@ -388,20 +410,21 @@ def _deviance_hessian(self, coef, X, y, weights, link): return 2*info_matrix def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in IRLS.""" - return ((weights*y+np.mean(weights*y)) - / (2.*np.sum(np.ones_like(y)*weights))) + """Starting values for the mean mu_i in (unpenalized) IRLS.""" + return ((weights*y+np.mean(weights*y)) / + (2.*np.sum(np.ones_like(y)*weights))) class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. - They have mu=E[X] and Var[X] \propto mu**power. + They have :math:`\mu=\mathrm{E}[Y]` and + :math:`\mathrm{Var}[Y] \propto \mu^power. Attributes ---------- power : float The variance power of the unit_variance - :math:`v(mu) = mu^{power}`. + :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): self.power = power @@ -497,7 +520,7 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, beta, phi, weights=1): + def likelihood(self, y, X, w, phi, weights=1): raise NotImplementedError('This function is not (yet) implemented.') @@ -563,51 +586,135 @@ def unit_deviance(self, y, mu): np.log((1+mu**2)/(1+y**2))) +def _irls_step(X, W, P2, z): + """One step in iteratively reweighted least squares + + Solve A w = b for w with + A = (X' W X + P2) + b = X' W z + z = eta + D^-1 (y-mu) + + See also fit method of :class:`GeneralizedLinearRegressor`. + + Parameters + ---------- + X : numpy array or sparse matrix of shape (n_samples, n_features) + Training data (with intercept included if present) + + W : numpy array of shape (n_samples, ) + + P2 : numpy array or sparse matrix of shape (n_features, n_features) + The l2-penalty matrix or vector (=diagonal matrix) + + z : numpy array of shape (n_samples, ) + Working observations + + Returns + ------- + coef: array, shape = (X.shape[1]) + """ + # TODO: scipy.linalg.solve if faster, but ordinary least squares uses + # scipy.linalg.lstsq. What is more appropriate? + n_samples, n_features = X.shape + if sparse.issparse(X): + W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() + if P2.ndim == 1: + L2 = (sparse.dia_matrix((P2, 0), shape=(n_features, n_features)) + ).tocsr() + else: + L2 = sparse.csr_matrix(P2) + XtW = X.transpose() * W + A = XtW * X + L2 + b = XtW * z + coef = sparse.linalg.spsolve(A, b) + else: + XtW = (X.T * W) + A = XtW.dot(X) + if P2.ndim == 1: + A[np.diag_indices_from(A)] += P2 + else: + A += P2 + b = XtW.dot(z) + coef = linalg.solve(A, b) + return coef + + class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - r""" - Class to fit a Generalized Linear Model (GLM) based on reproductive - Exponential Dispersion Models (EDM). + """Regression via a Generalized Linear Model (GLM) based on reproductive + Exponential Dispersion Models (EDM) with combined L1 and L2 priors as + regularizer. 
+ + Minimizes the objective function:: + + 1/(2s) * deviance(y, h(X*w)) + + alpha * l1_ratio * ||P1*w||_1 + + 1/2 * alpha * (1 - l1_ratio) * w*P2*w + + with inverse link function `h` and s=sum of `sample_weight` (which equals + n_samples for `sample_weight=None`). + For `P1`=`P2`=identity, the penalty is the elastic net:: - #TODO: This belongs to User Guide - Assumptions: + alpha * l1_ratio * ||w||_1 + + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 - - The target values y_i are realizations of random variables - :math:`Y_i \sim \mathrm{EDM}(\mu_i, \frac{\phi}{w_i})` with dispersion - parameter :math:`\phi` and weights :math:`w_i`. - - The expectation of :math:`Y_i` is :math:`\mu_i=\mathrm{E}[Y]=h(\eta_i)` - whith the linear predictor :math:`\eta=X*\beta`, inverse link function - :math:`h(\eta)`, design matrix :math:`X` and parameters :math:`\beta` - to be estimated. + If you are interested in controlling the L1 and L2 penalty + separately, keep in mind that this is equivalent to:: - Note that the first assumption implies - :math:`\mathrm{Var}[Y_i]=\frac{\phi}{w_i} v(\mu_i)` with uni variance - function :math:`v(\mu)`. + a * L1 + b * L2 + + where:: + + alpha = a + b and l1_ratio = a / (a + b) + + The parameter `l1_ratio` corresponds to alpha in the glmnet R package while + alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio + = 1 is the lasso penalty. + + Read more in the :ref:`User Guide `. The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\mu_i=h(\eta_i)` and - :math:`Var[Y_i]=\frac{\phi}{w_i} v(\mu_i)` + the first two moments :math:`E[Y_i]=\\mu_i=h(\\eta_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{w_i} v(\\mu_i)`. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + (penalized) maximum likelihood which is equivalent to minimizing the + deviance. - The parameters :math:`\beta` are estimated by maximum likelihood which is - equivalent to minimizing the deviance. + TODO: For `alpha` > 0, the feature matrix `X` is assumed to be + standardized. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. TODO: Estimation of the dispersion parameter phi. - TODO: Notes on weights and 'scaled' Poisson, e.g. fit y = x/w with - with x=counts and w=exposure (time, money, persons, ...) => y is a - ratio with weights w. + TODO: Notes on weights and 'scaled' distributions. For Poisson, this means + to fit y = z/w with z=counts and w=exposure (time, money, persons, ...) + => y is a ratio with weights w. Same for other distributions. Parameters ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms und thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + l1_ratio : float, optional (defaul=0) + The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). 
- family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance - of a subclass of ExponentialDispersionModel, optional - (default='normal') + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ + of class ExponentialDispersionModel, optional(default='normal') the distributional assumption of the GLM. - link : {'identity', 'log'} or an instance of a subclass of Link, + link : {'identity', 'log'} or an instance of class Link, optional (default='identity') the link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). @@ -634,28 +741,41 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): where ``g_i`` is the i-th component of the gradient (derivative of the deviance). - start_params : {array shape (n_features, ), 'ols'}, optional (default=None) - sets the start values for coef_ in the fit. - If None, default values are taken. - If 'ols' the result of an ordinary least squares in the link space - (linear predictor) is taken. - If an array is given, these values are taken as coef_ to start with. - If fit_intercept is true, the first value is assumed to be the start - value for the intercept_. + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to fit as + initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to fit), option ``start_params`` sets the + starting values for ``coef_`` and ``intercept_``. + + start_params : None or array of shape (n_features, ) or 'least_squares'}, \ + optional (default=None) + If an array of size n_features is supplied, use these as start values + for ``coef_`` in the fit. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + If 'least_squares' is set, the result of a least squares fit in the + link space (linear predictor) is taken. If ``None``, the start values + are calculated by setting mu to family.starting_mu(..) and one step of + irls. + This option only applies if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not exist). + + copy_X : boolean, optional, default True + If ``True``, X will be copied; else, it may be overwritten. verbose : int, optional (default=0) For the lbfgs solver set verbose to any positive number for verbosity. Attributes ---------- - coef_ : array, shape (1, n_features) + coef_ : array, shape (n_features, ) Estimated coefficients for the linear predictor (X*coef_) in the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. dispersion_ : float - The dispersion parameter :math:`\phi` if fit_dispersion is set. + The dispersion parameter :math:`\\phi` if fit_dispersion is set. n_iter_ : int Actual number of iterations of the solver. 
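
    For concreteness, a minimal usage sketch of the estimator described above
    (toy data; only the L2 case is shown because no solver for the L1 part of
    the penalty is implemented at this stage)::

        import numpy as np
        from sklearn.linear_model.glm import GeneralizedLinearRegressor

        X = np.array([[0., 1.], [1., 1.], [2., 3.], [3., 4.]])   # toy design matrix
        y = np.array([1., 2., 5., 7.])                           # toy targets

        # pure L2 (ridge-like) penalty: l1_ratio=0
        reg = GeneralizedLinearRegressor(alpha=0.1, l1_ratio=0,
                                         family='normal', link='identity')
        reg.fit(X, y)
        print(reg.intercept_, reg.coef_, reg.n_iter_)
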
@@ -667,10 +787,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): ---------- TODO """ - - def __init__(self, fit_intercept=True, family='normal', - link='identity', fit_dispersion='chisqr', solver='irls', - max_iter=100, tol=1e-4, start_params=None, verbose=0): + def __init__(self, alpha=1.0, l1_ratio=0, + fit_intercept=True, family='normal', link='identity', + fit_dispersion='chisqr', solver='irls', max_iter=100, + tol=1e-4, warm_start=False, start_params=None, copy_X=True, + verbose=0): + self.alpha = alpha + self.l1_ratio = l1_ratio self.fit_intercept = fit_intercept self.family = family self.link = link @@ -678,31 +801,86 @@ def __init__(self, fit_intercept=True, family='normal', self.solver = solver self.max_iter = max_iter self.tol = tol + self.warm_start = warm_start self.start_params = start_params + self.copy_X = copy_X self.verbose = verbose - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, P1=None, P2=None, + check_input=True): """Fit a generalized linear model. Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix of shape (n_samples, n_features) Training data - y : numpy array of shape [n_samples] + y : numpy array of shape (n_samples, ) Target values - sample_weight : numpy array of shape [n_samples] + sample_weight : array of shape (n_samples, ) or None,\ + optinal (default=None) Individual weights for each sample. Var[Y_i]=phi/weight_i * v(mu) If Y_i ~ EDM(mu, phi/w_i) then sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. + P1 : None or array of shape (n_features*, ), optional\ + (default=None) + With this array, you can exclude coefficients from ths L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``None`` is the same as an array of ones. + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + P2 : None or array of shape (n_features*, n_features*) + With this square matrix the L2 penalty is calculated as `w P2 w`. + This gives a fine control over this penalty (Tikhonov + regularization). + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + check_input : boolean, optional (default=True) + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + Returns ------- self : returns an instance of self. """ + ####################################################################### + # 1. 
input validation # + ####################################################################### + # 1.1 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=_dtype, y_numeric=True, multi_output=False) + y = y.astype(np.float64) + + if sample_weight is None: + weights = np.ones_like(y) + elif np.isscalar(sample_weight): + weights = sample_weight*np.ones_like(y) + else: + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != y.shape[0]: + raise ValueError("Sample weights must have the same length as" + " y") + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights = weights/np.sum(weights) + + if not isinstance(check_input, bool): + raise ValueError("The argument check_input must be bool; got " + "(check_input={0})".format(check_input)) + + # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): @@ -720,7 +898,8 @@ def fit(self, X, y, sample_weight=None): raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian'].") + " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" + " got (family={0})".format(self.family)) # Garantee that self._link_instance is set to an instance of class Link if isinstance(self.link, Link): @@ -733,132 +912,232 @@ def fit(self, X, y, sample_weight=None): else: raise ValueError( "The link must be an instance of class Link or" - " an element of ['identity', 'log'].") - + " an element of ['identity', 'log']; got (link={0})" + .format(self.link)) + + if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: + raise ValueError("Penalty term must be non-negative;" + " got (alpha={0})".format(self.alpha)) + if (not isinstance(self.l1_ratio, numbers.Number) or + self.l1_ratio < 0 or self.l1_ratio > 1): + raise ValueError("l1_ratio must be in interval [0, 1]; got" + " (l1_ratio={0]})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): - raise ValueError("The argument fit_intercept must be bool," + raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) if self.solver not in ['irls', 'lbfgs', 'newton-cg']: raise ValueError("GLM Regression supports only irls, lbfgs and" "newton-cg solvers, got {0}".format(self.solver)) + if self.alpha > 0: + if (self.l1_ratio > 0 and + self.solver not in []): + # TODO: Add solver for L1 + # raise ValueError("The solver option (solver={0}) is not " + # "appropriate for the chosen penalty which" + # " includes L1 (alpha={1})." 
+ # .format(self.solver, self.alpha)) + raise NotImplementedError("Currently, no solver is implemented" + " that can deal with L1 penalties.") if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: raise ValueError("Maximum number of iteration must be positive;" " got (max_iter={0!r})".format(self.max_iter)) if not isinstance(self.tol, numbers.Number) or self.tol < 0: raise ValueError("Tolerance for stopping criteria must be " "positive; got (tol={0!r})".format(self.tol)) + if not isinstance(self.warm_start, bool): + raise ValueError("The argument warm_start must be bool;" + " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params is not 'ols': + if start_params is not None and start_params is not 'least_squares': start_params = np.atleast_1d(start_params) - if start_params.shape[0] != X.shape[1] + self.fit_intercept: + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" - "right length; required length {0}, got {1}" + "right length and dimension; required (length" + "={0}, ndim=1), got (length={1}, ndim={2})." .format(X.shape[1] + self.fit_intercept, - start_params.shape[0])) - - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - y_numeric=True, multi_output=False) - y = y.astype(np.float64) + start_params.shape[0], + start_params.ndim)) + if not isinstance(self.copy_X, bool): + raise ValueError("The argument copy_X must be bool;" + " got {0}".format(self.copy_X)) + + if P1 is None: + P1 = np.ones(X.shape[1]) + else: + P1 = np.atleast_1d(P1) + if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): + raise ValueError("P1 must be either None or an 1D array with " + "the length of X.shape[1]; " + "got (P1.shape[0]={0}), " + "needed (X.shape[1]={1})." 
+ .format(P1.shape[0], X.shape[1])) + if P2 is None: + P2 = np.ones(X.shape[1]) + if sparse.issparse(X): + P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + else: + P2 = check_array(P2, accept_sparse=['csr', 'csc', 'coo'], + dtype="numeric", ensure_2d=True) + if ((P2.shape[0] != P2.shape[1]) or + (P2.shape[0] != X.shape[1]) or + (P2.ndim != 2)): + raise ValueError("P2 must be either None or an array of shape " + "(n_features, n_features) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},{1})), needed ({3},{3})" + .format(P2.shape[0], P2.shape[1], X.shape[1])) family = self._family_instance link = self._link_instance - if not np.all(family.in_y_range(y)): - raise ValueError("Some value(s) of y are out of the valid " - "range for family {0}" - .format(family.__class__.__name__)) - - if sample_weight is None: - weights = np.ones_like(y) - elif np.isscalar(sample_weight): - weights = sample_weight*np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) - if weights.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar") - elif weights.shape[0] != y.shape[0]: - raise ValueError("Sample weights must have the same length as" - " y") - if self.fit_intercept: # intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) + P1 = np.concatenate((np.array([0]), P1)) + if P2.ndim == 1: + P2 = np.concatenate((np.array([0]), P2)) + elif sparse.issparse(P2): + P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), + dtype=P2.dtype).tocsr() + else: + P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], + [np.zeros((X.shape[1], 1)), P2]]) else: Xnew = X n_samples, n_features = Xnew.shape - - # Note: Since dispersion_ alias phi does not enter the estimation - # of mu_i=E[y_i] set it to 1 where convenient. + l1 = self.alpha * self.l1_ratio + l2 = self.alpha * (1-self.l1_ratio) + P1 *= l1 + P2 *= l2 + + # 1.3 additional validations ########################################## + if check_input: + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # TODO: if alpha=0 check that Xnew is not rank deficient + # TODO: what else to check? + + ####################################################################### + # 2. initialization of coef = (intercept_, coef_) # + ####################################################################### + # Note: Since phi=self.dispersion_ does not enter the estimation + # of mu_i=E[y_i], set it to 1. 
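        # For illustration: family.starting_mu as defined above shrinks the
        # observations toward their mean, e.g. y = [0, 1, 4] with the default
        # weights gives (y + mean(y)) / (2*n) ~= [0.28, 0.44, 0.94]; the start
        # values stay strictly positive here even where y_i == 0, which keeps
        # the log link well defined in the first IRLS step.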
# set start values for coef coef = None - if start_params is None: - # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weights) - # linear predictor - eta = link.link(mu) - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights w, in principle a diagonal matrix - # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weights=weights)) - wroot = np.sqrt(w) - # working observations - yw = eta + (y-mu)/hp - # least squares rescaled with wroot - wroot = sparse.dia_matrix((wroot, 0), shape=(n_samples, n_samples)) - X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) - yw_rescale = safe_sparse_dot(wroot, y, dense_output=True) - coef = linalg.lstsq(X_rescale, yw_rescale)[0] - elif start_params is 'ols': - reg = LinearRegression(copy_X=False, fit_intercept=False) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ + if self.warm_start and hasattr(self, "coef_"): + if self.fit_intercept: + coef = np.concatenate((self.intercept_, self.coef_)) + else: + coef = self.coef_ + elif self.start_params is None: + if self.l1_ratio == 0: + # See 3.1 IRLS + # Use mu_start and apply one irls step to calculate coef + mu = family.starting_mu(y, weights) + # linear predictor + eta = link.link(mu) + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(Xnew, W, P2, z) + else: + # with L1 penalty, start with coef = 0 + coef = np.zeros(n_features) + elif self.start_params is 'least_squares': + if self.alpha == 0: + reg = LinearRegression(copy_X=True, fit_intercept=False) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + elif self.l1_ratio <= 0.01: + # ElasticNet says l1_ratio <= 0.01 is not reliable, use Ridge + reg = Ridge(copy_X=True, fit_intercept=False, + alpha=self.alpha) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + else: + # TODO: Does this make sense? + reg = ElasticNet(copy_X=True, fit_intercept=False, + alpha=self.alpha, l1_ratio=self.l1_ratio) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ else: coef = start_params + ####################################################################### + # 3. fit # + ####################################################################### # algorithms for optimiation # TODO: Parallelize it self.n_iter_ = 0 converged = False + # 3.1 IRLS ############################################################ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. 
E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) + # Note: P2 = l2*P2, see above if self.solver == 'irls': - # linear predictor + # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) while self.n_iter_ < self.max_iter: self.n_iter_ += 1 # coef_old not used so far. # coef_old = coef - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights w, in principle a diagonal matrix + # working weights W, in principle a diagonal matrix # therefore here just as 1d array - w = (hp**2 / family.variance(mu, phi=1, weights=weights)) - wroot = np.sqrt(w) + W = (hp**2 / V) # working observations - yw = eta + (y-mu)/hp - # least squares rescaled with wroot - wroot = sparse.dia_matrix((wroot, 0), - shape=(n_samples, n_samples)) - X_rescale = safe_sparse_dot(wroot, Xnew, dense_output=True) - yw_rescale = safe_sparse_dot(wroot, yw, dense_output=True) - coef, residues, rank, singular_ = ( - linalg.lstsq(X_rescale, yw_rescale)) + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(Xnew, W, P2, z) # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) # which tolerace? |coef - coef_old| or gradient? # use gradient for compliance with newton-cg and lbfgs - # TODO: faster computation of gradient, use mu and eta directly - gradient = family._deviance_derivative( - coef=coef, X=Xnew, y=y, weights=weights, link=link) + # gradient = family._deviance_derivative( + # coef=coef, X=Xnew, y=y, weights=weights, link=link) + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + gradient = -safe_sparse_dot(Xnew.T, hp*(y-mu)/V) + if P2.ndim == 1: + gradient += P2*coef + else: + gradient += safe_sparse_dot(P2, coef) if (np.max(np.abs(gradient)) <= self.tol): converged = True break @@ -868,50 +1147,73 @@ def fit(self, X, y, sample_weight=None): "of iterations (currently {0})" .format(self.max_iter), ConvergenceWarning) + # 3.2 L-BFGS and Newton-CG ############################################ # TODO: performance: make one function return both deviance and # gradient of deviance - elif self.solver == 'lbfgs': - func = family._deviance - fprime = family._deviance_derivative - args = (Xnew, y, weights, link) - coef, loss, info = optimize.fmin_l_bfgs_b( - func, coef, fprime=fprime, - args=args, - iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." 
- " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}".format( - info["task"])) - self.n_iter_ = info['nit'] - elif self.solver == 'newton-cg': - func = family._deviance - grad = family._deviance_derivative + elif self.solver in ['lbfgs', 'newton-cg']: + def func(coef, *args): + if P2.ndim == 1: + L2 = safe_sparse_dot(coef.T, P2*coef) + else: + L2 = safe_sparse_dot(coef.T, safe_sparse_dot(P2, coef)) + # A[np.diag_indices_from(A)] += P2 + return 0.5*family._deviance(coef, *args) + 0.5*L2 + + def fprime(coef, *args): + if P2.ndim == 1: + L2 = P2*coef + else: + L2 = safe_sparse_dot(P2, coef) + return 0.5*family._deviance_derivative(coef, *args) + L2 def grad_hess(coef, X, y, weights, link): - grad = (family._deviance_derivative( - coef, X, y, weights, link)) - hessian = (family._deviance_hessian( - coef, X, y, weights, link)) + if P2.ndim == 1: + L2 = P2*coef + else: + L2 = safe_sparse_dot(P2, coef) + grad = 0.5*family._deviance_derivative( + coef, X, y, weights, link) + L2 + hessian = 0.5*family._deviance_hessian( + coef, X, y, weights, link) + if P2.ndim == 1: + hessian[np.diag_indices_from(hessian)] += P2 + else: + hessian += P2 def Hs(s): - ret = np.dot(hessian, s) + ret = safe_sparse_dot(hessian, s) return ret return grad, Hs - hess = grad_hess + args = (Xnew, y, weights, link) - coef, n_iter_i = newton_cg(hess, func, grad, coef, args=args, - maxiter=self.max_iter, tol=self.tol) - self.coef_ = coef + if self.solver == 'lbfgs': + coef, loss, info = optimize.fmin_l_bfgs_b( + func, coef, fprime=fprime, args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) + self.n_iter_ = info['nit'] + elif self.solver == 'newton-cg': + coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) + + ####################################################################### + # 4. postprocessing # + ####################################################################### if self.fit_intercept: self.intercept_ = coef[0] self.coef_ = coef[1:] else: + # set intercept to zero as the other linear models do self.intercept_ = 0. self.coef_ = coef @@ -988,8 +1290,8 @@ def score(self, X, y, sample_weight=None): :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` is the null deviance, i.e. the deviance of a model with intercept alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is average by sample_weight. In the case of a Normal - distribution, this D^2 equals R^2. + :math:`\bar{y}` is averaged by sample_weight. In the case of a Normal + distribution, D^2 equals R^2. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). 
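
For illustration, D^2 can be computed by hand; a small sketch for the normal
family (made-up arrays, using the ``deviance`` method with its default uniform
weights), where it coincides with the usual R^2::

    import numpy as np
    from sklearn.linear_model.glm import NormalDistribution
    from sklearn.metrics import r2_score

    y_true = np.array([3.0, 2.5, 4.0, 7.1])
    y_pred = np.array([2.8, 3.0, 3.7, 7.0])

    family = NormalDistribution()
    dev = family.deviance(y_true, y_pred)                  # sum((y - mu)^2)
    dev_null = family.deviance(y_true,
                               np.full_like(y_true, y_true.mean()))
    d2 = 1 - dev / dev_null
    assert np.isclose(d2, r2_score(y_true, y_pred))        # equals R^2 here
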
diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index a4d4ea8650860..df0413b4d7836 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,19 +1,34 @@ import numpy as np from sklearn.linear_model.glm import ( - # Link, IdentityLink, + Link, + IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - # GeneralizedHyperbolicSecand, + GeneralizedHyperbolicSecand, GeneralizedLinearRegressor) +from sklearn.linear_model.ridge import Ridge from sklearn.utils.testing import ( - # assert_equal, + assert_equal, assert_almost_equal, assert_array_equal, assert_array_almost_equal) +def test_link_properties(): + """Test link inverse and derivative + """ + rng = np.random.RandomState(0) + x = rng.rand(100)*100 + from sklearn.linear_model.glm import Link + for link in vars()['Link'].__subclasses__(): + link = link() + assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) + assert_almost_equal(link.inverse_derivative(link.link(x)), + 1/link.derivative(x), decimal=10) + + def test_family_bounds(): """Test the valid range of distributions """ @@ -42,8 +57,23 @@ def test_family_bounds(): assert_array_equal(result, [False, False, True]) +def test_deviance_zero(): + """Test deviance(y,y) = 0 for different families + """ + for family in [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=-2.5), + TweedieDistribution(power=-1), + TweedieDistribution(power=1.5), + TweedieDistribution(power=2.5), + TweedieDistribution(power=4), + GeneralizedHyperbolicSecand()]: + assert_almost_equal(family.deviance(0.1, 0.1), 0, decimal=10) + assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) + + def test_glm_identiy_regression(): - """Test linear regression on a simple dataset + """Test GLM regression with identity link on a simple dataset """ coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -55,13 +85,13 @@ def test_glm_identiy_regression(): for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( - family=family, fit_intercept=False, solver=solver) + alpha=0, family=family, fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) def test_glm_log_regression(): - """Test linear regression on a simple dataset + """Test GLM regression with log link on a simple dataset """ coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -73,10 +103,79 @@ def test_glm_log_regression(): for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( - family=family, link=LogLink(), fit_intercept=False, - solver=solver, start_params='ols') + alpha=0, family=family, link=LogLink(), fit_intercept=False, + solver=solver, start_params='least_squares') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) +def test_normal_ridge(): + """Test ridge regression for Normal distributions + + Compare to test_ridge in test_ridge.py. 
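    With uniform sample weights the GLM minimizes
    1/(2*n) * ||y - Xw||^2 + alpha/2 * ||w||^2, whereas Ridge minimizes
    ||y - Xw||^2 + alpha_ridge * ||w||^2; multiplying the first objective by
    2*n shows both have the same minimizer when alpha_ridge = alpha * n_samples,
    which is the scaling used below.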
+ """ + rng = np.random.RandomState(0) + alpha = 1.0 + + # With more samples than features + n_samples, n_features, n_predict = 6, 5, 10 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + T = rng.randn(n_predict, n_features) + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge.fit(X, y) + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=True, solver=solver) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=False, solver='irls') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + # With more features than samples + n_samples, n_features, n_predict = 5, 10, 10 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + T = rng.randn(n_predict, n_features) + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge.fit(X, y) + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=True, solver=solver) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + family='normal', link='identity', + fit_intercept=False, solver='irls') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + + # TODO: Test compatibility with R's glm, glmnet From 5b46c23977a8e386987a2767b2c12d4296d332af Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 19 Sep 2017 00:40:34 +0200 Subject: [PATCH 007/269] [WIP] Add Generalized Linear Models (#9405) * fix some bugs in user guide linear_model.rst * fix some pep8 issues in test_glm.py --- doc/modules/linear_model.rst | 28 +++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 7 ++++--- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 51b3821fa6207..98736facd9b76 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -880,14 +880,14 @@ Generalized linear regression ============================= :class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two -ways [1]_. First, the predicted values :math:`\hat{y}` are linked to a linear +ways [8]_. 
First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as .. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [2]_. The objective function beeing minimized +exponential dispersion model (EDM) [9]_. The objective function beeing minimized becomes .. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 @@ -914,16 +914,20 @@ it is convenient to apply a link function different from the identity link :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This -ensures that the penalty treats features equally. +ensures that the penalty treats features equally. The estimator can be used as +follows:: - >>> from sklearn import linear_model - >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0) - >>> reg = linear_model.GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> from sklearn.linear_model import GeneralizedLinearRegressor + >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + GeneralizedLinearRegressor(alpha=0.5, copy_X=True, family='poisson', + fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, + link='log', max_iter=100, solver='irls', start_params=None, + tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) - >>> reg.intercept_ - -0.76383575123143277 + >>> reg.intercept_ #doctest: +ELLIPSIS + -0.76383575... Mathematical formulation ------------------------ @@ -969,7 +973,7 @@ Two remarks: * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. - (19)-(20) in [3]_. + (19)-(20) in [10]_. * If you want to model a frequency, i.e. counts per exposure (time, volume, ...) you can do so by a Poisson distribution and passing @@ -979,12 +983,12 @@ Two remarks: .. topic:: References: - .. [1] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + .. [8] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - .. [2] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + .. [9] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. See also `Exponential dispersion model. `_ - .. [3] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ + .. [10] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. 
`_ Stochastic Gradient Descent - SGD ================================= diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index df0413b4d7836..b62b51b5bcb9e 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from sklearn.linear_model.glm import ( Link, - IdentityLink, + # IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, @@ -21,8 +21,9 @@ def test_link_properties(): """ rng = np.random.RandomState(0) x = rng.rand(100)*100 - from sklearn.linear_model.glm import Link - for link in vars()['Link'].__subclasses__(): + # from sklearn.linear_model.glm import Link + # for link in vars()['Link'].__subclasses__(): + for link in Link.__subclasses__(): link = link() assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) assert_almost_equal(link.inverse_derivative(link.link(x)), From 10dd14603a5fc04f53ca4920621434aaff662064 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 3 Dec 2017 19:54:57 +0100 Subject: [PATCH 008/269] [WIP] Add Generalized Linear Models (#9405) * added test: ridge poisson with log-link compared to glmnet * fix ValueError message for l1_ratio * fix ValueError message for P2 * string comparison: use '==' and '!=' instead of 'is' and 'is not' * fix RuntimeWarnings in unit_deviance of poisson: x*log(x) as xlogy * added test for fisher matrix * added test for family argument --- sklearn/linear_model/glm.py | 29 ++++++----- sklearn/linear_model/tests/test_glm.py | 72 ++++++++++++++++++++++++-- 2 files changed, 84 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 2db3c56d5e1c1..93ce358a8a874 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -15,6 +15,8 @@ # TODO: Make it as much consistent to other estimators in linear_model as # possible # TODO: options P1 and P2 in fit() or in __init__()??? +# TODO: Include further classes in class.rst? ExponentialDispersionModel? +# TweedieDistribution? # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -42,7 +44,7 @@ from abc import ABCMeta, abstractmethod, abstractproperty import numbers import numpy as np -from scipy import linalg, optimize, sparse +from scipy import linalg, optimize, sparse, special import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -340,7 +342,7 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): = \mathbf{X}^T W \mathbf{X} \,, with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, - see func:`score_function`. + see func:`_score`. """ n_samples = X.shape[0] lin_pred = safe_sparse_dot(X, coef, dense_output=True) @@ -363,7 +365,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): \mathbf{H}(\boldsymbol{w}) = -\frac{\partial^2 loglike}{\partial\boldsymbol{w} \partial\boldsymbol{w}^T} - = \mathbf{X}^T \legt[ + = \mathbf{X}^T \left[ - \mathbf{D}' \mathbf{R} + \mathbf{D}^2 \mathbf{V} \mathbf{R} + \mathbf{D}^2 @@ -393,7 +395,7 @@ def _deviance_derivative(self, coef, X, y, weights, link): r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a function of the coefficients `coef`. This is equivalent to :math:`-2\phi` times the score function - :func:`score_function` (derivative of the log-likelihood). + :func:`_score` (derivative of the log-likelihood). 
""" score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, link=link) @@ -510,7 +512,8 @@ def unit_deviance(self, y, mu): return (y-mu)**2 if p == 1: # PoissonDistribution - return 2 * (np.where(y == 0, 0, y*np.log(y/mu))-y+mu) + # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 + return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -921,7 +924,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if (not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1): raise ValueError("l1_ratio must be in interval [0, 1]; got" - " (l1_ratio={0]})".format(self.l1_ratio)) + " (l1_ratio={0})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) @@ -948,7 +951,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params is not 'least_squares': + if start_params is not None and start_params != 'least_squares': start_params = np.atleast_1d(start_params) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): @@ -986,7 +989,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " - "got (P2.shape=({0},{1})), needed ({3},{3})" + "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) family = self._family_instance @@ -1058,7 +1061,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif self.start_params is 'least_squares': + elif self.start_params == 'least_squares': if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1277,11 +1280,9 @@ def estimate_phi(self, y, X, sample_weight): dev = self._family_instance.deviance(y, mu, sample_weight) return dev/(n_samples - n_features) -# TODO: Fix "AssertionError: -0.28014056555724598 not greater than 0.5" -# in check_estimator for score -# from sklearn.utils.estimator_checks import check_estimator -# from sklearn.linear_model import GeneralizedLinearRegressor -# check_estimator(GeneralizedLinearRegressor) + # Note: check_estimator(GeneralizedLinearRegressor) might raise + # "AssertionError: -0.28014056555724598 not greater than 0.5" + # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): r"""Returns D^2, a generalization of the coefficient of determination R^2, which uses deviance instead of squared error. diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index b62b51b5bcb9e..de7de90db967b 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,4 +1,6 @@ import numpy as np +from numpy.testing import assert_allclose +import scipy as sp from sklearn.linear_model.glm import ( Link, @@ -73,6 +75,46 @@ def test_deviance_zero(): assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) +def test_fisher_matrix(): + """Test the Fisher matrix numerically. 
+ Trick: Use numerical differentiation with y = mu""" + for family in [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution()]: + link = LogLink() + rng = np.random.RandomState(0) + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)] + ) + assert_allclose(fisher, approx, rtol=1e-3) + + +def test_glm_family_argument(): + """Test GLM family argument set as string + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for (f, fam) in [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution())]: + glm = GeneralizedLinearRegressor(family=f, fit_intercept=False, + alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) + + def test_glm_identiy_regression(): """Test GLM regression with identity link on a simple dataset """ @@ -82,7 +124,8 @@ def test_glm_identiy_regression(): families = ( NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecand()) for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( @@ -100,7 +143,8 @@ def test_glm_log_regression(): families = ( NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)) + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecand()) for solver in ['irls', 'lbfgs', 'newton-cg']: for family in families: glm = GeneralizedLinearRegressor( @@ -179,4 +223,26 @@ def test_normal_ridge(): assert_array_almost_equal(glm.predict(T), ridge.predict(T)) -# TODO: Test compatibility with R's glm, glmnet +def test_poisson_ridge(): + """Test ridge regression with poisson family and LogLink + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.12889386979 + # a 0.29019207995 + # b 0.03741173122 + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, family='poisson', + link='log', tol=1e-10) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, decimal=7) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=7) From 72485b63e89879e65381bca12152b54600fd3970 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 8 Jan 2018 22:13:45 +0100 Subject: [PATCH 009/269] [WIP] Add Generalized Linear Models (#9405) * put arguments P1, P2 and check_input from fit to __init__ * added check_input test: is P2 positive definite? 
* added solver option: 'auto' --- sklearn/linear_model/glm.py | 181 +++++++++++++++++++++++------------- 1 file changed, 117 insertions(+), 64 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 93ce358a8a874..3f6b91026ef9b 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -14,7 +14,7 @@ # TODO: Write examples and more docu # TODO: Make it as much consistent to other estimators in linear_model as # possible -# TODO: options P1 and P2 in fit() or in __init__()??? +# TODO: which dtype to force for y and X? Which for P1, P2? # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? @@ -709,6 +709,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. + P1 : None or array of shape (n_features*, ), optional\ + (default=None) + With this array, you can exclude coefficients from the L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``None`` is the same as an array of ones. + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). + + P2 : None or array of shape (n_features*, n_features*) + With this square matrix the L2 penalty is calculated as `w P2 w`. + This gives a fine control over this penalty (Tikhonov + regularization). + Note that n_features* = X.shape[1] = length of coef_ (intercept + always excluded from counting). P2 must be positive semi-definite. + fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). @@ -727,13 +742,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'newton-cg', 'lbfgs'}, optional (defaul='irls') + solver : {'auto', 'irls', 'newton-cg', 'lbfgs'}, optional (defaul='auto') Algorithm to use in the optimization problem. - - 'irls' is iterated reweighted least squares. It is the standard - algorithm for GLMs. + - 'irls' is iterated reweighted least squares (Fisher scoring). + It is the standard algorithm for GLMs. Cannot deal with + L1 penalties. + + - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'newton-cg', 'lbfgs' + - 'auto' sets 'irls'. max_iter : int, optional (default=100) TODO @@ -766,6 +784,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. + check_input : boolean, optional (default=True) + Allow to bypass several checks on input: y values in range of family, + sample_weights non-negative, P2 positive semi-definite. + Don't use this parameter unless you know what you do. + verbose : int, optional (default=0) For the lbfgs solver set verbose to any positive number for verbosity. 
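
As an illustration of the ``P2`` option documented above, a sketch with a
diagonal Tikhonov matrix (toy data; the penalty weights are arbitrary and
``P2`` must be positive semi-definite)::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[0., 1.], [1., 1.], [2., 0.], [3., 2.]])
    y = np.array([1., 2., 2., 4.])

    # penalize the second coefficient ten times more strongly than the first
    P2 = np.diag([1., 10.])
    reg = GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0, P2=P2,
                                     family='normal', link='identity')
    reg.fit(X, y)
    print(reg.coef_)
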
@@ -790,13 +813,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): ---------- TODO """ - def __init__(self, alpha=1.0, l1_ratio=0, + def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', - fit_dispersion='chisqr', solver='irls', max_iter=100, + fit_dispersion='chisqr', solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params=None, copy_X=True, - verbose=0): + check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio + self.P1 = P1 + self.P2 = P2 self.fit_intercept = fit_intercept self.family = family self.link = link @@ -807,10 +832,10 @@ def __init__(self, alpha=1.0, l1_ratio=0, self.warm_start = warm_start self.start_params = start_params self.copy_X = copy_X + self.check_input = check_input self.verbose = verbose - def fit(self, X, y, sample_weight=None, P1=None, P2=None, - check_input=True): + def fit(self, X, y, sample_weight=None): """Fit a generalized linear model. Parameters @@ -823,31 +848,13 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, sample_weight : array of shape (n_samples, ) or None,\ optinal (default=None) - Individual weights for each sample. - Var[Y_i]=phi/weight_i * v(mu) - If Y_i ~ EDM(mu, phi/w_i) then + Individual weights w_i for each sample. Note that for an + Exponential Dispersion Model (EDM), one has + Var[Y_i]=phi/w_i * v(mu). + If Y_i ~ EDM(mu, phi/w_i), then sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. - P1 : None or array of shape (n_features*, ), optional\ - (default=None) - With this array, you can exclude coefficients from ths L1 penalty. - Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as an array of ones. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). - - P2 : None or array of shape (n_features*, n_features*) - With this square matrix the L2 penalty is calculated as `w P2 w`. - This gives a fine control over this penalty (Tikhonov - regularization). - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). - - check_input : boolean, optional (default=True) - Allow to bypass several input checking. - Don't use this parameter unless you know what you do. - Returns ------- self : returns an instance of self. 
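
To make the ``sample_weight`` semantics concrete, a sketch of the frequency
use case from the notes above, i.e. counts per exposure modelled with a
Poisson distribution and log link (all numbers are made up)::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[0.], [1.], [2.], [3.]])
    counts = np.array([0., 1., 3., 8.])       # observed event counts
    exposure = np.array([1., 2., 1., 2.])     # e.g. observed time per row

    y = counts / exposure                     # frequency = counts / exposure
    reg = GeneralizedLinearRegressor(alpha=0.01, l1_ratio=0,
                                     family='poisson', link='log')
    reg.fit(X, y, sample_weight=exposure)
    print(reg.intercept_, reg.coef_)
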
@@ -872,16 +879,6 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, elif weights.shape[0] != y.shape[0]: raise ValueError("Sample weights must have the same length as" " y") - # IMPORTANT NOTE: Since we want to minimize - # 1/(2*sum(sample_weight)) * deviance + L1 + L2, - # deviance = sum(sample_weight * unit_deviance), - # we rescale weights such that sum(weights) = 1 and this becomes - # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) - weights = weights/np.sum(weights) - - if not isinstance(check_input, bool): - raise ValueError("The argument check_input must be bool; got " - "(check_input={0})".format(check_input)) # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class @@ -928,17 +925,22 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['irls', 'lbfgs', 'newton-cg']: - raise ValueError("GLM Regression supports only irls, lbfgs and" - "newton-cg solvers, got {0}".format(self.solver)) + if self.solver == 'auto': + solver = 'irls' + else: + solver = self.solver + if solver not in ['irls', 'lbfgs', 'newton-cg']: + raise ValueError("GeneralizedLinearRegressor supports only irls, " + "lbfgs and newton-cg solvers, got {0}" + "".format(solver)) if self.alpha > 0: if (self.l1_ratio > 0 and - self.solver not in []): + solver not in []): # TODO: Add solver for L1 # raise ValueError("The solver option (solver={0}) is not " # "appropriate for the chosen penalty which" # " includes L1 (alpha={1})." - # .format(self.solver, self.alpha)) + # .format(solver, self.alpha)) raise NotImplementedError("Currently, no solver is implemented" " that can deal with L1 penalties.") if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: @@ -964,28 +966,32 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) + if not isinstance(self.check_input, bool): + raise ValueError("The attribute check_input must be bool; got " + "(check_input={0})".format(self.check_input)) - if P1 is None: + if self.P1 is None: P1 = np.ones(X.shape[1]) else: - P1 = np.atleast_1d(P1) + P1 = np.atleast_1d(np.copy(self.P1)) if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." 
.format(P1.shape[0], X.shape[1])) - if P2 is None: + if self.P2 is None: P2 = np.ones(X.shape[1]) if sparse.issparse(X): P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() else: - P2 = check_array(P2, accept_sparse=['csr', 'csc', 'coo'], + P2 = check_array(self.P2, copy=True, + accept_sparse=['csr', 'csc', 'coo'], dtype="numeric", ensure_2d=True) - if ((P2.shape[0] != P2.shape[1]) or - (P2.shape[0] != X.shape[1]) or - (P2.ndim != 2)): + if ((P2.ndim != 2) or + (P2.shape[0] != P2.shape[1]) or + (P2.shape[0] != X.shape[1])): raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " @@ -1020,16 +1026,39 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, P2 *= l2 # 1.3 additional validations ########################################## - if check_input: + if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) + if not np.all(weights >= 0): + raise ValueError("Sample weights must be non-negative.") + # check that P2 is positive semidefinite + # np.linalg.cholesky(P2) 'only' asserts positive definite + if self.P2 is not None: + if sparse.issparse(P2): + # TODO: check sparse P2 for non-negativeness + raise NotImplementedError("Check sparse P2 for " + "non-negaitveness is not yet " + "implemented.") + elif P2.ndim == 2: + if not np.all(np.linalg.eigvals(P2) >= -1e-15): + raise ValueError("P2 must be positive definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? ####################################################################### - # 2. initialization of coef = (intercept_, coef_) # + # 2. rescaling of weights (sample_weight) # + ####################################################################### + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights = weights/np.sum(weights) + + ####################################################################### + # 3. initialization of coef = (intercept_, coef_) # ####################################################################### # Note: Since phi=self.dispersion_ does not enter the estimation # of mu_i=E[y_i], set it to 1. @@ -1082,13 +1111,13 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, coef = start_params ####################################################################### - # 3. fit # + # 4. 
fit # ####################################################################### # algorithms for optimiation # TODO: Parallelize it self.n_iter_ = 0 converged = False - # 3.1 IRLS ############################################################ + # 4.1 IRLS ############################################################ # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' # Obj = objective function = 1/2 Dev + l2/2 w P2 w # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 @@ -1104,7 +1133,7 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, # Obj'' ~ X' W X + l2 P2 # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) # Note: P2 = l2*P2, see above - if self.solver == 'irls': + if solver == 'irls': # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) mu = link.inverse(eta) @@ -1150,10 +1179,10 @@ def fit(self, X, y, sample_weight=None, P1=None, P2=None, "of iterations (currently {0})" .format(self.max_iter), ConvergenceWarning) - # 3.2 L-BFGS and Newton-CG ############################################ + # 4.2 L-BFGS and Newton-CG ############################################ # TODO: performance: make one function return both deviance and # gradient of deviance - elif self.solver in ['lbfgs', 'newton-cg']: + elif solver in ['lbfgs', 'newton-cg']: def func(coef, *args): if P2.ndim == 1: L2 = safe_sparse_dot(coef.T, P2*coef) @@ -1190,7 +1219,7 @@ def Hs(s): args = (Xnew, y, weights, link) - if self.solver == 'lbfgs': + if solver == 'lbfgs': coef, loss, info = optimize.fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, @@ -1204,13 +1233,37 @@ def Hs(s): warnings.warn("lbfgs failed for the reason: {0}" .format(info["task"])) self.n_iter_ = info['nit'] - elif self.solver == 'newton-cg': + elif solver == 'newton-cg': coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, args=args, maxiter=self.max_iter, tol=self.tol) + # 4.3 coordinate descent ############################################## + # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + # An Improved GLMNET for L1-regularized Logistic Regression, + # Journal of Machine Learning Research 13 (2012) 1999-2030 + # Note: Use Fisher matrix instead of Hessian + # + # 1. find optimal descent direction d by minimizing + # min_d F(w+d) = min_d F(w+d) - F(w) + # F = f + g; f(w) = 1/2 dev; g(w) = 1/2*w*P2*w + ||P1*w||_1 + # 2. quadrdatic approx of f(w+d)-f(w): + # q(d) = f'(w)*d +1/2 d*H*d + # min_d q(d) + g(w+d) - g(w) + # 3. coordinate descent by updating coordinate j (d -> d+z*e_j): + # min_z q(d+z*e_j) + g(w+d+z*e_j) - g(w) + # = min_z q(d+z e_j) - q(d) + g(w+d+z*e_j) - g(w+d) + # TODO + # elif solver == 'cd': + # line search parameters + # (beta, sigma) = (0.5, 0.01) + # for iteration k from 1 to maxiter + # for coordinate j sample at random + # np.random.choice(coord, replace = False) + # + ####################################################################### - # 4. postprocessing # + # 5. 
postprocessing # ####################################################################### if self.fit_intercept: self.intercept_ = coef[0] From 5c1369bde863a73aff46a502acd70e58e06dcb85 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 15:22:08 +0100 Subject: [PATCH 010/269] [WIP] Add Generalized Linear Models (#9405) * added coordinate descent solver * skip doctest for GeneralizedLinearRegressor example * symmetrize P2 => use P2 = 1/2 (P2+P2') * better validation of parameter start_params --- doc/modules/linear_model.rst | 21 +- sklearn/linear_model/glm.py | 353 ++++++++++++++++++++----- sklearn/linear_model/tests/test_glm.py | 82 ++++-- 3 files changed, 365 insertions(+), 91 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 98736facd9b76..834466e494a4a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -894,9 +894,9 @@ becomes +\frac{\alpha(1-\rho)}{2} w^T P_2 w with sample weights :math:`s`. -:math:`P_1` can be used to exclude some of the coefficients in the L1 -penalty, :math:`P_2` (must be positive semi-definite) allows for a more -versatile L2 penalty. +:math:`P_1` (diagonal matrix) can be used to exclude some of the coefficients in +the L1 penalty, the matrix :math:`P_2` (must be positive semi-definite) allows +for a more versatile L2 penalty. Use cases, where a loss different from the squared loss might be appropriate, are the following: @@ -908,22 +908,23 @@ are the following: * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). Since the linear predictor :math:`Xw` can be negative and -Poisson, Gamma and Inverse Gaussian distributions don't have negative values, +Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link :math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as -follows:: +follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) - GeneralizedLinearRegressor(alpha=0.5, copy_X=True, family='poisson', - fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, - link='log', max_iter=100, solver='irls', start_params=None, - tol=0.0001, verbose=0, warm_start=False) + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +SKIP + GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, + copy_X=True, family='poisson', fit_dispersion='chisqr', + fit_intercept=True, l1_ratio=0, link='log', max_iter=100, + random_state=None, selection='random', solver='auto', + start_params=None, tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 3f6b91026ef9b..3de82c20f33cf 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -3,20 +3,21 @@ """ # Author: Christian Lorentzen +# some parts and tricks stolen from other sklearn files. 
# License: BSD 3 clause # TODO: Write more tests -# TODO: Add l1-penalty (elastic net) +# TODO: Write examples and more docu # TODO: deal with option self.copy_X # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation -# TODO: Write examples and more docu -# TODO: Make it as much consistent to other estimators in linear_model as -# possible -# TODO: which dtype to force for y and X? Which for P1, P2? +# TODO: Should GeneralizedLinearRegressor inherit from LinearModel? +# So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? +# TODO: Negative values in P1 are not allowed so far. They could be used form +# group lasse. # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -52,10 +53,11 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six +from ..externals.six.moves import xrange from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, check_random_state class Link(six.with_metaclass(ABCMeta)): @@ -309,7 +311,9 @@ def _score(self, coef, phi, X, y, weights, link): .. math: - \mathbf{score}(\boldsymbol{w}) = \mathbf{X}^T \mathbf{D} + \mathbf{score}(\boldsymbol{w}) + = \frac{\partial loglike}{\partial\boldsymbol{w}} + = \mathbf{X}^T \mathbf{D} \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and @@ -411,6 +415,29 @@ def _deviance_hessian(self, coef, X, y, weights, link): weights=weights, link=link) return 2*info_matrix + def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): + """Calculates eta (linear predictor), mu, score function (derivative + of log-likelihood) and Fisher matrix (all with phi=1) all in one go""" + n_samples, n_features = X.shape + # eta = linear predictor + eta = safe_sparse_dot(X, coef, dense_output=True) + mu = link.inverse(eta) + sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) + d1 = link.inverse_derivative(eta) # = h'(eta) + # Alternatively: + # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g + # d1 = 1./link.derivative(mu) + d1_sigma_inv = sparse.dia_matrix((sigma_inv*d1, 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d1_sigma_inv, (y-mu), dense_output=True) + score = safe_sparse_dot(X.T, temp, dense_output=True) + # + d2_sigma_inv = sparse.dia_matrix((sigma_inv*(d1**2), 0), + shape=(n_samples, n_samples)) + temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) + fisher = safe_sparse_dot(X.T, temp, dense_output=False) + return eta, mu, score, fisher + def starting_mu(self, y, weights=1): """Starting values for the mean mu_i in (unpenalized) IRLS.""" return ((weights*y+np.mean(weights*y)) / @@ -670,8 +697,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): alpha = a + b and l1_ratio = a / (a + b) The parameter `l1_ratio` corresponds to alpha in the glmnet R package while - alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio - = 1 is the lasso penalty. + 'alpha' corresponds to the lambda parameter in glmnet. Specifically, + l1_ratio = 1 is the lasso penalty. Read more in the :ref:`User Guide `. 
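To make the penalty parametrization described above concrete, the following standalone sketch (illustrative only, not part of the patch; the helper name and the coefficient vector are made up) evaluates the combined term alpha * l1_ratio * ||P1 w||_1 + alpha * (1 - l1_ratio) / 2 * w^T P2 w that is added to half the deviance:

    import numpy as np

    def elastic_net_penalty(coef, alpha=1.0, l1_ratio=0.5, P1=None, P2=None):
        # per-feature L1 weights default to 1, the L2 (Tikhonov) matrix to the identity
        n_features = coef.shape[0]
        P1 = np.ones(n_features) if P1 is None else np.asarray(P1)
        P2 = np.eye(n_features) if P2 is None else np.asarray(P2)
        l1_term = alpha * l1_ratio * np.sum(np.abs(P1 * coef))       # lasso part
        l2_term = 0.5 * alpha * (1 - l1_ratio) * coef @ (P2 @ coef)  # ridge part
        return l1_term + l2_term

    w = np.array([0.5, -1.0, 0.0])
    # l1_ratio=1 gives a pure lasso penalty, l1_ratio=0 a pure ridge penalty
    print(elastic_net_penalty(w, alpha=0.5, l1_ratio=1.0))  # 0.75
    print(elastic_net_penalty(w, alpha=0.5, l1_ratio=0.0))  # 0.3125

With l1_ratio between 0 and 1 both terms contribute, which is exactly the alpha = a + b, l1_ratio = a / (a + b) reading given above.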
@@ -686,6 +713,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO: For `alpha` > 0, the feature matrix `X` is assumed to be standardized. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + Otherwise, the strength of the penalty is different for the features. TODO: Estimation of the dispersion parameter phi. @@ -742,19 +770,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'newton-cg', 'lbfgs'}, optional (defaul='auto') + solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ + optional (defaul='auto') Algorithm to use in the optimization problem. + - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. + - 'irls' is iterated reweighted least squares (Fisher scoring). It is the standard algorithm for GLMs. Cannot deal with L1 penalties. - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'auto' sets 'irls'. + - 'cd' is the coordinate descent algorithm. It can deal with L1 and + L2 penalties. max_iter : int, optional (default=100) - TODO + The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, @@ -781,6 +813,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): This option only applies if ``warm_start=False`` or if fit is called the first time (``self.coef_`` does not exist). + selection : str, optional (default='random') + For the solver 'cd' (coordinate descent), the coordinates (features) + can be updated in either cyclic or random order. + If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially by default. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator that selects a random + feature to be updated for solver 'cd' (coordinate descent). + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. 
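As a usage illustration of the solver-related options documented above, here is a hedged sketch; it assumes the estimator API exactly as proposed in this patch (not the released scikit-learn API), and the toy data are made up:

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[1.0, 0.5], [2.0, 1.0], [3.0, 1.5], [4.0, 3.0]])
    y = np.array([1.0, 2.0, 2.0, 4.0])

    # pure L2 penalty (l1_ratio=0): solver='auto' resolves to 'irls'
    reg_l2 = GeneralizedLinearRegressor(alpha=0.1, l1_ratio=0,
                                        family='poisson', link='log',
                                        solver='auto', max_iter=100, tol=1e-6)
    reg_l2.fit(X, y)

    # mixed L1/L2 penalty: only the coordinate descent solver handles the L1 part
    reg_enet = GeneralizedLinearRegressor(alpha=0.1, l1_ratio=0.5,
                                          family='poisson', link='log',
                                          solver='cd', selection='cyclic',
                                          random_state=0)
    reg_enet.fit(X, y)
    print(reg_l2.coef_, reg_enet.coef_)

Passing solver='irls', 'lbfgs' or 'newton-cg' together with a non-zero l1_ratio raises an error, since those solvers cannot handle the non-smooth L1 term.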
@@ -816,7 +865,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', fit_dispersion='chisqr', solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params=None, copy_X=True, + tol=1e-4, warm_start=False, start_params=None, + selection='random', random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio @@ -831,6 +881,8 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, self.tol = tol self.warm_start = warm_start self.start_params = start_params + self.selection = selection + self.random_state = random_state self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -925,24 +977,21 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver == 'auto': - solver = 'irls' - else: - solver = self.solver - if solver not in ['irls', 'lbfgs', 'newton-cg']: + if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: raise ValueError("GeneralizedLinearRegressor supports only irls, " - "lbfgs and newton-cg solvers, got {0}" - "".format(solver)) - if self.alpha > 0: - if (self.l1_ratio > 0 and - solver not in []): - # TODO: Add solver for L1 - # raise ValueError("The solver option (solver={0}) is not " - # "appropriate for the chosen penalty which" - # " includes L1 (alpha={1})." - # .format(solver, self.alpha)) - raise NotImplementedError("Currently, no solver is implemented" - " that can deal with L1 penalties.") + "auto, lbfgs, newton-cg and cd solvers, got {0}" + "".format(self.solver)) + solver = self.solver + if self.solver == 'auto': + if self.l1_ratio == 0: + solver = 'irls' + else: + solver = 'cd' + if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: raise ValueError("Maximum number of iteration must be positive;" " got (max_iter={0!r})".format(self.max_iter)) @@ -953,7 +1002,14 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is not None and start_params != 'least_squares': + if start_params is None: + pass + elif isinstance(start_params, six.string_types): + if start_params not in ['least_squares']: + raise ValueError("The argument start_params must be None, " + "'least-squares' or an array of right length," + " got(start_params={0})".format(start_params)) + else: start_params = np.atleast_1d(start_params) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): @@ -963,6 +1019,12 @@ def fit(self, X, y, sample_weight=None): .format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) + + if self.selection not in ['cyclic', 'random']: + raise ValueError("The argument selection must be 'cyclic' or " + "'random', got (selection={0})" + .format(self.selection)) + random_state = check_random_state(self.random_state) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -974,15 +1036,16 @@ def fit(self, X, y, sample_weight=None): P1 = np.ones(X.shape[1]) else: P1 = np.atleast_1d(np.copy(self.P1)) - if (P1.shape[0] != X.shape[1]) or (P1.ndim != 1): + if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." .format(P1.shape[0], X.shape[1])) if self.P2 is None: - P2 = np.ones(X.shape[1]) - if sparse.issparse(X): + if not sparse.issparse(X): + P2 = np.ones(X.shape[1]) + else: P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() else: @@ -1024,6 +1087,12 @@ def fit(self, X, y, sample_weight=None): l2 = self.alpha * (1-self.l1_ratio) P1 *= l1 P2 *= l2 + # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') + # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric + if sparse.issparse(P2): + P2 = 0.5 * (P2 + P2.transpose()) + else: + P2 = 0.5 * (P2 + P2.T) # 1.3 additional validations ########################################## if self.check_input: @@ -1033,14 +1102,20 @@ def fit(self, X, y, sample_weight=None): .format(family.__class__.__name__)) if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") - # check that P2 is positive semidefinite + # check if P1 has only non-negative values, negative values might + # indicate group lasso in the future. 
+ if self.P1 is not None: + if not np.all(P1 >= 0): + raise ValueError("P1 must not have negative values.") + # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite if self.P2 is not None: if sparse.issparse(P2): # TODO: check sparse P2 for non-negativeness - raise NotImplementedError("Check sparse P2 for " - "non-negaitveness is not yet " - "implemented.") + # raise NotImplementedError("Check sparse P2 for " + # "non-negaitveness is not yet " + # "implemented.") + pass elif P2.ndim == 2: if not np.all(np.linalg.eigvals(P2) >= -1e-15): raise ValueError("P2 must be positive definite.") @@ -1090,7 +1165,8 @@ def fit(self, X, y, sample_weight=None): else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif self.start_params == 'least_squares': + elif (isinstance(self.start_params, six.string_types) and + self.start_params == 'least_squares'): if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1102,7 +1178,7 @@ def fit(self, X, y, sample_weight=None): reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: - # TODO: Does this make sense? + # TODO: Does this make sense at all? reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio) reg.fit(Xnew, link.link(y)) @@ -1125,14 +1201,17 @@ def fit(self, X, y, sample_weight=None): # D2 = link.inverse_derivative(eta)^2 = D^2 # W = D2/V(mu) # l2 = alpha * (1 - l1_ratio) - # Obj' = d(Obj)/d(w) = 1/2 Dev' + P2 w + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w # = -X' D (y-mu)/V(mu) + l2 P2 w # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 # Use Fisher matrix instead of full info matrix -X'(...) X, # i.e. E[Dev''] with E[y-mu]=0: # Obj'' ~ X' W X + l2 P2 - # (1): w = (X' W X + l2 P2)^-1 X' W z, with z = eta + D^-1 (y-mu) - # Note: P2 = l2*P2, see above + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: we already set P2 = l2*P2, see above + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: ' denotes derivative, but also transpose for matrices if solver == 'irls': # eta = linear predictor eta = safe_sparse_dot(Xnew, coef, dense_output=True) @@ -1150,9 +1229,8 @@ def fit(self, X, y, sample_weight=None): # working observations z = eta + (y-mu)/hp # solve A*coef = b - # A = X' W X + l2 P2, b = X' W z + # A = X' W X + P2, b = X' W z coef = _irls_step(Xnew, W, P2, z) - # updated linear predictor # do it here for updated values for tolerance eta = safe_sparse_dot(Xnew, coef, dense_output=True) @@ -1242,25 +1320,182 @@ def Hs(s): # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin # An Improved GLMNET for L1-regularized Logistic Regression, # Journal of Machine Learning Research 13 (2012) 1999-2030 - # Note: Use Fisher matrix instead of Hessian + # Note: Use Fisher matrix instead of Hessian for H # # 1. find optimal descent direction d by minimizing # min_d F(w+d) = min_d F(w+d) - F(w) - # F = f + g; f(w) = 1/2 dev; g(w) = 1/2*w*P2*w + ||P1*w||_1 - # 2. quadrdatic approx of f(w+d)-f(w): - # q(d) = f'(w)*d +1/2 d*H*d - # min_d q(d) + g(w+d) - g(w) + # F = f + g, f(w) = 1/2 deviance, g(w) = 1/2 w*P2*w + ||P1*w||_1 + # 2. quadrdatic approximation of F(w+d)-F(w) = q(d): + # using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives + # q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + # + ||P1*(w+d)||_1 - ||P1*w||_1 + # min_d q(d) # 3. 
coordinate descent by updating coordinate j (d -> d+z*e_j): - # min_z q(d+z*e_j) + g(w+d+z*e_j) - g(w) - # = min_z q(d+z e_j) - q(d) + g(w+d+z*e_j) - g(w+d) - # TODO - # elif solver == 'cd': + # min_z q(d+z*e_j) + # = min_z q(d+z*e_j) - q(d) + # = min_z A_j z + 1/2 B_jj z^2 + # + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + # A = f'(w) + d*H(w) + (w+d)*P2 + # B = H+P2 + # Note: we already set P2 = l2*P2, P1 = l1*P1, see above + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: f' = -score, H = Fisher matrix + elif solver == 'cd': # line search parameters - # (beta, sigma) = (0.5, 0.01) - # for iteration k from 1 to maxiter - # for coordinate j sample at random - # np.random.choice(coord, replace = False) - # + (beta, sigma) = (0.5, 0.01) + # max inner loops (cycles through all features) + max_inner_iter = 1000 + # some precalculations + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # initial stopping tolerance of inner loop + # use L1-norm of minimum-norm of subgradient of F + # fp_wP2 = f'(w) + w*P2 + if P2.ndim == 1: + fp_wP2 = -score + coef*P2 + else: + fp_wP2 = -score + safe_sparse_dot(coef, P2) + inner_tol = (np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2+np.sign(coef)*P1)) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while self.n_iter_ < self.max_iter: + self.n_iter_ += 1 + # initialize search direction d (to be optimized) + d = np.zeros_like(coef) + # inner loop + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + d = np.zeros_like(coef) + # A = f'(w) + d*H(w) + (w+d)*P2 + # B = H+P2 + # Note: f'=-score and H=fisher are updated at the end of outer + # iteration + B = fisher + if P2.ndim == 1: + coef_P2 = coef * P2 + B[np.diag_indices_from(B)] += P2 + else: + coef_P2 = safe_sparse_dot(coef, P2) + B += P2 + A = -score + coef_P2 # + d*(H+P2) but d=0 so far + inner_iter = 0 + while inner_iter < max_inner_iter: + inner_iter += 1 + if self.selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, ee 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + a = A[j] + b = B[j, j] + if P1[j] == 0: + if b == 0: + z = 0 + else: + z = -a/b + elif a + P1[j] < b * (coef[j]+d[j]): + if b == 0: + z = 0 + else: + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[j]+d[j]): + if b == 0: + z = 0 + else: + z = -(a - P1[j])/b + else: + z = -(coef[j] + d[j]) + # update direction d + d[j] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if sparse.issparse(B): + if sparse.isspmatrix_csc(B): + # slice columns + A += B[:, j].toarray().ravel() * z + else: + # slice rows + A += B[j, :].toarray().ravel() * z + else: + A += B[j, :] * z + # end of cycle + # stopping criterion for inner loop + # sum_i(|minimum-norm subgrad of q(d)_i|) + mn_subgrad = (np.where(coef + d == 0, + np.sign(A)*np.maximum(np.abs(A)-P1, 0), + A+np.sign(coef+d)*P1)) + mn_subgrad = np.sum(np.abs(mn_subgrad)) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. 
+ break + # end of inner loop + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1*coef, ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * ( + safe_sparse_dot(-score, d) + + safe_sparse_dot(coef_P2, d) + + linalg.norm(P1*(coef+d), ord=1) - + P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * safe_sparse_dot(coef_P2, coef) + + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + mu_wd = link.inverse(safe_sparse_dot(Xnew, coef+la*d, + dense_output=True)) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1*(coef+la*d), ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * safe_sparse_dot((coef+la*d)*P2, coef+la*d) + else: + Fwd += 0.5 * (safe_sparse_dot(coef+la*d, + safe_sparse_dot(P2, coef+la*d))) + if Fwd-Fw <= sigma*la*bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # stopping criterion for outer loop + # sum_i(|minimum-norm subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + if P2.ndim == 1: + fp_wP2 = -score + coef*P2 + else: + fp_wP2 = -score + safe_sparse_dot(coef, P2) + mn_subgrad = (np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2+np.sign(coef)*P1)) + mn_subgrad = np.sum(np.abs(mn_subgrad)) + if mn_subgrad <= self.tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the number of iterations (currently {0})" + .format(self.max_iter), ConvergenceWarning) ####################################################################### # 5. postprocessing # diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index de7de90db967b..87cc8bea45f5b 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,6 +1,7 @@ import numpy as np from numpy.testing import assert_allclose import scipy as sp +from scipy import sparse from sklearn.linear_model.glm import ( Link, @@ -126,7 +127,7 @@ def test_glm_identiy_regression(): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: for family in families: glm = GeneralizedLinearRegressor( alpha=0, family=family, fit_intercept=False, solver=solver) @@ -162,28 +163,31 @@ def test_normal_ridge(): rng = np.random.RandomState(0) alpha = 1.0 - # With more samples than features + # 1. 
With more samples than features n_samples, n_features, n_predict = 6, 5, 10 y = rng.randn(n_samples) X = rng.randn(n_samples, n_features) T = rng.randn(n_predict, n_features) # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, + solver='svd', normalize=False) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg']: + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', - fit_intercept=True, solver=solver) + fit_intercept=True, tol=1e-6, + max_iter=100, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='svd', normalize=False) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -192,28 +196,30 @@ def test_normal_ridge(): assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) - # With more features than samples + # 2. With more features than samples and sparse n_samples, n_features, n_predict = 5, 10, 10 y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - T = rng.randn(n_predict, n_features) + X = sparse.csr_matrix(rng.randn(n_samples, n_features)) + T = sparse.csr_matrix(rng.randn(n_predict, n_features)) # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, + solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', fit_intercept=True, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, normalize=False) + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -240,9 +246,41 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, family='poisson', - link='log', tol=1e-10) + s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 7, 'cd': 7} + for solver in 
['irls', 'lbfgs', 'newton-cg', 'cd']: + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=1e-7, + solver=solver, max_iter=200) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=s_dec[solver]) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=s_dec[solver]) + + +def test_poisson_enet(): + """Test elastic net regression with poisson family and LogLink + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0.5, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.03550978409 + # a 0.16936423283 + # b . + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', tol=1e-7) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, decimal=7) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + assert_almost_equal(glm.intercept_, -0.03550978409, decimal=7) + assert_array_almost_equal(glm.coef_, [0.16936423283, 0.], decimal=7) From 91497a2abc4824cdcb72f88dc26c9fd347d54b0d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 20:37:27 +0100 Subject: [PATCH 011/269] [WIP] Add Generalized Linear Models (#9405) * bug for sparse matrices for newton-cg solver, function grad_hess * reduce precision for solver newton-cg in test_poisson_ridge * remedy doctest issues in linear_model.rst for example of GeneralizedLinearRegressor * remove unused import of xrange from six --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/glm.py | 3 +-- sklearn/linear_model/tests/test_glm.py | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 834466e494a4a..1f0946e97b059 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -919,7 +919,7 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +SKIP + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, copy_X=True, family='poisson', fit_dispersion='chisqr', fit_intercept=True, l1_ratio=0, link='log', max_iter=100, diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 3de82c20f33cf..25f3ee1f52a2e 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -53,7 +53,6 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..externals import six -from ..externals.six.moves import xrange from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg @@ -1288,7 +1287,7 @@ def grad_hess(coef, X, y, weights, link): if P2.ndim == 1: hessian[np.diag_indices_from(hessian)] += P2 else: - hessian += P2 + hessian = hessian + P2 def Hs(s): ret = safe_sparse_dot(hessian, s) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 87cc8bea45f5b..c48c59ebd0eda 100644 --- 
a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -246,7 +246,7 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 7, 'cd': 7} + s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', From b9e5105ddb011a2a4efd74eeb3033ebb824fa5a8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 24 Jan 2018 21:44:10 +0100 Subject: [PATCH 012/269] [WIP] Add Generalized Linear Models (#9405) * bug in cd solver for sparse matrices * higer precision (smaller tol) in test_normal_ridge for sparse matrices * for each solver a separate precision (tol) in test_poisson_ridge --- sklearn/linear_model/glm.py | 2 +- sklearn/linear_model/tests/test_glm.py | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 25f3ee1f52a2e..b428ee7509d14 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1376,7 +1376,7 @@ def Hs(s): B[np.diag_indices_from(B)] += P2 else: coef_P2 = safe_sparse_dot(coef, P2) - B += P2 + B = B + P2 A = -score + coef_P2 # + d*(H+P2) but d=0 so far inner_iter = 0 while inner_iter < max_inner_iter: diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index c48c59ebd0eda..baad852dfb945 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -207,19 +207,20 @@ def test_normal_ridge(): solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, family='normal', link='identity', - fit_intercept=True, solver=solver) + fit_intercept=True, solver=solver, + max_iter=300) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', fit_intercept=False, solver='irls') glm.fit(X, y) @@ -247,11 +248,12 @@ def test_poisson_ridge(): X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} + s_tol = {'irls': 1e-8, 'lbfgs': 1e-7, 'newton-cg': 1e-7, 'cd': 1e-8} for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', - link='log', tol=1e-7, - solver=solver, max_iter=200) + link='log', tol=s_tol[solver], + solver=solver, max_iter=300) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, decimal=s_dec[solver]) From e317422e9dd860c4ed5a3c6ac6191eb8e560c365 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 25 Jan 2018 21:44:04 +0100 Subject: [PATCH 013/269] [WIP] Add Generalized Linear Models 
(#9405) * improved documentation * additional option 'zero' for argument start_params * validation of sample_weight in function predict * input validation of estimate_phi * set default fit_dispersion=None * bug in estimate_phi because of weight rescaling * test for estimate_phi in normal ridge regression * extended tests for elastic net poisson --- sklearn/linear_model/glm.py | 116 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 39 +++++++-- 2 files changed, 110 insertions(+), 45 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b428ee7509d14..e5eda6108052c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -11,13 +11,13 @@ # TODO: deal with option self.copy_X # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. -# TODO: Add cross validation +# TODO: Add cross validation support # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? -# TODO: Negative values in P1 are not allowed so far. They could be used form -# group lasse. +# TODO: Negative values in P1 are not allowed so far. They could be used to +# for group lasso. # Design Decisions: # - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. @@ -642,7 +642,7 @@ def _irls_step(X, W, P2, z): ------- coef: array, shape = (X.shape[1]) """ - # TODO: scipy.linalg.solve if faster, but ordinary least squares uses + # TODO: scipy.linalg.solve is faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? n_samples, n_features = X.shape if sparse.issparse(X): @@ -709,16 +709,20 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): (penalized) maximum likelihood which is equivalent to minimizing the deviance. - TODO: For `alpha` > 0, the feature matrix `X` is assumed to be - standardized. Call + For `alpha` > 0, the feature matrix `X` should be standardized in order to + penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - Otherwise, the strength of the penalty is different for the features. TODO: Estimation of the dispersion parameter phi. - TODO: Notes on weights and 'scaled' distributions. For Poisson, this means - to fit y = z/w with z=counts and w=exposure (time, money, persons, ...) - => y is a ratio with weights w. Same for other distributions. + If your target `y` is a ratio, you should also provide appropriate weights + `w`. As an example, consider Poission distributed counts `z` (integers) and + weights `w`=exposure (time, money, persons years, ...), then you fit + `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, + sample_weight=w)``. You need the weights for the right mean, consider: + :math:`\bar(y) = \frac{\sum_i w_i y_i}{\sum_i w_i}`. + In this case one might say that y has a 'scaled' Poisson distributions. + The same holds for other distributions. Parameters ---------- @@ -800,8 +804,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to fit), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. 
- start_params : None or array of shape (n_features, ) or 'least_squares'}, \ - optional (default=None) + start_params : {None, 'least_squares', 'zero'} or array of shape \ + (n_features, ) or }, optional (default=None) If an array of size n_features is supplied, use these as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. @@ -854,16 +858,18 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations of the solver. - Notes - ----- References ---------- - TODO + For the coordinate descent implementation: + .. [1] Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', - fit_dispersion='chisqr', solver='auto', max_iter=100, + fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params=None, selection='random', random_state=None, copy_X=True, check_input=True, verbose=0): @@ -1004,9 +1010,10 @@ def fit(self, X, y, sample_weight=None): if start_params is None: pass elif isinstance(start_params, six.string_types): - if start_params not in ['least_squares']: + if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " - "'least-squares' or an array of right length," + "'least-squares', 'zero' or an array of right" + " length," " got(start_params={0})".format(start_params)) else: start_params = np.atleast_1d(start_params) @@ -1129,6 +1136,7 @@ def fit(self, X, y, sample_weight=None): # deviance = sum(sample_weight * unit_deviance), # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights_sum = np.sum(weights) weights = weights/np.sum(weights) ####################################################################### @@ -1141,7 +1149,8 @@ def fit(self, X, y, sample_weight=None): coef = None if self.warm_start and hasattr(self, "coef_"): if self.fit_intercept: - coef = np.concatenate((self.intercept_, self.coef_)) + coef = np.concatenate((np.array([self.intercept_]), + self.coef_)) else: coef = self.coef_ elif self.start_params is None: @@ -1164,24 +1173,27 @@ def fit(self, X, y, sample_weight=None): else: # with L1 penalty, start with coef = 0 coef = np.zeros(n_features) - elif (isinstance(self.start_params, six.string_types) and - self.start_params == 'least_squares'): - if self.alpha == 0: - reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ - elif self.l1_ratio <= 0.01: - # ElasticNet says l1_ratio <= 0.01 is not reliable, use Ridge - reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ - else: - # TODO: Does this make sense at all? 
- reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio) - reg.fit(Xnew, link.link(y)) - coef = reg.coef_ + elif isinstance(self.start_params, six.string_types): + if self.start_params == 'zero': + coef = np.zeros(n_features) + elif self.start_params == 'least_squares': + if self.alpha == 0: + reg = LinearRegression(copy_X=True, fit_intercept=False) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + elif self.l1_ratio <= 0.01: + # ElasticNet says l1_ratio <= 0.01 is not reliable + # => use Ridge + reg = Ridge(copy_X=True, fit_intercept=False, + alpha=self.alpha) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ + else: + # TODO: Does this make sense at all? + reg = ElasticNet(copy_X=True, fit_intercept=False, + alpha=self.alpha, l1_ratio=self.l1_ratio) + reg.fit(Xnew, link.link(y)) + coef = reg.coef_ else: coef = start_params @@ -1365,6 +1377,7 @@ def Hs(s): d = np.zeros_like(coef) # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper d = np.zeros_like(coef) # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 @@ -1508,7 +1521,8 @@ def Hs(s): self.coef_ = coef if self.fit_dispersion in ['chisqr', 'deviance']: - self.dispersion_ = self.estimate_phi(y, X, weights) + # attention because of rescaling of weights + self.dispersion_ = self.estimate_phi(y, X, weights)*weights_sum return self @@ -1544,9 +1558,23 @@ def predict(self, X, sample_weight=1): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - # TODO: validation of sample_weight eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) + if sample_weight is None: + return mu + elif np.isscalar(sample_weight): + if sample_weight <= 0: + raise ValueError("Sample weight must be positive, " + "got (sample_weight={0})." + .format(sample_weight)) + else: + sample_weights = np.atleast_1d(sample_weight) + if sample_weight.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar.") + elif sample_weight.shape[0] != mu.shape[0]: + raise ValueError("Sample weights must have the same length as" + " X.shape[1].") + return mu*sample_weight def estimate_phi(self, y, X, sample_weight): @@ -1554,10 +1582,20 @@ def estimate_phi(self, y, X, sample_weight): Returns the estimate. """ check_is_fitted(self, "coef_") + _dtype = [np.float64, np.float32] + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ + n_features += 1 + if n_samples <= n_features: + raise ValueError("Estimation of dispersion parameter phi requires" + " more samples than features, got" + " samples=X.shape[0]={0} and" + " n_features=X.shape[1]+fit_intercept={1}." + .format(n_samples, n_features)) mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': chisq = np.sum(sample_weight*(y-mu)**2 / diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index baad852dfb945..ee90cd51c874d 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -164,7 +164,7 @@ def test_normal_ridge(): alpha = 1.0 # 1. 
With more samples than features - n_samples, n_features, n_predict = 6, 5, 10 + n_samples, n_features, n_predict = 10, 5, 10 y = rng.randn(n_samples) X = rng.randn(n_samples, n_features) T = rng.randn(n_predict, n_features) @@ -189,12 +189,16 @@ def test_normal_ridge(): ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, family='normal', link='identity', - fit_intercept=False, solver='irls') + fit_intercept=False, solver='irls', + fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_) assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + mu = glm.predict(X) + assert_almost_equal(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) # 2. With more features than samples and sparse n_samples, n_features, n_predict = 5, 10, 10 @@ -278,11 +282,34 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . + glmnet_intercept = -0.03550978409 + glmnet_coef = [0.16936423283, 0.] X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', tol=1e-7) + link='log', solver='cd', tol=1e-7) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + + # same for start_params='zero' with reduced precision + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-5, + start_params='zero') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + # start_params='least_squares' with different alpha + glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, + family='poisson', + link='log', solver='cd', tol=1e-5, + start_params='zero') + glm.fit(X, y) + # warm start with original alpha and use of sparse matrices + glm.warm_start = True + glm.alpha = 1 + X = sparse.csr_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.03550978409, decimal=7) - assert_array_almost_equal(glm.coef_, [0.16936423283, 0.], - decimal=7) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) From 9a9818441d605bf86547651997db81e969f41cdf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 25 Jan 2018 22:59:46 +0100 Subject: [PATCH 014/269] [WIP] Add Generalized Linear Models (#9405) * new helper function _check_weights for validation of sample_weight * fix white space issue in doctest of linear_model.rst --- doc/modules/linear_model.rst | 8 ++--- sklearn/linear_model/glm.py | 59 +++++++++++++++++------------------- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 1f0946e97b059..f7b0ca0cc7add 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -921,10 +921,10 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion='chisqr', - fit_intercept=True, l1_ratio=0, link='log', max_iter=100, - random_state=None, selection='random', solver='auto', - start_params=None, tol=0.0001, 
verbose=0, warm_start=False) + copy_X=True, family='poisson', fit_dispersion='chisqr', + fit_intercept=True, l1_ratio=0, link='log', max_iter=100, + random_state=None, selection='random', solver='auto', + start_params=None, tol=0.0001, verbose=0, warm_start=False) >>> reg.coef_ array([ 0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index e5eda6108052c..138830ea431c9 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -59,6 +59,26 @@ from ..utils.validation import check_is_fitted, check_random_state +def _check_weights(sample_weight, n_samples): + if sample_weight is None: + weights = np.ones(n_samples) + elif np.isscalar(sample_weight): + if sample_weight < 0: + raise ValueError("Sample weights must be non-negative.") + weights = sample_weight*np.ones(n_samples) + else: + weights = np.atleast_1d(sample_weight) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != n_samples: + raise ValueError("Sample weights must have the same length as" + " y") + if not np.all(sample_weight >= 0): + raise ValueError("Sample weights must be non-negative.") + + return weights + + class Link(six.with_metaclass(ABCMeta)): """Abstract base class for Link funtions """ @@ -925,17 +945,7 @@ def fit(self, X, y, sample_weight=None): dtype=_dtype, y_numeric=True, multi_output=False) y = y.astype(np.float64) - if sample_weight is None: - weights = np.ones_like(y) - elif np.isscalar(sample_weight): - weights = sample_weight*np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) - if weights.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar") - elif weights.shape[0] != y.shape[0]: - raise ValueError("Sample weights must have the same length as" - " y") + weights = _check_weights(sample_weight, y.shape[0]) # 1.2 validate arguments of __init__ ################################## # Garantee that self._family_instance is an instance of class @@ -1544,7 +1554,7 @@ def linear_predictor(self, X): return safe_sparse_dot(X, self.coef_, dense_output=True) + self.intercept_ - def predict(self, X, sample_weight=1): + def predict(self, X, sample_weight=None): """Predict uing GLM with feature matrix X. If sample_weight is given, returns prediction*sample_weight. @@ -1558,26 +1568,13 @@ def predict(self, X, sample_weight=1): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ + weights = _check_weights(sample_weight, X.shape[0]) eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) - if sample_weight is None: - return mu - elif np.isscalar(sample_weight): - if sample_weight <= 0: - raise ValueError("Sample weight must be positive, " - "got (sample_weight={0})." - .format(sample_weight)) - else: - sample_weights = np.atleast_1d(sample_weight) - if sample_weight.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar.") - elif sample_weight.shape[0] != mu.shape[0]: - raise ValueError("Sample weights must have the same length as" - " X.shape[1].") - return mu*sample_weight + return mu*weights - def estimate_phi(self, y, X, sample_weight): + def estimate_phi(self, y, X, sample_weight=None): """Estimation of the dispersion parameter. Returns the estimate. 
""" @@ -1586,6 +1583,7 @@ def estimate_phi(self, y, X, sample_weight): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape + weights = _check_weights(sample_weight, n_samples) eta = safe_sparse_dot(X, self.coef_, dense_output=True) if self.fit_intercept is True: eta += self.intercept_ @@ -1640,10 +1638,7 @@ def score(self, X, y, sample_weight=None): # Note, default score defined in RegressorMixin is R^2 score. # TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) - if sample_weight is None: - weights = np.ones_like(y) - else: - weights = np.atleast_1d(sample_weight) + weights = _check_weights(sample_weight, y.shape[0]) mu = self.predict(X) dev = self._family_instance.deviance(y, mu, weights=weights) y_mean = np.average(y, weights=weights) From db9defe6b7637fe022034ca7f435f4fd37f6c118 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 08:33:59 +0100 Subject: [PATCH 015/269] [WIP] Add Generalized Linear Models (#9405) * fit_dispersion default=None also in docs. * improved docs. * fixed input validation of predict * fixed bug for sample_weight in estimate_phi --- sklearn/linear_model/glm.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 138830ea431c9..22a8be9e50828 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -754,7 +754,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): case, the design matrix X must have full column rank (no collinearities). - l1_ratio : float, optional (defaul=0) + l1_ratio : float, optional (default=0) The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a @@ -768,7 +768,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). - P2 : None or array of shape (n_features*, n_features*) + P2 : None or array of shape (n_features*, n_features*), optional\ + (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov regularization). @@ -781,20 +782,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ of class ExponentialDispersionModel, optional(default='normal') - the distributional assumption of the GLM. + the distributional assumption of the GLM, i.e. which loss function to + be minimized. link : {'identity', 'log'} or an instance of class Link, optional (default='identity') the link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul='chisqr') + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ - optional (defaul='auto') + optional (default='auto') Algorithm to use in the optimization problem. - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. 
@@ -830,11 +832,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. If 'least_squares' is set, the result of a least squares fit in the - link space (linear predictor) is taken. If ``None``, the start values - are calculated by setting mu to family.starting_mu(..) and one step of - irls. - This option only applies if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not exist). + link space (linear predictor) is taken. + If 'zero' is set, all coefficients start with zero. + If ``None``, the start values are calculated by setting mu to + family.starting_mu(..) and one step of irls. + These options only apply if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). selection : str, optional (default='random') For the solver 'cd' (coordinate descent), the coordinates (features) @@ -1550,7 +1553,9 @@ def linear_predictor(self, X): Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") - X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype='numeric', copy=True, ensure_2d=True, + allow_nd=False) return safe_sparse_dot(X, self.coef_, dense_output=True) + self.intercept_ @@ -1568,9 +1573,10 @@ def predict(self, X, sample_weight=None): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - weights = _check_weights(sample_weight, X.shape[0]) + # validation of X in linear_predictor eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) + weights = _check_weights(sample_weight, X.shape[0]) return mu*weights @@ -1596,11 +1602,11 @@ def estimate_phi(self, y, X, sample_weight=None): .format(n_samples, n_features)) mu = self._link_instance.inverse(eta) if self.fit_dispersion == 'chisqr': - chisq = np.sum(sample_weight*(y-mu)**2 / + chisq = np.sum(weights*(y-mu)**2 / self._family_instance.unit_variance(mu)) return chisq/(n_samples - n_features) elif self.fit_dispersion == 'deviance': - dev = self._family_instance.deviance(y, mu, sample_weight) + dev = self._family_instance.deviance(y, mu, weights) return dev/(n_samples - n_features) # Note: check_estimator(GeneralizedLinearRegressor) might raise From dc7fdd7f8f6a3a2276f5f96aa6d5dd6ad3ce853e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 08:41:24 +0100 Subject: [PATCH 016/269] [WIP] Add Generalized Linear Models (#9405) * improved docs --- sklearn/linear_model/tests/test_glm.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index ee90cd51c874d..0af837c9c73f3 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -177,7 +177,8 @@ def test_normal_ridge(): glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, tol=1e-6, - max_iter=100, solver=solver) + max_iter=100, solver=solver, + random_state=42) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -214,7 +215,7 @@ def test_normal_ridge(): glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, family='normal', link='identity', fit_intercept=True, solver=solver, - max_iter=300) + max_iter=300, random_state=42) glm.fit(X, y) 
assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) @@ -257,7 +258,8 @@ def test_poisson_ridge(): glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', link='log', tol=s_tol[solver], - solver=solver, max_iter=300) + solver=solver, max_iter=300, + random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, decimal=s_dec[solver]) @@ -282,20 +284,23 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . + rand = 0 glmnet_intercept = -0.03550978409 glmnet_coef = [0.16936423283, 0.] X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-7) + link='log', solver='cd', tol=1e-7, + selection='random', random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) - # same for start_params='zero' with reduced precision + # same for start_params='zero' and selection='cyclic' + # with reduced precision glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', link='log', solver='cd', tol=1e-5, - start_params='zero') + selection='cyclic', start_params='zero') glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) From b11d06ba72865c14b0532c6d6c34d264a09d7ae4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 16:57:45 +0100 Subject: [PATCH 017/269] [WIP] Add Generalized Linear Models (#9405) * fixed input validation of X in predict --- sklearn/linear_model/glm.py | 4 +++- sklearn/linear_model/tests/test_glm.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 22a8be9e50828..032ded86816dd 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1573,7 +1573,9 @@ def predict(self, X, sample_weight=None): C : array, shape = (n_samples) Returns predicted values times sample_weight. """ - # validation of X in linear_predictor + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype='numeric', copy=True, ensure_2d=True, + allow_nd=False) eta = self.linear_predictor(X) mu = self._link_instance.inverse(eta) weights = _check_weights(sample_weight, X.shape[0]) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 0af837c9c73f3..776edd8aeec46 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -284,7 +284,6 @@ def test_poisson_enet(): # (Intercept) -0.03550978409 # a 0.16936423283 # b . - rand = 0 glmnet_intercept = -0.03550978409 glmnet_coef = [0.16936423283, 0.] 
X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T From 9e6c01378a4cb245824bcb9429b3d566652af743 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 26 Jan 2018 17:37:44 +0100 Subject: [PATCH 018/269] [WIP] Add Generalized Linear Models (#9405) * redundant line of code 'd = np.zeros_like(coef)' --- doc/modules/linear_model.rst | 2 +- sklearn/linear_model/glm.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index f7b0ca0cc7add..dcb35b6a5d941 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -921,7 +921,7 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion='chisqr', + copy_X=True, family='poisson', fit_dispersion=None, fit_intercept=True, l1_ratio=0, link='log', max_iter=100, random_state=None, selection='random', solver='auto', start_params=None, tol=0.0001, verbose=0, warm_start=False) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 032ded86816dd..8856af8ec698a 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -73,7 +73,7 @@ def _check_weights(sample_weight, n_samples): elif weights.shape[0] != n_samples: raise ValueError("Sample weights must have the same length as" " y") - if not np.all(sample_weight >= 0): + if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") return weights @@ -1132,7 +1132,7 @@ def fit(self, X, y, sample_weight=None): if sparse.issparse(P2): # TODO: check sparse P2 for non-negativeness # raise NotImplementedError("Check sparse P2 for " - # "non-negaitveness is not yet " + # "non-negativeness is not yet " # "implemented.") pass elif P2.ndim == 2: @@ -1391,7 +1391,6 @@ def Hs(s): # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper - d = np.zeros_like(coef) # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 # Note: f'=-score and H=fisher are updated at the end of outer From bad0190a22623eae3a2f6dfcdb0fd8caee625111 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 27 Jan 2018 20:38:02 +0100 Subject: [PATCH 019/269] [WIP] Add Generalized Linear Models (#9405) * added test to compare to ElasticNet * deleted identical comment lines --- sklearn/linear_model/tests/test_glm.py | 29 +++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 776edd8aeec46..9990cafe2cbcf 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -12,7 +12,7 @@ GammaDistribution, InverseGaussianDistribution, GeneralizedHyperbolicSecand, GeneralizedLinearRegressor) -from sklearn.linear_model.ridge import Ridge +from sklearn.linear_model import ElasticNet, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, @@ -267,14 +267,37 @@ def test_poisson_ridge(): decimal=s_dec[solver]) +def test_normal_enet(): + """Tet elastic net regression with normal/gaussian family""" + rng = np.random.RandomState(0) + alpha, l1_ratio = 0.3, 0.7 + n_samples, n_features = 20, 2 + X = rng.randn(n_samples, n_features).copy(order='F') + beta = rng.randn(n_features) + y = 2 + np.dot(X, beta) + rng.randn(n_samples) + + glm 
= GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, + family='normal', link='identity', + fit_intercept=True, tol=1e-7, + max_iter=100, selection='cyclic', + solver='cd', start_params='zero', + check_input=False) + glm.fit(X, y) + + enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, + normalize=False, tol=1e-7, copy_X=True) + enet.fit(X, y) + + assert_almost_equal(glm.intercept_, enet.intercept_) + assert_array_almost_equal(glm.coef_, enet.coef_) + + def test_poisson_enet(): """Test elastic net regression with poisson family and LogLink Compare to R's glmnet""" # library("glmnet") # options(digits=10) - # library("glmnet") - # options(digits=10) # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) # x <- data.matrix(df[,c("a", "b")]) # y <- df$y From 48137d86079c9a8efd15d57e719e3ee35f1644c9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 11:52:19 +0100 Subject: [PATCH 020/269] [WIP] Add Generalized Linear Models (#9405) * increased precision in test_normal_enet --- sklearn/linear_model/tests/test_glm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 9990cafe2cbcf..6f8bdd3a72f40 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -278,18 +278,18 @@ def test_normal_enet(): glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, family='normal', link='identity', - fit_intercept=True, tol=1e-7, + fit_intercept=True, tol=1e-8, max_iter=100, selection='cyclic', solver='cd', start_params='zero', check_input=False) glm.fit(X, y) enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, - normalize=False, tol=1e-7, copy_X=True) + normalize=False, tol=1e-8, copy_X=True) enet.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_) - assert_array_almost_equal(glm.coef_, enet.coef_) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) def test_poisson_enet(): From 2c2a077a2e8c57bdf3c945678526d6efdb5763e9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 12:42:03 +0100 Subject: [PATCH 021/269] [WIP] Add Generalized Linear Models (#9405) * better doc for heavy tailed distributions --- doc/modules/linear_model.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index dcb35b6a5d941..5cb8e54afbb06 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -905,7 +905,8 @@ are the following: * If the target values are positive valued and skewed, you might try a Gamma deviance. - * If the target values seem to be heavy tailed, you might try an Inverse Gaussian deviance (or even higher variance power of the Tweedie family). + * If the target values seem to be heavier tailed than a Gamma distribution, you might try an Inverse Gaussian deviance (or even higher variance powers of the Tweedie family). + Keep in mind that the mean is not a good measure for very heavy tailed distributions, cf. extreme value theory. 
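An illustrative (not doctested) sketch of how these choices map onto the estimator; per its docstring, the ``family`` parameter accepts either one of the family strings or an ``ExponentialDispersionModel`` instance such as ``TweedieDistribution``::

    from sklearn.linear_model import GeneralizedLinearRegressor
    from sklearn.linear_model.glm import TweedieDistribution

    # counts or frequencies: Poisson deviance with log-link
    reg_counts = GeneralizedLinearRegressor(family='poisson', link='log')

    # positive, skewed targets: Gamma deviance with log-link
    reg_skewed = GeneralizedLinearRegressor(family='gamma', link='log')

    # heavier tails: Inverse Gaussian deviance, here via its Tweedie power
    reg_heavy = GeneralizedLinearRegressor(family=TweedieDistribution(power=3),
                                           link='log')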
Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, From 15931c3148b68c47aa3c3c19983525ae758a0981 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 28 Jan 2018 15:18:26 +0100 Subject: [PATCH 022/269] [WIP] Add Generalized Linear Models (#9405) * improved input validation and testing of them --- sklearn/linear_model/glm.py | 24 +++- sklearn/linear_model/tests/test_glm.py | 174 ++++++++++++++++++++++++- 2 files changed, 189 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 8856af8ec698a..33df6b4c9b850 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -63,11 +63,14 @@ def _check_weights(sample_weight, n_samples): if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): - if sample_weight < 0: + if sample_weight <= 0: raise ValueError("Sample weights must be non-negative.") weights = sample_weight*np.ones(n_samples) else: - weights = np.atleast_1d(sample_weight) + _dtype = [np.float64, np.float32] + weights = check_array(sample_weight, accept_sparse='csr', + force_all_finite=True, ensure_2d=False, + dtype=_dtype) if weights.ndim > 1: raise ValueError("Sample weight must be 1D array or scalar") elif weights.shape[0] != n_samples: @@ -75,6 +78,9 @@ def _check_weights(sample_weight, n_samples): " y") if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") + elif not np.sum(weights) > 0: + raise ValueError("Sample weights must have at least one positive " + "element.") return weights @@ -1010,10 +1016,12 @@ def fit(self, X, y, sample_weight=None): "with L1 penalties, which are included with " "(alpha={1}) and (l1_ratio={2})." 
.format(solver, self.alpha, self.l1_ratio)) - if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0: - raise ValueError("Maximum number of iteration must be positive;" + if (not isinstance(self.max_iter, six.integer_types) + or self.max_iter <= 0): + raise ValueError("Maximum number of iteration must be a positive " + "integer;" " got (max_iter={0!r})".format(self.max_iter)) - if not isinstance(self.tol, numbers.Number) or self.tol < 0: + if not isinstance(self.tol, numbers.Number) or self.tol <= 0: raise ValueError("Tolerance for stopping criteria must be " "positive; got (tol={0!r})".format(self.tol)) if not isinstance(self.warm_start, bool): @@ -1029,7 +1037,9 @@ def fit(self, X, y, sample_weight=None): " length," " got(start_params={0})".format(start_params)) else: - start_params = np.atleast_1d(start_params) + start_params = check_array(start_params, accept_sparse='csr', + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" @@ -1160,7 +1170,7 @@ def fit(self, X, y, sample_weight=None): # set start values for coef coef = None - if self.warm_start and hasattr(self, "coef_"): + if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: coef = np.concatenate((np.array([self.intercept_]), self.coef_)) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 6f8bdd3a72f40..1abbcf0540e28 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -5,7 +5,7 @@ from sklearn.linear_model.glm import ( Link, - # IdentityLink, + IdentityLink, LogLink, TweedieDistribution, NormalDistribution, PoissonDistribution, @@ -16,7 +16,8 @@ from sklearn.utils.testing import ( assert_equal, assert_almost_equal, - assert_array_equal, assert_array_almost_equal) + assert_array_equal, assert_array_almost_equal, + assert_raises) def test_link_properties(): @@ -102,6 +103,34 @@ def f(coef): assert_allclose(fisher, approx, rtol=1e-3) +def test_sample_weights_validation(): + """Test the raised errors in the validation of sample_weight""" + # 1. scalar value but not positive + X = [[1]] + y = [1] + weights = 0 + glm = GeneralizedLinearRegressor(fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y, weights) + + # 2. 2d array + weights = [[0]] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 3. 1d but wrong length + weights = [1, 0] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 4. 1d but only zeros (sum not greater than 0) + weights = [0, 0] + X = [[0], [1]] + y = [1, 2] + assert_raises(ValueError, glm.fit, X, y, weights) + + # 5. 
1d but weith a negative value + weights = [2, -1] + assert_raises(ValueError, glm.fit, X, y, weights) + + def test_glm_family_argument(): """Test GLM family argument set as string """ @@ -115,6 +144,147 @@ def test_glm_family_argument(): alpha=0).fit(X, y) assert_equal(type(glm._family_instance), type(fam)) + glm = GeneralizedLinearRegressor(family='not a family', + fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_link_argument(): + """Test GLM link argument set as string + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for (l, link) in [('identity', IdentityLink()), + ('log', LogLink())]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) + + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + link='not a link') + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_alpha_argument(): + """Test GLM alpha argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for alpha in ['not a number', -4.2]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + alpha=alpha) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_l1_ratio_argument(): + """Test GLM l1_ratio argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for l1_ratio in ['not a number', -4.2, 1.1, [1]]: + glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, + l1_ratio=l1_ratio) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_fit_intercept_argument(): + """Test GLM fit_intercept argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for fit_intercept in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_solver_argument(): + """Test GLM solver argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for solver in ['not a solver', 1, [1]]: + glm = GeneralizedLinearRegressor(solver=solver) + assert_raises(ValueError, glm.fit, X, y) + + # solver not suitable for L1 penalty + for solver in ['irls', 'lbfgs', 'newton-cg']: + glm = GeneralizedLinearRegressor(solver=solver, alpha=1, l1_ratio=0.1) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_max_iter_argument(): + """Test GLM max_iter argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for max_iter in ['not a number', 0, -1, 5.5, [1]]: + glm = GeneralizedLinearRegressor(max_iter=max_iter) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_tol_argument(): + """Test GLM tol argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for tol in ['not a number', 0, -1.0, [1e-3]]: + glm = GeneralizedLinearRegressor(tol=tol) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_warm_start_argument(): + """Test GLM warm_start argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for warm_start in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(warm_start=warm_start) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_start_params_argument(): + """Test GLM start_params argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for start_params in ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]: + glm = GeneralizedLinearRegressor(start_params=start_params) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_selection_argument(): + """Test GLM selection argument + """ + y = np.array([1, 2]) + X = np.array([[1], 
[1]]) + for selection in ['not a selection', 1, 0, ['cyclic']]: + glm = GeneralizedLinearRegressor(selection=selection) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_check_input_argument(): + """Test GLM check_input argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for check_input in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(check_input=check_input) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_random_state_argument(): + """Test GLM random_state argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for random_state in ['a string', 0.5, [0]]: + glm = GeneralizedLinearRegressor(random_state=random_state) + assert_raises(ValueError, glm.fit, X, y) + + +# TODO: check P1 and P2 +# TODO: check additional validations if check_input == True def test_glm_identiy_regression(): """Test GLM regression with identity link on a simple dataset From feedba379c2a8f53a7a9792e33041f29da7a4c95 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 30 Mar 2018 18:50:06 +0200 Subject: [PATCH 023/269] [MRG] Add Generalized Linear Models (#9405) * improved input validation and testing of P1 * test case for validation of argument P2 * test case for validation of argument copy_X --- sklearn/linear_model/glm.py | 3 ++ sklearn/linear_model/tests/test_glm.py | 48 +++++++++++++++++++++----- 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 33df6b4c9b850..eae4a56ea1d95 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1065,6 +1065,9 @@ def fit(self, X, y, sample_weight=None): P1 = np.ones(X.shape[1]) else: P1 = np.atleast_1d(np.copy(self.P1)) + if P1.dtype.kind not in ['b', 'i', 'u', 'f']: + raise ValueError("P1 must be a numeric value; " + "got (dtype={0}).".format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): raise ValueError("P1 must be either None or an 1D array with " "the length of X.shape[1]; " diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 1abbcf0540e28..edf579a416973 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -187,6 +187,27 @@ def test_glm_l1_ratio_argument(): assert_raises(ValueError, glm.fit, X, y) +def test_glm_P1_argument(): + """Test GLM P1 arguments + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for P1 in [['a string', 'a string'], [1, [2]], [1, 2, 3]]: + glm = GeneralizedLinearRegressor(P1=P1) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_P2_argument(): + """Test GLM P2 arguments + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for P2 in [np.full((2, 2), 'a string'), [[1, [2]], [3, 4]], [1, 2, 3], + [[1, 2]], [[1], [2]]]: + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False) + assert_raises(ValueError, glm.fit, X, y) + + def test_glm_fit_intercept_argument(): """Test GLM fit_intercept argument """ @@ -263,27 +284,36 @@ def test_glm_selection_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_check_input_argument(): - """Test GLM check_input argument +def test_glm_random_state_argument(): + """Test GLM random_state argument """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for check_input in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(check_input=check_input) + for random_state in ['a string', 0.5, [0]]: + glm = GeneralizedLinearRegressor(random_state=random_state) assert_raises(ValueError, glm.fit, X, y) -def 
test_glm_random_state_argument(): - """Test GLM random_state argument +def test_glm_copy_X_argument(): + """Test GLM copy_X arguments """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for random_state in ['a string', 0.5, [0]]: - glm = GeneralizedLinearRegressor(random_state=random_state) + for copy_X in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(copy_X=copy_X) + assert_raises(ValueError, glm.fit, X, y) + + +def test_glm_check_input_argument(): + """Test GLM check_input argument + """ + y = np.array([1, 2]) + X = np.array([[1], [1]]) + for check_input in ['not bool', 1, 0, [True]]: + glm = GeneralizedLinearRegressor(check_input=check_input) assert_raises(ValueError, glm.fit, X, y) -# TODO: check P1 and P2 # TODO: check additional validations if check_input == True def test_glm_identiy_regression(): From 6fdfb47428571b4c8e89046a7b1f481711832f61 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 30 Mar 2018 19:41:09 +0200 Subject: [PATCH 024/269] [MRG] Add Generalized Linear Models (#9405) * fix doctest failure in example of linear_model.rst * fix dtype issue in test_glm_P2_argument --- doc/modules/linear_model.rst | 4 ++-- sklearn/linear_model/tests/test_glm.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 5cb8e54afbb06..9f85da771c6f1 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -926,8 +926,8 @@ follows: fit_intercept=True, l1_ratio=0, link='log', max_iter=100, random_state=None, selection='random', solver='auto', start_params=None, tol=0.0001, verbose=0, warm_start=False) - >>> reg.coef_ - array([ 0.24630255, 0.43373521]) + >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + array([0.24630255, 0.43373521]) >>> reg.intercept_ #doctest: +ELLIPSIS -0.76383575... diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index edf579a416973..c5d132d35bdb9 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -202,8 +202,8 @@ def test_glm_P2_argument(): """ y = np.array([1, 2]) X = np.array([[1], [1]]) - for P2 in [np.full((2, 2), 'a string'), [[1, [2]], [3, 4]], [1, 2, 3], - [[1, 2]], [[1], [2]]]: + for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Sun, 5 Aug 2018 14:48:33 +0200 Subject: [PATCH 025/269] [MRG] Add Generalized Linear Models (#9405) * fix typos in doc --- doc/modules/linear_model.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 9f85da771c6f1..a204ccb080cc9 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -934,7 +934,7 @@ follows: Mathematical formulation ------------------------ -In the unpenalized case, the assumptions are the folowing: +In the unpenalized case, the assumptions are the following: * The target values :math:`y_i` are realizations of random variables :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` @@ -951,7 +951,7 @@ same as specifying a unit variance function (they are one-to-one). Including penalties helps to avoid overfitting or, in case of L1 penalty, to obtain sparse solutions. But there are also other motivations to include them, -e.g. accounting fo dependence structure of :math:`y`. +e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. 
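For orientation, a sketch of the penalized objective in the notation of this section; the elastic-net form of the penalty follows the ``GeneralizedLinearRegressor`` docstring, while the exact scaling of the deviance term by the sample weights is assumed here rather than quoted:

.. math:: \min_{w} \quad \frac{1}{2\sum_i s_i}\sum_i s_i\, d(y_i, \mu_i)
          + \alpha \cdot \mathrm{l1\_ratio} \cdot \|P_1 w\|_1
          + \frac{\alpha\,(1-\mathrm{l1\_ratio})}{2}\, w^\top P_2 w,
          \qquad \mu_i = h((Xw)_i),

with unit deviance :math:`d(y,\mu)`, inverse link function :math:`h`, sample weights :math:`s_i`, L1 penalty weights :math:`P_1` and L2 penalty matrix :math:`P_2`.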
From 809e3a2747e451fc97b9a462cf682110d59fac25 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 26 Aug 2018 20:41:25 +0200 Subject: [PATCH 026/269] Remove test_glm_P2_argument --- sklearn/linear_model/tests/test_glm.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index c5d132d35bdb9..82b8ec2435543 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -197,15 +197,15 @@ def test_glm_P1_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_P2_argument(): - """Test GLM P2 arguments - """ - y = np.array([1, 2]) - X = np.array([[1], [1]]) - for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Thu, 30 Aug 2018 19:33:27 +0200 Subject: [PATCH 027/269] Filter out DeprecationWarning in old versions of scipy.sparse.linalg.spsolve about usage of umfpack --- sklearn/linear_model/tests/test_glm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 82b8ec2435543..e4be75ddb7a64 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -355,6 +355,7 @@ def test_glm_log_regression(): assert_array_almost_equal(res.coef_, coef) +@pytest.mark.filterwarnings('ignore:DeprecationWarning') def test_normal_ridge(): """Test ridge regression for Normal distributions From 46df5b6555938dc7c9acac67929b0633ea1354a3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 20:40:08 +0200 Subject: [PATCH 028/269] import pytest --- sklearn/linear_model/tests/test_glm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index e4be75ddb7a64..dfa205407a193 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,5 +1,6 @@ import numpy as np from numpy.testing import assert_allclose +import pytest import scipy as sp from scipy import sparse From 21f2136c083b4057868b7886880f356e58703611 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 21:17:53 +0200 Subject: [PATCH 029/269] Document arguments of abstact methods --- sklearn/linear_model/glm.py | 125 +++++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index eae4a56ea1d95..825ee6bfe8c45 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -93,12 +93,22 @@ class Link(six.with_metaclass(ABCMeta)): def link(self, mu): """The link function g(mu) with argument mu=E[Y] returns the linear predictor. + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the predicted mean. """ raise NotImplementedError @abstractmethod def derivative(self, mu): """Derivative of the link g'(mu). + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the predicted mean. """ raise NotImplementedError @@ -106,18 +116,33 @@ def derivative(self, mu): def inverse(self, lin_pred): """The inverse link function h(lin_pred) with the linear predictor as argument returns mu=E[Y]. + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative(self, lin_pred): """Derivative of the inverse link function h'(lin_pred). 
+ + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative2(self, lin_pred): """Second derivative of the inverse link function h''(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (predicted) linear predictor. """ raise NotImplementedError @@ -236,6 +261,11 @@ def include_upper_bound(self): def in_y_range(self, x): """Returns true if `x` is in the valid range of Y~EDM. + + Parameters + ---------- + x : array, shape (n_samples,) + Target values. """ if self.include_lower_bound: if self.include_upper_bound: @@ -263,12 +293,22 @@ def unit_variance(self, mu): \partial\mu^2}}\big|_{y=\mu} See also :func:`variance`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ raise NotImplementedError() @abstractmethod def unit_variance_derivative(self, mu): r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Target values. """ raise NotImplementedError() @@ -276,6 +316,17 @@ def variance(self, mu, phi=1, weights=1): r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return phi/weights * self.unit_variance(mu) @@ -284,6 +335,17 @@ def variance_derivative(self, mu, phi=1, weights=1): :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return phi/weights * self.unit_variance_derivative(mu) @@ -293,6 +355,14 @@ def unit_deviance(self, y, mu): In terms of the log-likelihood it is given by :math:`d(y,\mu) = -2\phi\cdot \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. """ raise NotImplementedError() @@ -301,9 +371,13 @@ def unit_deviance_derivative(self, y, mu): :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. - Returns - ------- - derivative: array, shape = (n_samples,) + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. """ return -2*(y-mu)/self.unit_variance(mu) @@ -313,6 +387,17 @@ def deviance(self, y, mu, weights=1): In terms of the likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{s}) - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return np.sum(weights*self.unit_deviance(y, mu)) @@ -326,6 +411,17 @@ def _deviance(self, coef, X, y, weights, link): def deviance_derivative(self, y, mu, weights=1): """The derivative w.r.t. 
`mu` of the deviance. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. """ return weights*self.unit_deviance_derivative(y, mu) @@ -464,7 +560,16 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): return eta, mu, score, fisher def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in (unpenalized) IRLS.""" + """Starting values for the mean mu_i in (unpenalized) IRLS. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ return ((weights*y+np.mean(weights*y)) / (2.*np.sum(np.ones_like(y)*weights))) @@ -656,12 +761,12 @@ def _irls_step(X, W, P2, z): X : numpy array or sparse matrix of shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array of shape (n_samples, ) + W : numpy array of shape (n_samples,) P2 : numpy array or sparse matrix of shape (n_features, n_features) The l2-penalty matrix or vector (=diagonal matrix) - z : numpy array of shape (n_samples, ) + z : numpy array of shape (n_samples,) Working observations Returns @@ -927,12 +1032,12 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- X : numpy array or sparse matrix of shape (n_samples, n_features) - Training data + Training data. - y : numpy array of shape (n_samples, ) - Target values + y : numpy array of shape (n_samples,) + Target values. - sample_weight : array of shape (n_samples, ) or None,\ + sample_weight : array of shape (n_samples,) or None,\ optinal (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has From 1faedf87eecfd2c6a668e2aec6e28a9fff8780ec Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 21:18:51 +0200 Subject: [PATCH 030/269] Pytest filter warnings use two colons --- sklearn/linear_model/tests/test_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index dfa205407a193..1ac5ccd4d3d5c 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -356,7 +356,7 @@ def test_glm_log_regression(): assert_array_almost_equal(res.coef_, coef) -@pytest.mark.filterwarnings('ignore:DeprecationWarning') +@pytest.mark.filterwarnings('ignore::DeprecationWarning') def test_normal_ridge(): """Test ridge regression for Normal distributions From 992f9819838336a9372a7cadecc53eeafef439ff Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 30 Aug 2018 22:07:33 +0200 Subject: [PATCH 031/269] Improve documentation of arguments that were so far undocumented --- sklearn/linear_model/glm.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 825ee6bfe8c45..a0d0266fa0efe 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -579,6 +579,17 @@ class TweedieDistribution(ExponentialDispersionModel): They have :math:`\mu=\mathrm{E}[Y]` and :math:`\mathrm{Var}[Y] \propto \mu^power. 
+ Special cases are: + + ===== ================ + Power Distribution + ===== ================ + 0 Normal + 1 Poisson + (0,1) Compound Poisson + 2 Gamma + 3 Inverse Gaussian + Attributes ---------- power : float @@ -586,6 +597,12 @@ class TweedieDistribution(ExponentialDispersionModel): :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): + """ + Parameters + ---------- + power : float (default=0) + Power of (of mu) of the variance function. + """ self.power = power self._upper_bound = np.Inf self._include_upper_bound = False @@ -623,6 +640,9 @@ def __init__(self, power=0): # Positive Stable self._lower_bound = 0 self._include_lower_bound = False + else: + raise ValueError('The power must be a float, i.e. real number, ' + 'got (power={})'.format(power)) @property def power(self): @@ -653,12 +673,22 @@ def include_upper_bound(self): def unit_variance(self, mu): """The unit variance of a Tweedie distribution is v(mu)=mu**power. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ return np.power(mu, self.power) def unit_variance_derivative(self, mu): """The derivative of the unit variance of a Tweedie distribution is v(mu)=power*mu**(power-1). + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. """ return self.power*np.power(mu, self.power-1) @@ -680,9 +710,6 @@ def unit_deviance(self, y, mu): return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - def likelihood(self, y, X, w, phi, weights=1): - raise NotImplementedError('This function is not (yet) implemented.') - class NormalDistribution(TweedieDistribution): """Class for the Normal (aka Gaussian) distribution""" From 06b8451ea109040371615e9e7baaa8ff505197f0 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 00:26:39 +0200 Subject: [PATCH 032/269] Further improve documentation of arguments --- sklearn/linear_model/glm.py | 43 +++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index a0d0266fa0efe..9688b1e0c9e5c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -601,7 +601,7 @@ def __init__(self, power=0): Parameters ---------- power : float (default=0) - Power of (of mu) of the variance function. + Variance power of the `unit_variance` function. """ self.power = power self._upper_bound = np.Inf @@ -798,7 +798,7 @@ def _irls_step(X, W, P2, z): Returns ------- - coef: array, shape = (X.shape[1]) + coef: array, shape (X.shape[1]) """ # TODO: scipy.linalg.solve is faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? @@ -898,7 +898,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : None or array of shape (n_features*, ), optional\ + P1 : None or array of shape (n_features*,), optional\ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The @@ -1007,7 +1007,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Attributes ---------- - coef_ : array, shape (n_features, ) + coef_ : array, shape (n_features,) Estimated coefficients for the linear predictor (X*coef_) in the GLM. 
intercept_ : float @@ -1679,7 +1679,7 @@ def Hs(s): if self.fit_dispersion in ['chisqr', 'deviance']: # attention because of rescaling of weights - self.dispersion_ = self.estimate_phi(y, X, weights)*weights_sum + self.dispersion_ = self.estimate_phi(X, y, weights)*weights_sum return self @@ -1688,12 +1688,12 @@ def linear_predictor(self, X): Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix, shape (n_samples, n_features) Samples. Returns ------- - C : array, shape = (n_samples) + C : array, shape (n_samples) Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") @@ -1709,12 +1709,15 @@ def predict(self, X, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape [n_samples,n_features] + X : numpy array or sparse matrix, shape (n_samples, n_features) Samples. + sample_weight : array of shape (n_samples,) or None , \ + (default=None) + Returns ------- - C : array, shape = (n_samples) + C : array, shape (n_samples,) Returns predicted values times sample_weight. """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], @@ -1726,9 +1729,21 @@ def predict(self, X, sample_weight=None): return mu*weights - def estimate_phi(self, y, X, sample_weight=None): - """Estimation of the dispersion parameter. + def estimate_phi(self, X, y, sample_weight=None): + """Estimation of the dispersion parameter phi. Returns the estimate. + + Parameters + ---------- + X : numpy array or sparse matrix of shape (n_samples, n_features) + Training data. + + y : numpy array, shape (n_samples,) + Target values. + + sample_weight : array of shape (n_samples,) or None,\ + optinal (default=None) + Sample weights. """ check_is_fitted(self, "coef_") _dtype = [np.float64, np.float32] @@ -1773,13 +1788,13 @@ def score(self, X, y, sample_weight=None): Parameters ---------- - X : array-like, shape = (n_samples, n_features) + X : array-like, shape (n_samples, n_features) Test samples - y : array-like of shape = (n_samples) + y : array-like, shape (n_samples,) True valeus for X. - sample_weight : array-like, shape = (n_samples), optional + sample_weight : array-like, shape = (n_samples,), optional Sample weights. Returns From c93f60d9e98a5be0d493d513e37c3c9de5167542 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 08:25:01 +0200 Subject: [PATCH 033/269] Remove parameters docstring for __init__ --- sklearn/linear_model/glm.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 9688b1e0c9e5c..021927b598822 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -597,12 +597,6 @@ class TweedieDistribution(ExponentialDispersionModel): :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): - """ - Parameters - ---------- - power : float (default=0) - Variance power of the `unit_variance` function. 
- """ self.power = power self._upper_bound = np.Inf self._include_upper_bound = False From 66ec63b5157026f8541e2761c29ef3225d89a44c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 19:24:23 +0200 Subject: [PATCH 034/269] Fix typos in docstring of TweedieDistribution --- sklearn/linear_model/glm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 021927b598822..65abb42b043d5 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -577,7 +577,7 @@ def starting_mu(self, y, weights=1): class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. They have :math:`\mu=\mathrm{E}[Y]` and - :math:`\mathrm{Var}[Y] \propto \mu^power. + :math:`\mathrm{Var}[Y] \propto \mu^power`. Special cases are: @@ -593,7 +593,7 @@ class TweedieDistribution(ExponentialDispersionModel): Attributes ---------- power : float - The variance power of the unit_variance + The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. """ def __init__(self, power=0): From 53c69702c790223d3940cd650b9d52a61e39b244 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 31 Aug 2018 22:20:43 +0200 Subject: [PATCH 035/269] Change docstring section of TweedieDistribution from Attributes to Parameters --- sklearn/linear_model/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 65abb42b043d5..fcb6c9754b826 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -590,7 +590,7 @@ class TweedieDistribution(ExponentialDispersionModel): 2 Gamma 3 Inverse Gaussian - Attributes + Parameters ---------- power : float The variance power of the `unit_variance` From 87d5ba38c5663224edd7ec10bc7efdf27172e7f8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 7 Oct 2018 19:43:42 +0200 Subject: [PATCH 036/269] Minor doc improvements of GeneralizedLinearRegressor --- sklearn/linear_model/glm.py | 87 +++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index fcb6c9754b826..01d5420773ffc 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,17 +6,16 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Write more tests -# TODO: Write examples and more docu -# TODO: deal with option self.copy_X +# TODO: Write examples +# TODO: Make option self.copy_X more meaningfull than just for start values. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. -# TODO: Add cross validation support +# TODO: Add cross validation support? # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. # TODO: Include further classes in class.rst? ExponentialDispersionModel? # TweedieDistribution? -# TODO: Negative values in P1 are not allowed so far. They could be used to +# TODO: Negative values in P1 are not allowed so far. They could be used # for group lasso. # Design Decisions: @@ -26,7 +25,7 @@ # regressor, Bernoulli/Binomial => classifier. # Solution: GeneralizedLinearRegressor since this is the focus. 
# - Allow for finer control of penalty terms: -# L1: ||P1*w||_1 with P1*w a componentwise product, this allows to exclude +# L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude # factors from the L1 penalty. # L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and @@ -322,7 +321,7 @@ def variance(self, mu, phi=1, weights=1): mu : array, shape (n_samples,) Predicted mean. - phi : float + phi : float (default=1) Dispersion parameter. weights : array, shape (n_samples,) (default=1) @@ -592,7 +591,7 @@ class TweedieDistribution(ExponentialDispersionModel): Parameters ---------- - power : float + power : float (default=0) The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. """ @@ -779,22 +778,22 @@ def _irls_step(X, W, P2, z): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {numpy array, sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array of shape (n_samples,) + W : numpy array, shape (n_samples,) - P2 : numpy array or sparse matrix of shape (n_features, n_features) - The l2-penalty matrix or vector (=diagonal matrix) + P2 : {numpy array, sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix) - z : numpy array of shape (n_samples,) + z : numpy array, shape (n_samples,) Working observations Returns ------- coef: array, shape (X.shape[1]) """ - # TODO: scipy.linalg.solve is faster, but ordinary least squares uses + # TODO: scipy.linalg.solve seems faster, but ordinary least squares uses # scipy.linalg.lstsq. What is more appropriate? n_samples, n_features = X.shape if sparse.issparse(X): @@ -892,19 +891,20 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : None or array of shape (n_features*,), optional\ + P1 : {None, array-like}, shape (n_features*,), optional\ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as an array of ones. + default value ``None`` is the same as a 1d array of ones. Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). - P2 : None or array of shape (n_features*, n_features*), optional\ - (default=None) + P2 : {None, array-like, sparse matrix}, shape \ + (n_features*, n_features*), optional (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov regularization). + The default value ``None`` is the same as the idendity matrix. Note that n_features* = X.shape[1] = length of coef_ (intercept always excluded from counting). P2 must be positive semi-definite. @@ -939,8 +939,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. - - 'cd' is the coordinate descent algorithm. It can deal with L1 and - L2 penalties. + - 'cd' is the coordinate descent algorithm. It can + deal with L1 as well as L2 penalties. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. 
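A hedged sketch of the finer penalty control described for ``P1`` and ``P2``; the difference-based ``P2`` mirrors the construction used in the P-spline example added later in this series, and the data here is invented for illustration::

    import numpy as np
    from scipy.linalg import toeplitz
    from sklearn.linear_model import GeneralizedLinearRegressor

    n_features = 5
    # P1: exclude the last coefficient from the L1 penalty
    P1 = np.ones(n_features)
    P1[-1] = 0
    # P2: difference-based Tikhonov penalty matrix (positive semi-definite)
    P2 = toeplitz([2, -1] + [0] * (n_features - 2)).astype(float)
    P2[0, 0] = P2[-1, -1] = 1

    rng = np.random.RandomState(0)
    X = rng.rand(20, n_features)
    y = rng.rand(20) + 1.0

    reg = GeneralizedLinearRegressor(alpha=0.1, l1_ratio=0.5,
                                     P1=P1, P2=P2, solver='cd')
    reg.fit(X, y)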
@@ -958,8 +958,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to fit), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. - start_params : {None, 'least_squares', 'zero'} or array of shape \ - (n_features, ) or }, optional (default=None) + start_params : {None, 'least_squares', 'zero', array of shape \ + (n_features, )}, optional (default=None) If an array of size n_features is supplied, use these as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. @@ -979,7 +979,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. - random_state : int, RandomState instance or None, optional (default=None) + random_state : {int, RandomState instance, None}, optional (default=None) The seed of the pseudo random number generator that selects a random feature to be updated for solver 'cd' (coordinate descent). If int, random_state is the seed used by the random @@ -1052,13 +1052,13 @@ def fit(self, X, y, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. - y : numpy array of shape (n_samples,) + y : array-like, shape (n_samples,) Target values. - sample_weight : array of shape (n_samples,) or None,\ + sample_weight : {None, array-like}, shape (n_samples,),\ optinal (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has @@ -1190,12 +1190,12 @@ def fit(self, X, y, sample_weight=None): if self.P1 is None: P1 = np.ones(X.shape[1]) else: - P1 = np.atleast_1d(np.copy(self.P1)) + P1 = np.copy(np.atleast_1d(self.P1)) if P1.dtype.kind not in ['b', 'i', 'u', 'f']: raise ValueError("P1 must be a numeric value; " "got (dtype={0}).".format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): - raise ValueError("P1 must be either None or an 1D array with " + raise ValueError("P1 must be either None or a 1d array with " "the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." @@ -1324,6 +1324,7 @@ def fit(self, X, y, sample_weight=None): coef = _irls_step(Xnew, W, P2, z) else: # with L1 penalty, start with coef = 0 + # TODO: Are there better options? coef = np.zeros(n_features) elif isinstance(self.start_params, six.string_types): if self.start_params == 'zero': @@ -1353,7 +1354,7 @@ def fit(self, X, y, sample_weight=None): # 4. fit # ####################################################################### # algorithms for optimiation - # TODO: Parallelize it + # TODO: Parallelize it? self.n_iter_ = 0 converged = False # 4.1 IRLS ############################################################ @@ -1682,12 +1683,12 @@ def linear_predictor(self, X): Parameters ---------- - X : numpy array or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Samples. Returns ------- - C : array, shape (n_samples) + C : array, shape (n_samples,) Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") @@ -1703,17 +1704,18 @@ def predict(self, X, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix, shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Samples. 
- sample_weight : array of shape (n_samples,) or None , \ - (default=None) + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Returns ------- C : array, shape (n_samples,) Returns predicted values times sample_weight. """ + # TODO: Is copy=True necessary? X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', copy=True, ensure_2d=True, allow_nd=False) @@ -1729,14 +1731,14 @@ def estimate_phi(self, X, y, sample_weight=None): Parameters ---------- - X : numpy array or sparse matrix of shape (n_samples, n_features) + X : {array-like, sparse matrix}, shape (n_samples, n_features) Training data. - y : numpy array, shape (n_samples,) + y : array-like, shape (n_samples,) Target values. - sample_weight : array of shape (n_samples,) or None,\ - optinal (default=None) + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Sample weights. """ check_is_fitted(self, "coef_") @@ -1782,13 +1784,14 @@ def score(self, X, y, sample_weight=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) - Test samples + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Test samples. y : array-like, shape (n_samples,) - True valeus for X. + True values of target. - sample_weight : array-like, shape = (n_samples,), optional + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) Sample weights. Returns From a9ae023ec331e782d7a23de18f7e6fbb0dd1f57d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 8 Oct 2018 20:21:35 +0200 Subject: [PATCH 037/269] Double escape in doctring of GeneralizedLinearRegressor --- sklearn/linear_model/glm.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 01d5420773ffc..535af60289f8e 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -866,13 +866,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): TODO: Estimation of the dispersion parameter phi. - If your target `y` is a ratio, you should also provide appropriate weights - `w`. As an example, consider Poission distributed counts `z` (integers) and - weights `w`=exposure (time, money, persons years, ...), then you fit + If the target `y` is a ratio, appropriate weights `w` should be provided. + As an example, consider Poission distributed counts `z` (integers) and + weights `w`=exposure (time, money, persons years, ...). Then you fit `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, - sample_weight=w)``. You need the weights for the right mean, consider: - :math:`\bar(y) = \frac{\sum_i w_i y_i}{\sum_i w_i}`. - In this case one might say that y has a 'scaled' Poisson distributions. + sample_weight=w)``. The weights are necessary for the right mean, consider: + :math:`\\bar(y) = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`. + In this case one might say that 'y' has a 'scaled' Poisson distributions. The same holds for other distributions. Parameters @@ -891,7 +891,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : {None, array-like}, shape (n_features*,), optional\ + P1 : {None, array-like}, shape (n_features*,), optional \ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). 
The From bb62485166412d8ba6393e5ba753b015b806867c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 31 Dec 2018 16:54:40 +0100 Subject: [PATCH 038/269] Add example for GeneralizedLinearRegressor * add example * improve docstring of GeneralizedLinearRegressor * improve user guide for GeneralizedLinearRegressor --- doc/modules/linear_model.rst | 34 +++++--- .../plot_poisson_spline_regression.py | 83 +++++++++++++++++++ sklearn/linear_model/glm.py | 22 ++--- 3 files changed, 115 insertions(+), 24 deletions(-) create mode 100644 examples/linear_model/plot_poisson_spline_regression.py diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index a204ccb080cc9..d65f7ed121f8e 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -901,12 +901,15 @@ for a more versatile L2 penalty. Use cases, where a loss different from the squared loss might be appropriate, are the following: - * If the target values :math:`y` are counts (integer valued) or frequencies, you might try a Poisson deviance. + * If the target values :math:`y` are counts (non-negative integer valued) or + frequencies (non-negative), you might use a Poisson deviance with log-link. - * If the target values are positive valued and skewed, you might try a Gamma deviance. + * If the target values are positive valued and skewed, you might try a + Gamma deviance with log-link. - * If the target values seem to be heavier tailed than a Gamma distribution, you might try an Inverse Gaussian deviance (or even higher variance powers of the Tweedie family). - Keep in mind that the mean is not a good measure for very heavy tailed distributions, cf. extreme value theory. + * If the target values seem to be heavier tailed than a Gamma distribution, + you might try an Inverse Gaussian deviance (or even higher variance powers + of the Tweedie family). Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, @@ -931,6 +934,11 @@ follows: >>> reg.intercept_ #doctest: +ELLIPSIS -0.76383575... + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_spline_regression.py` + Mathematical formulation ------------------------ @@ -956,20 +964,20 @@ e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. -The deviance is defined by +The deviance is defined by the log of the EDM likelihood as .. 
math:: D(y, \mu) = -2\phi\cdot \left(loglike(y,\mu,\frac{\phi}{s}) - loglike(y,y,\frac{\phi}{s})\right) -===================================== ================================= -Distribution Variance Function :math:`v(\mu)` -===================================== ================================= -Normal ("normal") :math:`1` -Poisson ("poisson") :math:`\mu` -Gamma ("gamma") :math:`\mu^2` -Inverse Gaussian ("inverse.gaussian") :math:`\mu^3` -===================================== ================================= +===================================== =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Deviance :math:`D(y, \mu)` +===================================== =============================== ================================= ============================================ +Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{/mu}-y+\mu)` +Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +===================================== =============================== ================================= ============================================ Two remarks: diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py new file mode 100644 index 0000000000000..b98bca5d8f867 --- /dev/null +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -0,0 +1,83 @@ +""" +================================= +Poisson Regression with B-Splines +================================= + +As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` +example, a Poisson regression with penalized B-splines (P-splines) [1]_ is +fitted on slightly different sinusodial, Poisson distributed data and +compared to an AdaBoost model with decision trees. +One can see, that this is a hard problem for both estimators. + +.. [1] Eilers, Paul H. C.; Marx, Brian D. "Flexible smoothing with B -splines + and penalties". Statist. Sci. 11 (1996), no. 2, 89--121. 
+ `doi:10.1214/ss/1038425655 + `_ + +""" +print(__doc__) + +# Author: Christian Lorentzen +# based on the AdaBoost regression example from Noel Dawe +# License: BSD 3 clause + +# importing necessary libraries +import numpy as np +from scipy.linalg import toeplitz +from scipy.interpolate import BSpline +import matplotlib.pyplot as plt +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import AdaBoostRegressor +from sklearn.linear_model import GeneralizedLinearRegressor + + +# Create the dataset +xmin, xmax = 0, 6 +rng = np.random.RandomState(1) +X = np.linspace(xmin, xmax, 500)[:, np.newaxis] +y_true = 0.5 * (2.1 + np.sin(X).ravel() + np.sin(6 * X).ravel()) +y = rng.poisson(y_true, X.shape[0]) + +# b-spline basis +nknots, degree = 40, 3 +ns = nknots - degree - 1 # number of base spline functions +dx = (xmax - xmin) / (nknots - 1 - 2 * degree) +knots = np.linspace(xmin - degree * dx, 6 + degree * dx, nknots) +coef = np.zeros(ns) +splineBasis = np.empty((X.shape[0], ns), dtype=float) +for i in range(ns): + coef[i] = 1 + splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ + .ravel() + coef[i] = 0 + +# second order difference matrix +P2 = toeplitz([2, -1] + [0] * (ns - 2)).astype(float) +P2[0, 0] = P2[-1, -1] = 1 + +# Fit regression model +regr_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), + n_estimators=10, random_state=rng) + +regr_2 = GeneralizedLinearRegressor(family='poisson', link='log', + fit_intercept=True, alpha=0.02, + l1_ratio=0.1, P2=P2) + +regr_1.fit(X, y) +regr_2.fit(splineBasis, y) + +# Predict +y_1 = regr_1.predict(X) +y_2 = regr_2.predict(splineBasis) + +# Plot the results +plt.figure() +plt.plot(X, y_true, c="b", label="true mean") +plt.scatter(X, y, c="k", marker='.', label="training samples") +plt.plot(X, y_1, c="g", label="AdaBoost n_estimator=10", linewidth=2) +plt.plot(X, y_2, c="r", label="Poisson GLM with B-splines", linewidth=2) +plt.xlabel("data") +plt.ylabel("target") +plt.title("Regression Comparison") +plt.legend() +plt.show() diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 535af60289f8e..37afc8da2d6db 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -6,7 +6,7 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Write examples +# TODO: Write more examples. # TODO: Make option self.copy_X more meaningfull than just for start values. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. @@ -832,7 +832,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): with inverse link function `h` and s=sum of `sample_weight` (which equals n_samples for `sample_weight=None`). - For `P1`=`P2`=identity, the penalty is the elastic net:: + For `P1=P2=identity`, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 @@ -868,11 +868,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If the target `y` is a ratio, appropriate weights `w` should be provided. As an example, consider Poission distributed counts `z` (integers) and - weights `w`=exposure (time, money, persons years, ...). Then you fit - `y = z/w`, i.e. ``GeneralizedLinearModel(family='Poisson').fit(X, y, - sample_weight=w)``. The weights are necessary for the right mean, consider: - :math:`\\bar(y) = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`. 
- In this case one might say that 'y' has a 'scaled' Poisson distributions. + weights `w=exposure` (time, money, persons years, ...). Then you fit + `y = z/w`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=w)``. The weights are necessary for the right meanself. + Consider :math:`\\bar{y} = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`, + in this case one might say that `y` has a 'scaled' Poisson distributions. The same holds for other distributions. Parameters @@ -1017,10 +1017,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): References ---------- For the coordinate descent implementation: - .. [1] Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - An Improved GLMNET for L1-regularized Logistic Regression, - Journal of Machine Learning Research 13 (2012) 1999-2030 - https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', From 16d064db7cee1d59569d21631cc2fa41be8b3b14 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 1 Jan 2019 11:58:41 +0100 Subject: [PATCH 039/269] Resolve merge conflicts * resolve merge conflicts in linear_model.rst * replace BSpline by splev to support older scipy versions --- doc/modules/linear_model.rst | 14 +++++++------- .../linear_model/plot_poisson_spline_regression.py | 8 +++++--- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d65f7ed121f8e..09f14735c2907 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -876,18 +876,18 @@ to warm-starting (see :term:`Glossary `). .. _Generalized_linear_regression: -Generalized linear regression +Generalized Linear Regression ============================= :class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two -ways [8]_. First, the predicted values :math:`\hat{y}` are linked to a linear +ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as .. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [9]_. The objective function beeing minimized +exponential dispersion model (EDM) [11]_. The objective function beeing minimized becomes .. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 @@ -983,7 +983,7 @@ Two remarks: * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. - (19)-(20) in [10]_. + (19)-(20) in [12]_. * If you want to model a frequency, i.e. counts per exposure (time, volume, ...) you can do so by a Poisson distribution and passing @@ -993,12 +993,12 @@ Two remarks: .. topic:: References: - .. [8] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - .. [9] Jørgensen, B. (1992). 
The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. See also `Exponential dispersion model. `_ - .. [10] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ + .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py index b98bca5d8f867..fce85fae1ea8c 100644 --- a/examples/linear_model/plot_poisson_spline_regression.py +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -24,7 +24,8 @@ # importing necessary libraries import numpy as np from scipy.linalg import toeplitz -from scipy.interpolate import BSpline +# from scipy.interpolate import BSpline +from scipy.interpolate import splev import matplotlib.pyplot as plt from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import AdaBoostRegressor @@ -47,8 +48,9 @@ splineBasis = np.empty((X.shape[0], ns), dtype=float) for i in range(ns): coef[i] = 1 - splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ - .ravel() +# splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ +# .ravel() + splineBasis[:, i] = splev(X, (knots, coef, degree)).ravel() coef[i] = 0 # second order difference matrix From 1a02a901d1a6d99484241ca205178ac61fc47846 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 1 Jan 2019 12:59:30 +0100 Subject: [PATCH 040/269] Adapt for minimum numpy version * replace np.block --- sklearn/linear_model/glm.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 37afc8da2d6db..851767055c61c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -1235,8 +1235,11 @@ def fit(self, X, y, sample_weight=None): P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), dtype=P2.dtype).tocsr() else: - P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], - [np.zeros((X.shape[1], 1)), P2]]) + # as of numpy 1.13 this would work: + # P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], + # [np.zeros((X.shape[1], 1)), P2]]) + P2 = np.hstack((np.zeros((X.shape[1], 1)), P2)) + P2 = np.vstack((np.zeros((1, X.shape[1]+1)), P2)) else: Xnew = X From 177eb4cc017a7262e472070b4a920250711a099c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 6 Jan 2019 19:46:19 +0100 Subject: [PATCH 041/269] Remove six dependencies as in #12639 * replace six.with_metaclass(ABCMeta) by metaclass=ABCMeta * replace six.integer_types by int * replace six.string_types by str * rebase * correct email address --- sklearn/linear_model/glm.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 851767055c61c..aca49ec7edf28 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -2,12 +2,13 @@ Generalized Linear Models with Exponential Dispersion Family """ -# Author: Christian Lorentzen +# Author: Christian Lorentzen # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause # TODO: Write more examples. -# TODO: Make option self.copy_X more meaningfull than just for start values. +# TODO: Make option self.copy_X more meaningful. 
+# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -51,7 +52,6 @@ from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning -from ..externals import six from ..utils import check_array, check_X_y from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg @@ -84,7 +84,7 @@ def _check_weights(sample_weight, n_samples): return weights -class Link(six.with_metaclass(ABCMeta)): +class Link(metaclass=ABCMeta): """Abstract base class for Link funtions """ @@ -186,7 +186,7 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) -class ExponentialDispersionModel(six.with_metaclass(ABCMeta)): +class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by @@ -1142,7 +1142,7 @@ def fit(self, X, y, sample_weight=None): "with L1 penalties, which are included with " "(alpha={1}) and (l1_ratio={2})." .format(solver, self.alpha, self.l1_ratio)) - if (not isinstance(self.max_iter, six.integer_types) + if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " "integer;" @@ -1156,7 +1156,7 @@ def fit(self, X, y, sample_weight=None): start_params = self.start_params if start_params is None: pass - elif isinstance(start_params, six.string_types): + elif isinstance(start_params, str): if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " "'least-squares', 'zero' or an array of right" @@ -1329,7 +1329,7 @@ def fit(self, X, y, sample_weight=None): # with L1 penalty, start with coef = 0 # TODO: Are there better options? coef = np.zeros(n_features) - elif isinstance(self.start_params, six.string_types): + elif isinstance(self.start_params, str): if self.start_params == 'zero': coef = np.zeros(n_features) elif self.start_params == 'least_squares': From 3d4c784df6797c244b31dd3393083d2c63114bf2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 3 Feb 2019 21:38:24 +0100 Subject: [PATCH 042/269] Improve user guide, doc and fix penalty parameter for Ridge * move parts of docstring to new Notes section * improve user guide and doc * fix typos * fix scaling of penalty parameter in Ridge() * docstring for _check_weights * reduce tol for parameter initialization --- doc/modules/linear_model.rst | 18 +++--- sklearn/linear_model/glm.py | 114 +++++++++++++++++++---------------- 2 files changed, 73 insertions(+), 59 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 09f14735c2907..174d1e4eddae4 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -890,7 +890,7 @@ Secondly, the squared loss function is replaced by the deviance :math:`D` of an exponential dispersion model (EDM) [11]_. The objective function beeing minimized becomes -.. math:: \frac{1}{2s}D(y, \hat{y}) + \alpha \rho ||P_1w||_1 +.. math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 +\frac{\alpha(1-\rho)}{2} w^T P_2 w with sample weights :math:`s`. 
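(Editorial note, not part of the patch series: the penalized objective in the hunk above can be written out directly. The sketch below assumes a Poisson deviance with log-link and dense numpy inputs; `glm_objective`, `rho` and the other names are illustrative only, not the estimator's API, and the special handling of the intercept, which the actual fit keeps out of both penalty terms, is ignored here.)

import numpy as np
from scipy.special import xlogy

def glm_objective(w, X, y, s, alpha, rho, P1, P2):
    # mean via the log-link, h(Xw) = exp(Xw), so mu stays positive
    mu = np.exp(X @ w)
    # weighted Poisson deviance D(y, mu; s) with unit deviance 2*(y*log(y/mu) - y + mu);
    # xlogy implements the convention y*log(y/mu) = 0 for y = 0
    dev = np.sum(s * 2 * (xlogy(y, y / mu) - y + mu))
    return (dev / (2 * np.sum(s))
            + alpha * rho * np.sum(np.abs(P1 * w))        # alpha*rho*||P1*w||_1, element-wise product
            + 0.5 * alpha * (1 - rho) * (w @ P2 @ w))     # alpha*(1-rho)/2 * w'P2w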
@@ -914,7 +914,7 @@ are the following: Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link -:math:`h(x)=x` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with :math:`h(Xw)=\exp(Xw)`. Note that the feature matrix `X` should be standardized before fitting. This @@ -964,17 +964,19 @@ e.g. accounting for the dependence structure of :math:`y`. The objective function, which is independent of :math:`\phi`, is minimized with respect to the coefficients :math:`w`. -The deviance is defined by the log of the EDM likelihood as +The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +likelihood as -.. math:: D(y, \mu) = -2\phi\cdot - \left(loglike(y,\mu,\frac{\phi}{s}) - - loglike(y,y,\frac{\phi}{s})\right) +.. math:: d(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) + - loglike(y,y,\phi)\right) \\ + D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) ===================================== =============================== ================================= ============================================ -Distribution Target Domain Variance Function :math:`v(\mu)` Deviance :math:`D(y, \mu)` +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` ===================================== =============================== ================================= ============================================ Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{/mu}-y+\mu)` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` ===================================== =============================== ================================= ============================================ diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index aca49ec7edf28..5fc869f81195f 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -8,7 +8,7 @@ # TODO: Write more examples. # TODO: Make option self.copy_X more meaningful. -# So far, fit uses Xnew instead of X. +# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -28,7 +28,7 @@ # - Allow for finer control of penalty terms: # L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude # factors from the L1 penalty. -# L2: w*P2*w with P2 a (demi-) positive definite matrix, e.g. P2 could be +# L2: w*P2*w with P2 a (semi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and # Tikhonov regularization). 
# - The link funtion (instance of class Link) is necessary for the evaluation @@ -59,6 +59,8 @@ def _check_weights(sample_weight, n_samples): + """Check that weights are non-negative and have the right shape + """ if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): @@ -594,6 +596,7 @@ class TweedieDistribution(ExponentialDispersionModel): power : float (default=0) The variance power of the `unit_variance` :math:`v(\mu) = \mu^{power}`. + For ``0`. - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\\mu_i=h(\\eta_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{w_i} v(\\mu_i)`. - - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - (penalized) maximum likelihood which is equivalent to minimizing the - deviance. - - For `alpha` > 0, the feature matrix `X` should be standardized in order to - penalize features equally strong. Call - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - - TODO: Estimation of the dispersion parameter phi. - - If the target `y` is a ratio, appropriate weights `w` should be provided. - As an example, consider Poission distributed counts `z` (integers) and - weights `w=exposure` (time, money, persons years, ...). Then you fit - `y = z/w`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, - sample_weight=w)``. The weights are necessary for the right meanself. - Consider :math:`\\bar{y} = \\frac{\\sum_i w_i y_i}{\\sum_i w_i}`, - in this case one might say that `y` has a 'scaled' Poisson distributions. - The same holds for other distributions. - Parameters ---------- alpha : float, optional (default=1) @@ -891,22 +871,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. - P1 : {None, array-like}, shape (n_features*,), optional \ + P1 : {None, array-like}, shape (n_features,), optional \ (default=None) With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The default value ``None`` is the same as a 1d array of ones. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). + Note that n_features = X.shape[1]. P2 : {None, array-like, sparse matrix}, shape \ - (n_features*, n_features*), optional (default=None) + (n_features, n_features), optional (default=None) With this square matrix the L2 penalty is calculated as `w P2 w`. This gives a fine control over this penalty (Tikhonov - regularization). - The default value ``None`` is the same as the idendity matrix. - Note that n_features* = X.shape[1] = length of coef_ (intercept - always excluded from counting). P2 must be positive semi-definite. + regularization). The diagonal zeros of a diagonal P2, for example, + exclude all corresponding coefficients from the L2 penalty. + The default value ``None`` is the same as the identity matrix. + Note that n_features = X.shape[1]. P2 must be positive semi-definite. fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be @@ -929,18 +908,22 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ optional (default='auto') - Algorithm to use in the optimization problem. + Algorithm to use in the optimization problem: - - 'auto' sets 'irls' if l1_ratio equals 0, else 'cd'. 
+ 'auto' + Sets 'irls' if l1_ratio equals 0, else 'cd'. - - 'irls' is iterated reweighted least squares (Fisher scoring). + 'irls' + iterated reweighted least squares (Fisher scoring). It is the standard algorithm for GLMs. Cannot deal with L1 penalties. - - 'newton-cg', 'lbfgs'. Cannot deal with L1 penalties. + 'newton-cg', 'lbfgs' + Cannot deal with L1 penalties. - - 'cd' is the coordinate descent algorithm. It can - deal with L1 as well as L2 penalties. + 'cd' + coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -959,10 +942,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): starting values for ``coef_`` and ``intercept_``. start_params : {None, 'least_squares', 'zero', array of shape \ - (n_features, )}, optional (default=None) - If an array of size n_features is supplied, use these as start values + (n_features*, )}, optional (default=None) + If an array of size n_features* is supplied, use it as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept includes the + intercept in counting. If 'least_squares' is set, the result of a least squares fit in the link space (linear predictor) is taken. If 'zero' is set, all coefficients start with zero. @@ -1013,6 +998,30 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations of the solver. + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + (penalized) maximum likelihood which is equivalent to minimizing the + deviance. + + For `alpha` > 0, the feature matrix `X` should be standardized in order to + penalize features equally strong. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + + If the target `y` is a ratio, appropriate sample weights `s` should be + provided. + As an example, consider Poission distributed counts `z` (integers) and + weights `s=exposure` (time, money, persons years, ...). Then you fit + `y = z/s`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=s)``. The weights are necessary for the right (finite + sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that `y` has a 'scaled' Poisson distributions. + The same holds for other distributions. References ---------- @@ -1138,10 +1147,10 @@ def fit(self, X, y, sample_weight=None): else: solver = 'cd' if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): - raise ValueError("The chosen solver (solver={0}) can't deal " - "with L1 penalties, which are included with " - "(alpha={1}) and (l1_ratio={2})." - .format(solver, self.alpha, self.l1_ratio)) + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " @@ -1340,14 +1349,17 @@ def fit(self, X, y, sample_weight=None): elif self.l1_ratio <= 0.01: # ElasticNet says l1_ratio <= 0.01 is not reliable # => use Ridge + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha) + alpha=self.alpha*n_samples, + tol=np.max([self.tol, np.sqrt(self.tol)])) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio) + alpha=self.alpha, l1_ratio=self.l1_ratio, + tol=np.max([self.tol, np.sqrt(self.tol)])) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: @@ -1557,7 +1569,7 @@ def Hs(s): # minimize_z: a z + 1/2 b z^2 + c |d+z| # a = A_j # b = B_jj > 0 - # c = |P1_j| = P1_j > 0, ee 1.3 + # c = |P1_j| = P1_j > 0, see 1.3 # d = w_j + d_j # cf. https://arxiv.org/abs/0708.1485 Eqs. (3) - (4) # with beta = z+d, beta_hat = d-a/b and gamma = c/b From 919912c3f98e6d1190737e344c14f31c2eef9077 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 17 Feb 2019 18:38:36 +0100 Subject: [PATCH 043/269] Smarter intercept initialization and docstring improvements * smarter initialization of intercept * PEP 257 -- Docstring Conventions * minor docstring changes --- sklearn/linear_model/glm.py | 265 +++++++++++++++++++++--------------- 1 file changed, 153 insertions(+), 112 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 5fc869f81195f..d69ccd0a66486 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -59,14 +59,13 @@ def _check_weights(sample_weight, n_samples): - """Check that weights are non-negative and have the right shape - """ + """Check that weights are non-negative and have the right shape.""" if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): if sample_weight <= 0: raise ValueError("Sample weights must be non-negative.") - weights = sample_weight*np.ones(n_samples) + weights = sample_weight * np.ones(n_samples) else: _dtype = [np.float64, np.float32] weights = check_array(sample_weight, accept_sparse='csr', @@ -75,8 +74,8 @@ def _check_weights(sample_weight, n_samples): if weights.ndim > 1: raise ValueError("Sample weight must be 1D array or scalar") elif weights.shape[0] != n_samples: - raise ValueError("Sample weights must have the same length as" - " y") + raise ValueError("Sample weights must have the same length as " + "y") if not np.all(weights >= 0): raise ValueError("Sample weights must be non-negative.") elif not np.sum(weights) > 0: @@ -87,70 +86,72 @@ def _check_weights(sample_weight, n_samples): class Link(metaclass=ABCMeta): - """Abstract base class for Link funtions - """ + """Abstract base class for Link funtions.""" @abstractmethod def link(self, mu): - """The link function g(mu) with argument mu=E[Y] returns the - linear predictor. + """Compute the link function g(mu). + + The link function links the mean mu=E[Y] to the so called linear + predictor (X*w), i.e. g(mu) = linear predictor. Parameters ---------- mu : array, shape (n_samples,) - Usually the predicted mean. + Usually the (predicted) mean. """ raise NotImplementedError @abstractmethod def derivative(self, mu): - """Derivative of the link g'(mu). + """Compute the derivative of the link g'(mu). 
Parameters ---------- mu : array, shape (n_samples,) - Usually the predicted mean. + Usually the (predicted) mean. """ raise NotImplementedError @abstractmethod def inverse(self, lin_pred): - """The inverse link function h(lin_pred) with the linear predictor as - argument returns mu=E[Y]. + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linkear predictor and the mean + mu=E[Y], i.e. h(linear predictor) = mu. Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative(self, lin_pred): - """Derivative of the inverse link function h'(lin_pred). + """Compute the derivative of the inverse link function h'(lin_pred). Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError @abstractmethod def inverse_derivative2(self, lin_pred): - """Second derivative of the inverse link function h''(lin_pred). + """Compute 2nd derivative of the inverse link function h''(lin_pred). Parameters ---------- lin_pred : array, shape (n_samples,) - Usually the (predicted) linear predictor. + Usually the (fitted) linear predictor. """ raise NotImplementedError class IdentityLink(Link): - """The identity link function g(x)=x. - """ + """The identity link function g(x)=x.""" def link(self, mu): return mu @@ -169,8 +170,7 @@ def inverse_derivative2(self, lin_pred): class LogLink(Link): - """The log link function g(x)=log(x). - """ + """The log link function g(x)=log(x).""" def link(self, mu): return np.log(mu) @@ -238,26 +238,22 @@ class ExponentialDispersionModel(metaclass=ABCMeta): @abstractproperty def lower_bound(self): - """The lower bound of values of Y~EDM. - """ + """The lower bound of values of Y~EDM.""" raise NotImplementedError() @abstractproperty def upper_bound(self): - """The upper bound of values of Y~EDM. - """ + """The upper bound of values of Y~EDM.""" raise NotImplementedError() @abstractproperty def include_lower_bound(self): - """If True, values of y may equal lower bound: y >= lower_bound. - """ + """If True, values of y may equal lower bound: y >= lower_bound.""" raise NotImplementedError() @abstractproperty def include_upper_bound(self): - """If True, values of y may equal upper bound: y <= upper_bound. - """ + """If True, values of y may equal upper bound: y <= upper_bound.""" raise NotImplementedError() def in_y_range(self, x): @@ -285,7 +281,9 @@ def in_y_range(self, x): @abstractmethod def unit_variance(self, mu): - r"""The unit variance :math:`v(\mu)` determines the variance as + r"""Compute the unit variance function. + + The unit variance :math:`v(\mu)` determines the variance as a function of the mean :math:`\mu` by :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. It can also be derived from the unit deviance :math:`d(y,\mu)` as @@ -304,7 +302,9 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - r"""The derivative of the unit variance w.r.t. `mu`, :math:`v'(\mu)`. + r"""Compute the derivative of the unit variance w.r.t. mu. + + Return :math:`v'(\mu)`. Parameters ---------- @@ -314,7 +314,9 @@ def unit_variance_derivative(self, mu): raise NotImplementedError() def variance(self, mu, phi=1, weights=1): - r"""The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + r"""Compute the variance function. 
+ + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. @@ -332,7 +334,9 @@ def variance(self, mu, phi=1, weights=1): return phi/weights * self.unit_variance(mu) def variance_derivative(self, mu, phi=1, weights=1): - r"""The derivative of the variance w.r.t. `mu`, + r"""Compute the derivative of the variance w.r.t. mu. + + Returns :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` and weights :math:`s_i`. @@ -352,10 +356,12 @@ def variance_derivative(self, mu, phi=1, weights=1): @abstractmethod def unit_deviance(self, y, mu): - r"""The unit_deviance :math:`d(y,\mu)`. - In terms of the log-likelihood it is given by + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,phi) - loglike(y,y,phi)\right).` + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` Parameters ---------- @@ -368,7 +374,9 @@ def unit_deviance(self, y, mu): raise NotImplementedError() def unit_deviance_derivative(self, y, mu): - r"""The derivative w.r.t. `mu` of the unit deviance + r"""Compute the derivative of the unit deviance w.r.t. mu. + + The derivative of the unit deviance is given by :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` with unit variance :math:`v(\mu)`. @@ -383,9 +391,12 @@ def unit_deviance_derivative(self, y, mu): return -2*(y-mu)/self.unit_variance(mu) def deviance(self, y, mu, weights=1): - r"""The deviance is given by :math:`D = \sum_i s_i \cdot d(y, \mu) + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. - In terms of the likelihood it is :math:`D = -2\phi\cdot + In terms of the log-likelihood it is :math:`D = -2\phi\cdot \left(loglike(y,\mu,\frac{phi}{s}) - loglike(y,y,\frac{phi}{s})\right)`. @@ -403,15 +414,15 @@ def deviance(self, y, mu, weights=1): return np.sum(weights*self.unit_deviance(y, mu)) def _deviance(self, coef, X, y, weights, link): - """The deviance as a function of the coefficients `coef` - (:math:`w`). - """ + """Compute the deviance as a function of the coefficients and data.""" lin_pred = safe_sparse_dot(X, coef, dense_output=True) mu = link.inverse(lin_pred) return self.deviance(y, mu, weights) def deviance_derivative(self, y, mu, weights=1): - """The derivative w.r.t. `mu` of the deviance. + """Compute the derivative of the deviance w.r.t. mu. + + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. Parameters ---------- @@ -427,7 +438,9 @@ def deviance_derivative(self, y, mu, weights=1): return weights*self.unit_deviance_derivative(y, mu) def _score(self, coef, phi, X, y, weights, link): - r"""The score function is the derivative of the + r"""Compute the score function. + + The score function is the derivative of the log-likelihood w.r.t. `coef` (:math:`w`). It is given by @@ -453,7 +466,8 @@ def _score(self, coef, phi, X, y, weights, link): return score def _fisher_matrix(self, coef, phi, X, y, weights, link): - r"""The Fisher information matrix. + r"""Compute the Fisher information matrix. 
+ The Fisher information matrix, also known as expected information matrix is given by @@ -482,9 +496,10 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): - r"""The observed information matrix. + r"""Compute the observed information matrix. + The observed information matrix, also known as the negative of - the Hessian matrix of the log-likelihood. It is given by + the Hessian matrix of the log-likelihood, is given by .. math: @@ -518,8 +533,10 @@ def _observed_information(self, coef, phi, X, y, weights, link): return observed_information def _deviance_derivative(self, coef, X, y, weights, link): - r"""The derivative w.r.t. `coef` (:math:`w`) of the deviance as a - function of the coefficients `coef`. + r"""Compute the derivative of the deviance w.r.t. coef. + + The derivative of the deviance w.r.t. `coef` (:math:`w`) as a + function of the coefficients `coef` and the data. This is equivalent to :math:`-2\phi` times the score function :func:`_score` (derivative of the log-likelihood). """ @@ -528,9 +545,11 @@ def _deviance_derivative(self, coef, X, y, weights, link): return -2*score def _deviance_hessian(self, coef, X, y, weights, link): - r"""The hessian matrix w.r.t. `coef` (:math:`w`) of the deviance - as a function of the coefficients `coef`. - This is equivalent to :math:`+2\phi` times the observed information + r"""Compute the hessian matrix of the deviance w.r.t. coef. + + The hessian of the deviance w.r.t. `coef` (:math:`w`) is evaluated as + a function of the coefficients `coef` and the data. + It is equivalent to :math:`+2\phi` times the observed information matrix. """ info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, @@ -538,8 +557,12 @@ def _deviance_hessian(self, coef, X, y, weights, link): return 2*info_matrix def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): - """Calculates eta (linear predictor), mu, score function (derivative - of log-likelihood) and Fisher matrix (all with phi=1) all in one go""" + """Compute linear predictor, mean, score function and fisher matrix. + + It calculates the linear predictor, the mean, score function + (derivative of log-likelihood) and Fisher information matrix + all in one go as function of `coef` (:math:`w`) and the data. + """ n_samples, n_features = X.shape # eta = linear predictor eta = safe_sparse_dot(X, coef, dense_output=True) @@ -561,7 +584,9 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): return eta, mu, score, fisher def starting_mu(self, y, weights=1): - """Starting values for the mean mu_i in (unpenalized) IRLS. + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. Parameters ---------- @@ -577,7 +602,9 @@ def starting_mu(self, y, weights=1): class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. - They have :math:`\mu=\mathrm{E}[Y]` and + + A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely + defined by it's mean-variance relationship :math:`\mathrm{Var}[Y] \propto \mu^power`. Special cases are: @@ -668,7 +695,7 @@ def include_upper_bound(self): return self._include_upper_bound def unit_variance(self, mu): - """The unit variance of a Tweedie distribution is v(mu)=mu**power. + """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. 
Parameters ---------- @@ -678,8 +705,8 @@ def unit_variance(self, mu): return np.power(mu, self.power) def unit_variance_derivative(self, mu): - """The derivative of the unit variance of a Tweedie distribution is - v(mu)=power*mu**(power-1). + """Compute the derivative of the unit variance of a Tweedie + distribution v(mu)=power*mu**(power-1). Parameters ---------- @@ -732,9 +759,9 @@ def __init__(self): class GeneralizedHyperbolicSecand(ExponentialDispersionModel): - """A class for the von Generalized Hyperbolic Secand (GHS) distribution. + """A class for the Generalized Hyperbolic Secand (GHS) distribution. - The GHS distribution is for data y in (-inf, inf). + The GHS distribution is for tagets y in (-inf, inf). """ def __init__(self): self._lower_bound = -np.Inf @@ -770,7 +797,7 @@ def unit_deviance(self, y, mu): def _irls_step(X, W, P2, z): - """One step in iteratively reweighted least squares + """Compute one step in iteratively reweighted least squares. Solve A w = b for w with A = (X' W X + P2) @@ -823,18 +850,18 @@ def _irls_step(X, W, P2, z): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - """Regression via a Generalized Linear Model (GLM) based on reproductive - Exponential Dispersion Models (EDM) with combined L1 and L2 priors as - regularizer. + """Regression via a Generalized Linear Model (GLM) with penalties. - Minimizes the objective function:: + GLMs based on a reproductive Exponential Dispersion Model (EDM) with + combined L1 and L2 priors as regularizer minimizes the following objective + function:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w with inverse link function `h` and s=`sample_weight` (for - `sample_weight=Nones` one has s=1 and sum(s) equals `n_samples`). + `sample_weight=None`, one has s=1 and sum(s)=`n_samples`). For `P1=P2=identity`, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 @@ -893,16 +920,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ of class ExponentialDispersionModel, optional(default='normal') - the distributional assumption of the GLM, i.e. which loss function to - be minimized. + The distributional assumption of the GLM, i.e. which distribution from + the EDM, specifies the loss function to be minimized. link : {'identity', 'log'} or an instance of class Link, optional (default='identity') - the link function of the GLM, i.e. mapping from linear predictor + The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) - method for estimation of the dispersion parameter phi. Whether to use + Method for estimation of the dispersion parameter phi. Whether to use the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. @@ -914,15 +941,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Sets 'irls' if l1_ratio equals 0, else 'cd'. 'irls' - iterated reweighted least squares (Fisher scoring). - It is the standard algorithm for GLMs. Cannot deal with + Iterated reweighted least squares (with Fisher scoring). + It is the standard algorithm for GLMs. It cannot deal with L1 penalties. 'newton-cg', 'lbfgs' Cannot deal with L1 penalties. 'cd' - coordinate descent algorithm. It can deal with L1 as well as L2 + Coordinate descent algorithm. It can deal with L1 as well as L2 penalties. 
max_iter : int, optional (default=100) @@ -930,15 +957,15 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, - the iteration will stop when ``max{|g_i | i = 1, ..., n} <= tol`` + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative of - the deviance). + the objective function). warm_start : boolean, optional (default=False) - If set to ``True``, reuse the solution of the previous call to fit as - initialization for ``coef_`` and ``intercept_`` (supersedes option + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option ``start_params``). If set to ``True`` or if the attribute ``coef_`` - does not exit (first call to fit), option ``start_params`` sets the + does not exit (first call to ``fit``), option ``start_params`` sets the starting values for ``coef_`` and ``intercept_``. start_params : {None, 'least_squares', 'zero', array of shape \ @@ -946,8 +973,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If an array of size n_features* is supplied, use it as start values for ``coef_`` in the fit. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. - Note that n_features* = X.shape[1] + fit_intercept includes the - intercept in counting. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. If 'least_squares' is set, the result of a least squares fit in the link space (linear predictor) is taken. If 'zero' is set, all coefficients start with zero. @@ -960,7 +987,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): For the solver 'cd' (coordinate descent), the coordinates (features) can be updated in either cyclic or random order. If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially by default. This + rather than looping over features sequentially in the same order. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. @@ -1057,7 +1084,7 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, self.verbose = verbose def fit(self, X, y, sample_weight=None): - """Fit a generalized linear model. + """Fit a Generalized Linear Model. Parameters ---------- @@ -1087,12 +1114,14 @@ def fit(self, X, y, sample_weight=None): _dtype = [np.float64, np.float32] X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=_dtype, y_numeric=True, multi_output=False) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. 
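        # (Editor's illustration, not from the patch: with an integer dtype,
        #  np.array([1, 2]) ** (-1) raises exactly this ValueError, whereas
        #  np.array([1.0, 2.0]) ** (-1) returns array([1. , 0.5]); hence the
        #  explicit cast of y to float64 below.)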
y = y.astype(np.float64) weights = _check_weights(sample_weight, y.shape[0]) # 1.2 validate arguments of __init__ ################################## - # Garantee that self._family_instance is an instance of class + # Guarantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): self._family_instance = self.family @@ -1112,7 +1141,8 @@ def fit(self, X, y, sample_weight=None): " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" " got (family={0})".format(self.family)) - # Garantee that self._link_instance is set to an instance of class Link + # Guarantee that self._link_instance is set to an instance of + # class Link if isinstance(self.link, Link): self._link_instance = self.link else: @@ -1127,19 +1157,19 @@ def fit(self, X, y, sample_weight=None): .format(self.link)) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: - raise ValueError("Penalty term must be non-negative;" + raise ValueError("Penalty term must be a non-negative number;" " got (alpha={0})".format(self.alpha)) if (not isinstance(self.l1_ratio, numbers.Number) or self.l1_ratio < 0 or self.l1_ratio > 1): - raise ValueError("l1_ratio must be in interval [0, 1]; got" - " (l1_ratio={0})".format(self.l1_ratio)) + raise ValueError("l1_ratio must be a number in interval [0, 1];" + " got (l1_ratio={0})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: - raise ValueError("GeneralizedLinearRegressor supports only irls, " - "auto, lbfgs, newton-cg and cd solvers, got {0}" - "".format(self.solver)) + raise ValueError("GeneralizedLinearRegressor supports only solvers" + " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" + " got {0}".format(self.solver)) solver = self.solver if self.solver == 'auto': if self.l1_ratio == 0: @@ -1168,8 +1198,8 @@ def fit(self, X, y, sample_weight=None): elif isinstance(start_params, str): if start_params not in ['least_squares', 'zero']: raise ValueError("The argument start_params must be None, " - "'least-squares', 'zero' or an array of right" - " length," + "'least-squares', 'zero' or an array of " + " correct length;" " got(start_params={0})".format(start_params)) else: start_params = check_array(start_params, accept_sparse='csr', @@ -1179,21 +1209,21 @@ def fit(self, X, y, sample_weight=None): (start_params.ndim != 1)): raise ValueError("Start values for parameters must have the" "right length and dimension; required (length" - "={0}, ndim=1), got (length={1}, ndim={2})." + "={0}, ndim=1); got (length={1}, ndim={2})." 
.format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " - "'random', got (selection={0})" + "'random'; got (selection={0})" .format(self.selection)) random_state = check_random_state(self.random_state) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) if not isinstance(self.check_input, bool): - raise ValueError("The attribute check_input must be bool; got " + raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) if self.P1 is None: @@ -1232,7 +1262,7 @@ def fit(self, X, y, sample_weight=None): link = self._link_instance if self.fit_intercept: - # intercept is first column <=> coef[0] is for intecept + # Note: intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) else: @@ -1259,10 +1289,11 @@ def fit(self, X, y, sample_weight=None): P2 *= l2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric - if sparse.issparse(P2): - P2 = 0.5 * (P2 + P2.transpose()) - else: - P2 = 0.5 * (P2 + P2.T) + if P2.ndim == 2: + if sparse.issparse(P2): + P2 = 0.5 * (P2 + P2.transpose()) + else: + P2 = 0.5 * (P2 + P2.T) # 1.3 additional validations ########################################## if self.check_input: @@ -1301,7 +1332,7 @@ def fit(self, X, y, sample_weight=None): # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) weights_sum = np.sum(weights) - weights = weights/np.sum(weights) + weights = weights/weights_sum ####################################################################### # 3. initialization of coef = (intercept_, coef_) # @@ -1338,6 +1369,8 @@ def fit(self, X, y, sample_weight=None): # with L1 penalty, start with coef = 0 # TODO: Are there better options? coef = np.zeros(n_features) + if self.fit_intercept: + coef[0] = link.link(np.mean(y)) elif isinstance(self.start_params, str): if self.start_params == 'zero': coef = np.zeros(n_features) @@ -1546,6 +1579,8 @@ def Hs(s): # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 # A = f'(w) + d*H(w) + (w+d)*P2 # B = H+P2 # Note: f'=-score and H=fisher are updated at the end of outer @@ -1694,7 +1729,7 @@ def Hs(s): return self def linear_predictor(self, X): - """The linear_predictor X*coef_ + intercept_. + """Compute the linear_predictor = X*coef_ + intercept_. Parameters ---------- @@ -1741,8 +1776,7 @@ def predict(self, X, sample_weight=None): return mu*weights def estimate_phi(self, X, y, sample_weight=None): - """Estimation of the dispersion parameter phi. - Returns the estimate. + """Estimate/fit the dispersion parameter phi. Parameters ---------- @@ -1755,6 +1789,11 @@ def estimate_phi(self, X, y, sample_weight=None): sample_weight : {None, array-like}, shape (n_samples,), optional \ (default=None) Sample weights. + + Returns + ------- + phi : float + Dispersion parameter. 
""" check_is_fitted(self, "coef_") _dtype = [np.float64, np.float32] @@ -1785,15 +1824,17 @@ def estimate_phi(self, X, y, sample_weight=None): # "AssertionError: -0.28014056555724598 not greater than 0.5" # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): - r"""Returns D^2, a generalization of the coefficient of determination - R^2, which uses deviance instead of squared error. + r"""Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 deviance. Note that those two are equal + for family='normal'. D^2 is defined as :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` is the null deviance, i.e. the deviance of a model with intercept alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is averaged by sample_weight. In the case of a Normal - distribution, D^2 equals R^2. + :math:`\bar{y}` is averaged by sample_weight. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). @@ -1812,7 +1853,7 @@ def score(self, X, y, sample_weight=None): Returns ------- score : float - D^2 of self.predict(X) wrt. y. + D^2 of self.predict(X) w.r.t. y. """ # Note, default score defined in RegressorMixin is R^2 score. # TODO: make D^2 a score function in module metrics (and thereby get From 01033e36d913756f7ff5e2214189cf1d7426dee1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 20 Feb 2019 17:20:09 +0100 Subject: [PATCH 044/269] Fix false formula in starting_mu and improve start_params --- sklearn/linear_model/glm.py | 108 ++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 48 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index d69ccd0a66486..fad7492acc2fb 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -583,7 +583,7 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): fisher = safe_sparse_dot(X.T, temp, dense_output=False) return eta, mu, score, fisher - def starting_mu(self, y, weights=1): + def starting_mu(self, y, weights=1, ind_weight=0.5): """Set starting values for the mean mu. These may be good starting points for the (unpenalized) IRLS solver. @@ -595,9 +595,13 @@ def starting_mu(self, y, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. """ - return ((weights*y+np.mean(weights*y)) / - (2.*np.sum(np.ones_like(y)*weights))) + return (ind_weight * y + + (1. - ind_weight) * np.average(y, weights=weights)) class TweedieDistribution(ExponentialDispersionModel): @@ -852,17 +856,19 @@ def _irls_step(X, W, P2, z): class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. - GLMs based on a reproductive Exponential Dispersion Model (EDM) with - combined L1 and L2 priors as regularizer minimizes the following objective - function:: + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean `mu=h(X*w)`. 
Therefore the fit minimizes + the following objective function with combined L1 and L2 priors as + regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w with inverse link function `h` and s=`sample_weight` (for - `sample_weight=None`, one has s=1 and sum(s)=`n_samples`). - For `P1=P2=identity`, the penalty is the elastic net:: + ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). + For `P1=P2=identity` (``P1=None``, ``P2=None``), the penalty is the + elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 @@ -966,24 +972,34 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): as initialization for ``coef_`` and ``intercept_`` (supersedes option ``start_params``). If set to ``True`` or if the attribute ``coef_`` does not exit (first call to ``fit``), option ``start_params`` sets the - starting values for ``coef_`` and ``intercept_``. + start values for ``coef_`` and ``intercept_``. + + start_params : {'irls', 'least_squares', 'zero', array of shape \ + (n_features*, )}, optional (default='irls') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'irls' + Start values of mu are calculated by family.starting_mu(..). Then, + one step of irls obtains start values for ``coef_`. This gives + usually good results. - start_params : {None, 'least_squares', 'zero', array of shape \ - (n_features*, )}, optional (default=None) - If an array of size n_features* is supplied, use it as start values - for ``coef_`` in the fit. If ``fit_intercept=True``, the first element + 'least_squares' + Start values for ``coef_`` are obtained by a least squares fit in the + link space (y is transformed to the space of the linear predictor). + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element is assumed to be the start value for the ``intercept_``. Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes the intercept in counting. - If 'least_squares' is set, the result of a least squares fit in the - link space (linear predictor) is taken. - If 'zero' is set, all coefficients start with zero. - If ``None``, the start values are calculated by setting mu to - family.starting_mu(..) and one step of irls. - These options only apply if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not yet exist). - selection : str, optional (default='random') + selection : str, optional (default='cyclic') For the solver 'cd' (coordinate descent), the coordinates (features) can be updated in either cyclic or random order. If set to 'random', a random coefficient is updated every iteration @@ -1005,7 +1021,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): check_input : boolean, optional (default=True) Allow to bypass several checks on input: y values in range of family, - sample_weights non-negative, P2 positive semi-definite. + sample_weight non-negative, P2 positive semi-definite. Don't use this parameter unless you know what you do. 
verbose : int, optional (default=0) @@ -1061,8 +1077,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, fit_intercept=True, family='normal', link='identity', fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params=None, - selection='random', random_state=None, copy_X=True, + tol=1e-4, warm_start=False, start_params='irls', + selection='cyclic', random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio @@ -1193,11 +1209,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) start_params = self.start_params - if start_params is None: - pass - elif isinstance(start_params, str): - if start_params not in ['least_squares', 'zero']: - raise ValueError("The argument start_params must be None, " + if isinstance(start_params, str): + if start_params not in ['irls', 'least_squares', 'zero']: + raise ValueError("The argument start_params must be 'irls', " "'least-squares', 'zero' or an array of " " correct length;" " got(start_params={0})".format(start_params)) @@ -1348,11 +1362,11 @@ def fit(self, X, y, sample_weight=None): self.coef_)) else: coef = self.coef_ - elif self.start_params is None: - if self.l1_ratio == 0: + elif isinstance(start_params, str): + if start_params == 'irls': # See 3.1 IRLS # Use mu_start and apply one irls step to calculate coef - mu = family.starting_mu(y, weights) + mu = family.starting_mu(y, weights=weights) # linear predictor eta = link.link(mu) # h'(eta) @@ -1365,16 +1379,9 @@ def fit(self, X, y, sample_weight=None): # solve A*coef = b # A = X' W X + l2 P2, b = X' W z coef = _irls_step(Xnew, W, P2, z) - else: - # with L1 penalty, start with coef = 0 - # TODO: Are there better options? - coef = np.zeros(n_features) - if self.fit_intercept: - coef[0] = link.link(np.mean(y)) - elif isinstance(self.start_params, str): - if self.start_params == 'zero': - coef = np.zeros(n_features) - elif self.start_params == 'least_squares': + elif start_params == 'least_squares': + # less restrictive tolerance for finding start values + tol = np.max([self.tol, np.sqrt(self.tol)]) if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) reg.fit(Xnew, link.link(y)) @@ -1384,18 +1391,21 @@ def fit(self, X, y, sample_weight=None): # => use Ridge # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha*n_samples, - tol=np.max([self.tol, np.sqrt(self.tol)])) + alpha=self.alpha*n_samples, tol=tol) reg.fit(Xnew, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? 
reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio, - tol=np.max([self.tol, np.sqrt(self.tol)])) + tol=tol) reg.fit(Xnew, link.link(y)) coef = reg.coef_ - else: + else: # start_params == 'zero' + coef = np.zeros(n_features) + if self.fit_intercept: + coef[0] = link.link(np.average(y, weights=weights)) + else: # assign given array as start values coef = start_params ####################################################################### @@ -1560,6 +1570,8 @@ def Hs(s): # some precalculations eta, mu, score, fisher = family._eta_mu_score_fisher( coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) # initial stopping tolerance of inner loop # use L1-norm of minimum-norm of subgradient of F # fp_wP2 = f'(w) + w*P2 @@ -1574,8 +1586,8 @@ def Hs(s): # outer loop while self.n_iter_ < self.max_iter: self.n_iter_ += 1 - # initialize search direction d (to be optimized) - d = np.zeros_like(coef) + # initialize search direction d (to be optimized) with zero + d.fill(0) # inner loop # TODO: use sparsity (coefficient already 0 due to L1 penalty) # => active set of features for featurelist, see paper From 4071a8a54de0112fd1afd6d8fc5b5585708c84ea Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 20 Feb 2019 21:15:04 +0100 Subject: [PATCH 045/269] Improve argument handling of P1 and P2 * P2 also accepts 1d array and interprets it as diagonal matrix * improved input checks for P1 and P2 --- sklearn/linear_model/glm.py | 103 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 31 ++++---- 2 files changed, 81 insertions(+), 53 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index fad7492acc2fb..33e0d75730e3a 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -45,7 +45,10 @@ from abc import ABCMeta, abstractmethod, abstractproperty import numbers import numpy as np -from scipy import linalg, optimize, sparse, special +from scipy import linalg, sparse +import scipy.sparse.linalg as splinalg +from scipy.optimize import fmin_l_bfgs_b +from scipy.special import xlogy import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -727,7 +730,7 @@ def unit_deviance(self, y, mu): if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (special.xlogy(y, y/mu) - y + mu) + return 2 * (xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -840,7 +843,7 @@ def _irls_step(X, W, P2, z): XtW = X.transpose() * W A = XtW * X + L2 b = XtW * z - coef = sparse.linalg.spsolve(A, b) + coef = splinalg.spsolve(A, b) else: XtW = (X.T * W) A = XtW.dot(X) @@ -867,7 +870,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): with inverse link function `h` and s=`sample_weight` (for ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). - For `P1=P2=identity` (``P1=None``, ``P2=None``), the penalty is the + For ``P1=P2='identity'`` (``P1=None``, ``P2=None``), the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 @@ -904,21 +907,24 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. 
- P1 : {None, array-like}, shape (n_features,), optional \ - (default=None) + P1 : {'identity', array-like}, shape (n_features,), optional \ + (default='identity') With this array, you can exclude coefficients from the L1 penalty. Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``None`` is the same as a 1d array of ones. + default value ``'identity'`` is the same as a 1d array of ones. Note that n_features = X.shape[1]. - P2 : {None, array-like, sparse matrix}, shape \ - (n_features, n_features), optional (default=None) - With this square matrix the L2 penalty is calculated as `w P2 w`. - This gives a fine control over this penalty (Tikhonov - regularization). The diagonal zeros of a diagonal P2, for example, - exclude all corresponding coefficients from the L2 penalty. - The default value ``None`` is the same as the identity matrix. - Note that n_features = X.shape[1]. P2 must be positive semi-definite. + P2 : {'identity', array-like, sparse matrix}, shape \ + (n_features,) or (n_features, n_features), optional \ + (default='identity') + With this option, you can set the P2 matrix in the L2 penalty `w*P2*w`. + This gives a fine control over this penalty (Tikhonov regularization). + A 2d array is directly used as the square matrix P2. A 1d array is + interpreted as diagonal (square) matrix. The default 'identity' sets + the identity matrix, which gives the usual squared L2-norm. If you just + want to exclude certain coefficients, pass a 1d array filled with 1, + and 0 for the coefficients to be excluded. + Note that P2 must be positive semi-definite. fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be @@ -1074,7 +1080,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Journal of Machine Learning Research 13 (2012) 1999-2030 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ - def __init__(self, alpha=1.0, l1_ratio=0, P1=None, P2=None, + def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='identity', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', @@ -1240,20 +1246,23 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) - if self.P1 is None: + if isinstance(self.P1, str) and self.P1 == 'identity': P1 = np.ones(X.shape[1]) else: - P1 = np.copy(np.atleast_1d(self.P1)) - if P1.dtype.kind not in ['b', 'i', 'u', 'f']: - raise ValueError("P1 must be a numeric value; " - "got (dtype={0}).".format(P1.dtype)) + P1 = np.atleast_1d(self.P1) + try: + P1 = P1.astype(np.float64, casting='safe', copy=True) + except TypeError: + raise TypeError("The given P1 cannot be converted to a numeric" + "array; got (P1.dtype={0})." + .format(P1.dtype)) if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): - raise ValueError("P1 must be either None or a 1d array with " - "the length of X.shape[1]; " + raise ValueError("P1 must be either 'identity' or a 1d array " + "with the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." 
.format(P1.shape[0], X.shape[1])) - if self.P2 is None: + if isinstance(self.P2, str) and self.P2 == 'identity': if not sparse.issparse(X): P2 = np.ones(X.shape[1]) else: @@ -1262,8 +1271,15 @@ def fit(self, X, y, sample_weight=None): else: P2 = check_array(self.P2, copy=True, accept_sparse=['csr', 'csc', 'coo'], - dtype="numeric", ensure_2d=True) - if ((P2.ndim != 2) or + dtype=_dtype, ensure_2d=False) + if P2.ndim == 1: + if P2.shape[0] != X.shape[1]: + raise ValueError("P2 should be a 1d array of shape " + "(n_features,) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},)), needed ({1},)" + .format(P2.shape[0], X.shape[1])) + elif ((P2.ndim != 2) or (P2.shape[0] != P2.shape[1]) or (P2.shape[0] != X.shape[1])): raise ValueError("P2 must be either None or an array of shape " @@ -1319,21 +1335,32 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. - if self.P1 is not None: + if self.P1 != 'identity': if not np.all(P1 >= 0): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite - if self.P2 is not None: - if sparse.issparse(P2): - # TODO: check sparse P2 for non-negativeness - # raise NotImplementedError("Check sparse P2 for " - # "non-negativeness is not yet " - # "implemented.") - pass - elif P2.ndim == 2: - if not np.all(np.linalg.eigvals(P2) >= -1e-15): - raise ValueError("P2 must be positive definite.") + if self.P2 != 'identity': + # due to numerical precision, we allow eigenvalues to be a + # tiny bit negative + epsneg = 10 * np.finfo(P2.dtype).epsneg + if P2.ndim == 1 or P2.shape[0] == 1: + if not np.all(P2 >= 0): + raise ValueError("1d array P2 must not have negative " + "values.") + elif sparse.issparse(P2): + # for sparse matrices, not all eigenvals can be computed + # efficiently, use only half of n_features + # k = how many eigenvals to compute + k = np.min([10, n_features // 10 + 1]) + sigma = 0 # start searching near this value + which = 'SA' # find smallest algebraic eigenvalues first + if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma, + which=which) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + else: + if not np.all(linalg.eigvalsh(P2) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? 
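A usage sketch of the per-coefficient penalties documented in this commit (a 0 in P1 drops a coefficient from the L1 term, a 0 on the 1d, i.e. diagonal, P2 drops it from the L2 term). Illustrative only: it assumes the estimator exactly as defined at this point in the series, and the data, penalty strengths and the choice of the coordinate-descent solver are made up for the example:

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[1., 0., 2.],
                  [2., 1., 3.],
                  [3., 4., 5.],
                  [4., 3., 1.]])
    y = np.array([1., 2., 4., 3.])

    # exclude the third coefficient from both penalty terms
    P1 = np.array([1., 1., 0.])
    P2 = np.array([1., 1., 0.])

    reg = GeneralizedLinearRegressor(alpha=0.1, l1_ratio=0.5, P1=P1, P2=P2,
                                     family='normal', link='identity',
                                     solver='cd')
    reg.fit(X, y)
    print(reg.intercept_, reg.coef_)

With both penalty weights set to zero, the third coefficient is fitted without regularization while the first two are shrunk by the elastic-net term.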
@@ -1520,7 +1547,7 @@ def Hs(s): args = (Xnew, y, weights, link) if solver == 'lbfgs': - coef, loss, info = optimize.fmin_l_bfgs_b( + coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=fprime, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, maxiter=self.max_iter) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 1ac5ccd4d3d5c..fde1604ad16e3 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -188,25 +188,26 @@ def test_glm_l1_ratio_argument(): assert_raises(ValueError, glm.fit, X, y) -def test_glm_P1_argument(): - """Test GLM P1 arguments - """ +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3]]) +def test_glm_P1_argument(P1): + """Test GLM P1 arguments.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for P1 in [['a string', 'a string'], [1, [2]], [1, 2, 3]]: - glm = GeneralizedLinearRegressor(P1=P1) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(P1=P1) + with pytest.raises((ValueError, TypeError)): + glm.fit(X, y) -# def test_glm_P2_argument(): -# """Test GLM P2 arguments -# """ -# y = np.array([1, 2]) -# X = np.array([[1], [1]]) -# for P2 in [np.full((2, 2), 'a string', dtype=np.dtype(' Date: Wed, 20 Feb 2019 22:48:14 +0100 Subject: [PATCH 046/269] Fix doctest, test_poisson_enet, change IRLS to use lstsq, fix input checks * adapt examples of GeneralizedLinearModel to new defaults for P1, P2 and selection * fix precision/decimal issue in test_poisson_enet * use more robust least squares instead of solve in IRLS * fix sign error in input checks --- doc/modules/linear_model.rst | 16 +++++++++------- sklearn/linear_model/glm.py | 25 ++++++++++++++++--------- sklearn/linear_model/tests/test_glm.py | 24 ++++++++++++++++++++---- 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 174d1e4eddae4..e60e9e84a4747 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -924,15 +924,17 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE - GeneralizedLinearRegressor(P1=None, P2=None, alpha=0.5, check_input=True, - copy_X=True, family='poisson', fit_dispersion=None, - fit_intercept=True, l1_ratio=0, link='log', max_iter=100, - random_state=None, selection='random', solver='auto', - start_params=None, tol=0.0001, verbose=0, warm_start=False) + GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, + check_input=True, copy_X=True, family='poisson', + fit_dispersion=None, fit_intercept=True, l1_ratio=0, + link='log', max_iter=100, random_state=None, + selection='cyclic', solver='auto', + start_params='irls', tol=0.0001, verbose=0, + warm_start=False) >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE - array([0.24630255, 0.43373521]) + array([0.24630169, 0.43373464]) >>> reg.intercept_ #doctest: +ELLIPSIS - -0.76383575... + -0.76383633... .. topic:: Examples: diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 33e0d75730e3a..bc1a0434fa3b0 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -830,8 +830,9 @@ def _irls_step(X, W, P2, z): ------- coef: array, shape (X.shape[1]) """ - # TODO: scipy.linalg.solve seems faster, but ordinary least squares uses - # scipy.linalg.lstsq. 
What is more appropriate? + # Note: solve vs least squares, what is more appropriate? + # scipy.linalg.solve seems faster, but scipy.linalg.lstsq + # is more robust. n_samples, n_features = X.shape if sparse.issparse(X): W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() @@ -843,7 +844,8 @@ def _irls_step(X, W, P2, z): XtW = X.transpose() * W A = XtW * X + L2 b = XtW * z - coef = splinalg.spsolve(A, b) + # coef = splinalg.spsolve(A, b) + coef, *_ = splinalg.lsmr(A, b) else: XtW = (X.T * W) A = XtW.dot(X) @@ -852,7 +854,8 @@ def _irls_step(X, W, P2, z): else: A += P2 b = XtW.dot(z) - coef = linalg.solve(A, b) + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) return coef @@ -1340,12 +1343,15 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite - if self.P2 != 'identity': + if not isinstance(self.P2, str): # self.P2 != 'identity' # due to numerical precision, we allow eigenvalues to be a # tiny bit negative - epsneg = 10 * np.finfo(P2.dtype).epsneg + epsneg = -10 * np.finfo(P2.dtype).epsneg if P2.ndim == 1 or P2.shape[0] == 1: - if not np.all(P2 >= 0): + p2 = P2 + if sparse.issparse(P2): + p2 = P2.toarray() + if not np.all(p2 >= 0): raise ValueError("1d array P2 must not have negative " "values.") elif sparse.issparse(P2): @@ -1360,6 +1366,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): + return P2 raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? @@ -1689,7 +1696,7 @@ def Hs(s): mn_subgrad = (np.where(coef + d == 0, np.sign(A)*np.maximum(np.abs(A)-P1, 0), A+np.sign(coef+d)*P1)) - mn_subgrad = np.sum(np.abs(mn_subgrad)) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= inner_tol: if inner_iter == 1: inner_tol = inner_tol/4. 
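The quantity whose L1 norm is taken in the hunk above is the minimum-norm subgradient of the penalized objective F(w) = f(w) + ||P1*w||_1, evaluated with fp_wP2 = f'(w) + w*P2 as the gradient of the smooth part; the inner loop applies the same formula at coef + d. A minimal standalone sketch (the function name and the numbers are illustrative, not part of the module):

    import numpy as np

    def min_norm_subgradient(grad, coef, P1):
        # element-wise minimum-norm subgradient of f(w) + ||P1*w||_1;
        # at coef[j] == 0 the subdifferential is grad[j] + P1[j]*[-1, 1] and
        # its element closest to zero is sign(grad[j])*max(|grad[j]|-P1[j], 0)
        return np.where(coef == 0,
                        np.sign(grad) * np.maximum(np.abs(grad) - P1, 0),
                        grad + np.sign(coef) * P1)

    grad = np.array([0.3, -0.05, 0.2])   # stands in for fp_wP2
    coef = np.array([0.0, 0.0, -1.0])
    P1 = np.array([0.1, 0.1, 0.1])
    print(np.linalg.norm(min_norm_subgradient(grad, coef, P1), ord=1))
    # 0.2 + 0.0 + 0.1 = 0.3; the solver stops once this norm drops below tol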
@@ -1740,7 +1747,7 @@ def Hs(s): mn_subgrad = (np.where(coef == 0, np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), fp_wP2+np.sign(coef)*P1)) - mn_subgrad = np.sum(np.abs(mn_subgrad)) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= self.tol: converged = True break diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index fde1604ad16e3..8893028d0176a 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from numpy.testing import assert_allclose import pytest import scipy as sp -from scipy import sparse +from scipy import sparse, optimize from sklearn.linear_model.glm import ( Link, @@ -199,8 +199,7 @@ def test_glm_P1_argument(P1): @pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], - sparse.csr_matrix([1, 2, 3]), - sparse.lil_matrix([[1]])]) + sparse.csr_matrix([1, 2, 3])]) def test_glm_P2_argument(P2): """Test GLM P2 arguments.""" y = np.array([1, 2]) @@ -515,12 +514,29 @@ def test_poisson_enet(): X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-7, + link='log', solver='cd', tol=1e-8, selection='random', random_state=42) glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + # test results with general optimization procedure + def obj(coef): + pd = PoissonDistribution() + link = LogLink() + N = y.shape[0] + mu = link.inverse(X @ coef[1:]+coef[0]) + alpha, l1_ratio = (1, 0.5) + return 1./(2.*N) * pd.deviance(y, mu) \ + + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ + + alpha * l1_ratio * np.sum(np.abs(coef[1:])) + res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, + options={'maxiter': 1000, 'disp': False}) + assert_almost_equal(glm.intercept_, res.x[0], decimal=5) + assert_almost_equal(glm.coef_, res.x[1:], decimal=5) + assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, decimal=8) + # same for start_params='zero' and selection='cyclic' # with reduced precision glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', From ed8e74f97d2b1921af4b8c2907c9e30629788bdc Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 23 Feb 2019 14:13:22 +0100 Subject: [PATCH 047/269] Use pytest decorators and pytest.raises --- sklearn/linear_model/glm.py | 7 +- sklearn/linear_model/tests/test_glm.py | 510 ++++++++++++------------- 2 files changed, 257 insertions(+), 260 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index bc1a0434fa3b0..f583e17433ee3 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -765,8 +765,8 @@ def __init__(self): super(InverseGaussianDistribution, self).__init__(power=3) -class GeneralizedHyperbolicSecand(ExponentialDispersionModel): - """A class for the Generalized Hyperbolic Secand (GHS) distribution. +class GeneralizedHyperbolicSecant(ExponentialDispersionModel): + """A class for the Generalized Hyperbolic Secant (GHS) distribution. The GHS distribution is for tagets y in (-inf, inf). """ @@ -1338,7 +1338,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. 
- if self.P1 != 'identity': + if not isinstance(self.P1, str): # if self.P1 != 'identity': if not np.all(P1 >= 0): raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite @@ -1366,7 +1366,6 @@ def fit(self, X, y, sample_weight=None): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): - return P2 raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that Xnew is not rank deficient # TODO: what else to check? diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 8893028d0176a..361a237f2cc9f 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -2,7 +2,7 @@ from numpy.testing import assert_allclose import pytest import scipy as sp -from scipy import sparse, optimize +from scipy import linalg, optimize, sparse from sklearn.linear_model.glm import ( Link, @@ -11,354 +11,355 @@ TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - GeneralizedHyperbolicSecand, + GeneralizedHyperbolicSecant, GeneralizedLinearRegressor) from sklearn.linear_model import ElasticNet, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, - assert_array_equal, assert_array_almost_equal, - assert_raises) + assert_array_equal, assert_array_almost_equal) -def test_link_properties(): - """Test link inverse and derivative - """ +@pytest.mark.parametrize('link', Link.__subclasses__()) +def test_link_properties(link): + """Test link inverse and derivative.""" rng = np.random.RandomState(0) x = rng.rand(100)*100 - # from sklearn.linear_model.glm import Link - # for link in vars()['Link'].__subclasses__(): - for link in Link.__subclasses__(): - link = link() - assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) - assert_almost_equal(link.inverse_derivative(link.link(x)), - 1/link.derivative(x), decimal=10) - - -def test_family_bounds(): - """Test the valid range of distributions - """ - family = NormalDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [True, True, True]) - - family = PoissonDistribution() + link = link() # instatiate object + assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) + assert_almost_equal(link.inverse_derivative(link.link(x)), + 1/link.derivative(x), decimal=10) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, True, True]) - - family = TweedieDistribution(power=1.5) - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, True, True]) - - family = GammaDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - family = InverseGaussianDistribution() - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - family = TweedieDistribution(power=4.5) - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, [False, False, True]) - - -def test_deviance_zero(): 
- """Test deviance(y,y) = 0 for different families - """ - for family in [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=-2.5), - TweedieDistribution(power=-1), - TweedieDistribution(power=1.5), - TweedieDistribution(power=2.5), - TweedieDistribution(power=4), - GeneralizedHyperbolicSecand()]: - assert_almost_equal(family.deviance(0.1, 0.1), 0, decimal=10) - assert_almost_equal(family.deviance(1.5, 1.5), 0, decimal=10) - - -def test_fisher_matrix(): + assert_array_equal(result, expected) + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5]), + (GeneralizedHyperbolicSecant(), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_almost_equal(family.deviance(x, x), 0, decimal=10) + + +@pytest.mark.parametrize( + 'family, link', + [(NormalDistribution(), IdentityLink()), + (PoissonDistribution(), LogLink()), + (GammaDistribution(), LogLink()), + (InverseGaussianDistribution(), LogLink()), + (TweedieDistribution(power=1.5), LogLink()), + (TweedieDistribution(power=4.5), LogLink())]) +def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. Trick: Use numerical differentiation with y = mu""" - for family in [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution()]: - link = LogLink() - rng = np.random.RandomState(0) - coef = np.array([-2, 1, 0, 1, 2.5]) - phi = 0.5 - X = rng.randn(10, 5) - lin_pred = np.dot(X, coef) - mu = link.inverse(lin_pred) - weights = rng.randn(10)**2 + 1 - fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link) - approx = np.array([]).reshape(0, coef.shape[0]) - for i in range(coef.shape[0]): - def f(coef): - return -family._score(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link)[i] - approx = np.vstack( - [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)] - ) - assert_allclose(fisher, approx, rtol=1e-3) + rng = np.random.RandomState(0) + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) + assert_allclose(fisher, approx, rtol=1e-3) def test_sample_weights_validation(): - """Test the raised errors in the validation of sample_weight""" + """Test the raised errors in the validation of sample_weight.""" # 1. scalar value but not positive X = [[1]] y = [1] weights = 0 glm = GeneralizedLinearRegressor(fit_intercept=False) - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 2. 
2d array weights = [[0]] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 3. 1d but wrong length weights = [1, 0] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 4. 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) # 5. 1d but weith a negative value weights = [2, -1] - assert_raises(ValueError, glm.fit, X, y, weights) + with pytest.raises(ValueError): + glm.fit(X, y, weights) def test_glm_family_argument(): - """Test GLM family argument set as string - """ + """Test GLM family argument set as string.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) + X = np.array([[1], [2]]) for (f, fam) in [('normal', NormalDistribution()), ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), ('inverse.gaussian', InverseGaussianDistribution())]: - glm = GeneralizedLinearRegressor(family=f, fit_intercept=False, - alpha=0).fit(X, y) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) assert_equal(type(glm._family_instance), type(fam)) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) - assert_raises(ValueError, glm.fit, X, y) + with pytest.raises(ValueError): + glm.fit(X, y) def test_glm_link_argument(): - """Test GLM link argument set as string - """ + """Test GLM link argument set as string.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) + X = np.array([[1], [2]]) for (l, link) in [('identity', IdentityLink()), ('log', LogLink())]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - link=l).fit(X, y) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) assert_equal(type(glm._link_instance), type(link)) - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - link='not a link') - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(family='normal', link='not a link') + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_alpha_argument(): - """Test GLM alpha argument - """ +@pytest.mark.parametrize('alpha', ['not a number', -4.2]) +def test_glm_alpha_argument(alpha): + """Test GLM for invalid alpha argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for alpha in ['not a number', -4.2]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - alpha=alpha) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_l1_ratio_argument(): - """Test GLM l1_ratio argument - """ +@pytest.mark.parametrize('l1_ratio', ['not a number', -4.2, 1.1, [1]]) +def test_glm_l1_ratio_argument(l1_ratio): + """Test GLM for invalid l1_ratio argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for l1_ratio in ['not a number', -4.2, 1.1, [1]]: - glm = GeneralizedLinearRegressor(family='normal', fit_intercept=False, - l1_ratio=l1_ratio) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) -@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3]]) +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3], + [-1]]) def test_glm_P1_argument(P1): - """Test GLM P1 
arguments.""" + """Test GLM for invalid P1 argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(P1=P1) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P1=P1, l1_ratio=0.5, check_input=True) with pytest.raises((ValueError, TypeError)): glm.fit(X, y) @pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], - sparse.csr_matrix([1, 2, 3])]) + sparse.csr_matrix([1, 2, 3]), [-1]]) def test_glm_P2_argument(P2): - """Test GLM P2 arguments.""" + """Test GLM for invalid P2 argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False) - with pytest.raises((ValueError, TypeError)): + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P2=P2, check_input=True) + with pytest.raises(ValueError): glm.fit(X, y) -def test_glm_fit_intercept_argument(): - """Test GLM fit_intercept argument - """ - y = np.array([1, 2]) - X = np.array([[1], [1]]) - for fit_intercept in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) - assert_raises(ValueError, glm.fit, X, y) +def test_glm_P2_positive_semidefinite(): + """Test GLM for a positive semi-definite P2 argument.""" + n_samples, n_features = 10, 5 + rng = np.random.RandomState(42) + y = np.arange(n_samples) + X = np.zeros((n_samples, n_features)) + P2 = np.diag([100, 10, 5, 0, -1E-5]) + # construct random orthogonal matrix Q + Q, R = linalg.qr(rng.randn(n_features, n_features)) + P2 = Q.T @ P2 @ Q + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_solver_argument(): - """Test GLM solver argument - """ +@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) +def test_glm_fit_intercept_argument(fit_intercept): + """Test GLM for invalid fit_intercept argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for solver in ['not a solver', 1, [1]]: - glm = GeneralizedLinearRegressor(solver=solver) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + with pytest.raises(ValueError): + glm.fit(X, y) - # solver not suitable for L1 penalty - for solver in ['irls', 'lbfgs', 'newton-cg']: - glm = GeneralizedLinearRegressor(solver=solver, alpha=1, l1_ratio=0.1) - assert_raises(ValueError, glm.fit, X, y) +@pytest.mark.parametrize('solver, l1_ratio', + [('not a solver', 0), (1, 0), ([1], 0), + ('irls', 0.5), ('lbfgs', 0.5), ('newton-cg', 0.5)]) +def test_glm_solver_argument(solver, l1_ratio): + """Test GLM for invalid solver argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(solver=solver, l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_max_iter_argument(): - """Test GLM max_iter argument - """ + +@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]]) +def test_glm_max_iter_argument(max_iter): + """Test GLM for invalid max_iter argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for max_iter in ['not a number', 0, -1, 5.5, [1]]: - glm = GeneralizedLinearRegressor(max_iter=max_iter) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(max_iter=max_iter) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_tol_argument(): - """Test GLM tol argument - """ +@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]]) +def test_glm_tol_argument(tol): + """Test GLM for 
invalid tol argument.""" y = np.array([1, 2]) - X = np.array([[1], [1]]) - for tol in ['not a number', 0, -1.0, [1e-3]]: - glm = GeneralizedLinearRegressor(tol=tol) - assert_raises(ValueError, glm.fit, X, y) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(tol=tol) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_warm_start_argument(): - """Test GLM warm_start argument - """ +@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]]) +def test_glm_warm_start_argument(warm_start): + """Test GLM for invalid warm_start argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for warm_start in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(warm_start=warm_start) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(warm_start=warm_start) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_start_params_argument(): - """Test GLM start_params argument - """ +@pytest.mark.parametrize('start_params', + ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]) +def test_glm_start_params_argument(start_params): + """Test GLM for invalid start_params argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for start_params in ['not a start_params', ['zero'], [0, 0, 0], - [[0, 0]], ['a', 'b']]: - glm = GeneralizedLinearRegressor(start_params=start_params) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(start_params=start_params) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_selection_argument(): - """Test GLM selection argument - """ +@pytest.mark.parametrize('selection', ['not a selection', 1, 0, ['cyclic']]) +def test_glm_selection_argument(selection): + """Test GLM for invalid selection argument""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for selection in ['not a selection', 1, 0, ['cyclic']]: - glm = GeneralizedLinearRegressor(selection=selection) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(selection=selection) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_random_state_argument(): - """Test GLM random_state argument - """ +@pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) +def test_glm_random_state_argument(random_state): + """Test GLM for invalid random_state argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for random_state in ['a string', 0.5, [0]]: - glm = GeneralizedLinearRegressor(random_state=random_state) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(random_state=random_state) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_copy_X_argument(): - """Test GLM copy_X arguments - """ +@pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) +def test_glm_copy_X_argument(copy_X): + """Test GLM for invalid copy_X arguments.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for copy_X in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(copy_X=copy_X) - assert_raises(ValueError, glm.fit, X, y) + glm = GeneralizedLinearRegressor(copy_X=copy_X) + with pytest.raises(ValueError): + glm.fit(X, y) -def test_glm_check_input_argument(): - """Test GLM check_input argument - """ +@pytest.mark.parametrize('check_input', ['not bool', 1, 0, [True]]) +def test_glm_check_input_argument(check_input): + """Test GLM for invalid check_input argument.""" y = np.array([1, 2]) X = np.array([[1], [1]]) - for check_input in ['not bool', 1, 0, [True]]: - glm = GeneralizedLinearRegressor(check_input=check_input) - 
assert_raises(ValueError, glm.fit, X, y) - + glm = GeneralizedLinearRegressor(check_input=check_input) + with pytest.raises(ValueError): + glm.fit(X, y) -# TODO: check additional validations if check_input == True -def test_glm_identiy_regression(): - """Test GLM regression with identity link on a simple dataset - """ +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_identiy_regression(family, solver): + """Test GLM regression with identity link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - families = ( - NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - for family in families: - glm = GeneralizedLinearRegressor( - alpha=0, family=family, fit_intercept=False, solver=solver) - res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) - - -def test_glm_log_regression(): - """Test GLM regression with log link on a simple dataset - """ + glm = GeneralizedLinearRegressor(alpha=0, family=family, + fit_intercept=False, solver=solver) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_log_regression(family, solver): + """Test GLM regression with log link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) - families = ( - NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecand()) - for solver in ['irls', 'lbfgs', 'newton-cg']: - for family in families: - glm = GeneralizedLinearRegressor( + glm = GeneralizedLinearRegressor( alpha=0, family=family, link=LogLink(), fit_intercept=False, solver=solver, start_params='least_squares') - res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) @pytest.mark.filterwarnings('ignore::DeprecationWarning') -def test_normal_ridge(): - """Test ridge regression for Normal distributions +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_normal_ridge(solver): + """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. 
""" @@ -375,25 +376,23 @@ def test_normal_ridge(): ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, - family='normal', link='identity', - fit_intercept=True, tol=1e-6, - max_iter=100, solver=solver, - random_state=42) - glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=1e-6, max_iter=100, solver=solver, + random_state=42) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_) + assert_almost_equal(glm.intercept_, ridge.intercept_) + assert_array_almost_equal(glm.predict(T), ridge.predict(T)) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-6, - family='normal', link='identity', - fit_intercept=False, solver='irls', - fit_dispersion='chisqr') + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=1e-6, max_iter=100, solver=solver, + random_state=42, fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -413,23 +412,22 @@ def test_normal_ridge(): ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, - family='normal', link='identity', - fit_intercept=True, solver=solver, - max_iter=300, random_state=42) - glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, + family='normal', link='identity', + fit_intercept=True, solver=solver, + max_iter=300, random_state=42) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, family='normal', link='identity', - fit_intercept=False, solver='irls') + fit_intercept=False, solver=solver) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) assert_array_almost_equal(glm.coef_, ridge.coef_) @@ -438,7 +436,7 @@ def test_normal_ridge(): def test_poisson_ridge(): - """Test ridge regression with poisson family and LogLink + """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" # library("glmnet") @@ -470,7 +468,7 @@ def test_poisson_ridge(): def test_normal_enet(): - """Tet elastic net regression with normal/gaussian family""" + """Test elastic net regression with normal/gaussian family.""" rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 n_samples, n_features = 20, 2 @@ -495,7 +493,7 @@ def test_normal_enet(): def test_poisson_enet(): - """Test elastic net regression with poisson family and LogLink + """Test elastic net regression with poisson family and LogLink. Compare to R's glmnet""" # library("glmnet") From fe876da908a7d5aefe8fa9ac56f4c5130ccf83df Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 24 Feb 2019 12:45:55 +0100 Subject: [PATCH 048/269] Add Logistic regression=Binomial + Logit * add Binomial distribution * add Logit link * tests for binomial against LogisticRegression * option 'auto' for link * reduce code duplication by replacing @abstractproperty by @property --- sklearn/linear_model/glm.py | 160 +++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 57 +++++++-- 2 files changed, 149 insertions(+), 68 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index f583e17433ee3..01e40b322946c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -42,13 +42,12 @@ from __future__ import division -from abc import ABCMeta, abstractmethod, abstractproperty +from abc import ABCMeta, abstractmethod import numbers import numpy as np -from scipy import linalg, sparse +from scipy import linalg, sparse, special import scipy.sparse.linalg as splinalg from scipy.optimize import fmin_l_bfgs_b -from scipy.special import xlogy import warnings from .base import LinearRegression from .coordinate_descent import ElasticNet @@ -191,6 +190,28 @@ def inverse_derivative2(self, lin_pred): return np.exp(lin_pred) +class LogitLink(Link): + """The logit link function g(x)=logit(x).""" + + def link(self, mu): + return special.logit(mu) + + def derivative(self, mu): + return 1. / (mu * (1 - mu)) + + def inverse(self, lin_pred): + return special.expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = special.expit(lin_pred) + return ep * (1. - ep) + + def inverse_derivative2(self, lin_pred): + ep = special.expit(lin_pred) + ep = special.expit(lin_pred) + return ep * (1. - ep) * (1. - 2 * ep) + + class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). @@ -238,26 +259,25 @@ class ExponentialDispersionModel(metaclass=ABCMeta): https://en.wikipedia.org/wiki/Exponential_dispersion_model. 
""" - - @abstractproperty + @property def lower_bound(self): - """The lower bound of values of Y~EDM.""" - raise NotImplementedError() + """Get the lower bound of values for Y~EDM.""" + return self._lower_bound - @abstractproperty + @property def upper_bound(self): - """The upper bound of values of Y~EDM.""" - raise NotImplementedError() + """Get the upper bound of values for Y~EDM.""" + return self._upper_bound - @abstractproperty + @property def include_lower_bound(self): - """If True, values of y may equal lower bound: y >= lower_bound.""" - raise NotImplementedError() + """Get True if lower bound for y is included: y >= lower_bound.""" + return self._include_lower_bound - @abstractproperty + @property def include_upper_bound(self): - """If True, values of y may equal upper bound: y <= upper_bound.""" - raise NotImplementedError() + """Get True if upper bound for y is includede: y <= upper_bound.""" + return self._include_upper_bound def in_y_range(self, x): """Returns true if `x` is in the valid range of Y~EDM. @@ -685,22 +705,6 @@ def power(self, power): .format(power)) self._power = power - @property - def lower_bound(self): - return self._lower_bound - - @property - def upper_bound(self): - return self._upper_bound - - @property - def include_lower_bound(self): - return self._include_lower_bound - - @property - def include_upper_bound(self): - return self._include_upper_bound - def unit_variance(self, mu): """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. @@ -730,7 +734,7 @@ def unit_deviance(self, y, mu): if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (xlogy(y, y/mu) - y + mu) + return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution return 2 * (np.log(mu/y)+y/mu-1) @@ -776,22 +780,6 @@ def __init__(self): self._include_lower_bound = False self._include_upper_bound = False - @property - def lower_bound(self): - return self._lower_bound - - @property - def upper_bound(self): - return self._upper_bound - - @property - def include_lower_bound(self): - return self._include_lower_bound - - @property - def include_upper_bound(self): - return self._include_upper_bound - def unit_variance(self, mu): return 1 + mu**2 @@ -803,6 +791,27 @@ def unit_deviance(self, y, mu): np.log((1+mu**2)/(1+y**2))) +class BinomialDistribution(ExponentialDispersionModel): + """A class for the Binomial distribution. + + The Binomial distribution is for tagets y in [0, 1]. + """ + def __init__(self): + self._lower_bound = 0 + self._upper_bound = 1 + self._include_lower_bound = True + self._include_upper_bound = True + + def unit_variance(self, mu): + return mu * (1 - mu) + + def unit_variance_derivative(self, mu): + return 1 - 2 * mu + + def unit_deviance(self, y, mu): + return 2*(special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + + def _irls_step(X, W, P2, z): """Compute one step in iteratively reweighted least squares. @@ -933,15 +942,23 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} or an instance\ - of class ExponentialDispersionModel, optional(default='normal') + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian', 'binomial'} \ + or an instance of class ExponentialDispersionModel, \ + optional(default='normal') The distributional assumption of the GLM, i.e. 
which distribution from the EDM, specifies the loss function to be minimized. - link : {'identity', 'log'} or an instance of class Link, - optional (default='identity') + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, + optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor - (X*coef) to expectation (mu). + (X*coef) to expectation (mu). Option 'auto' sets the link depending on + the chosen family as follows: + + - 'identity' for family 'normal' + + - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' + + - 'logit' for family 'binomial' fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) Method for estimation of the dispersion parameter phi. Whether to use @@ -1084,7 +1101,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', - fit_intercept=True, family='normal', link='identity', + fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', selection='cyclic', random_state=None, copy_X=True, @@ -1159,27 +1176,48 @@ def fit(self, X, y, sample_weight=None): self._family_instance = GammaDistribution() elif self.family == 'inverse.gaussian': self._family_instance = InverseGaussianDistribution() + elif self.family == 'binomial': + self._family_instance = BinomialDistribution() else: raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian'];" - " got (family={0})".format(self.family)) + " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " + "'binomial']; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of # class Link if isinstance(self.link, Link): self._link_instance = self.link else: - if self.link == 'identity': + if self.link == 'auto': + if isinstance(self._family_instance, TweedieDistribution): + if self._family_instance.power <= 0: + self._link_instance = IdentityLink() + if self._family_instance.power >= 1: + self._link_instance = LogLink() + elif isinstance(self._family_instance, + GeneralizedHyperbolicSecant): + self._link_instance = IdentityLink() + elif isinstance(self._family_instance, BinomialDistribution): + self._link_instance = LogitLink() + else: + raise ValueError("No default link known for the " + "specified distribution family. Please " + "set link manually, i.e. 
not to 'auto'; " + "got (link='auto', family={}" + .format(self.family)) + elif self.link == 'identity': self._link_instance = IdentityLink() elif self.link == 'log': self._link_instance = LogLink() + elif self.link == 'logit': + self._link_instance = LogitLink() else: raise ValueError( - "The link must be an instance of class Link or" - " an element of ['identity', 'log']; got (link={0})" - .format(self.link)) + "The link must be an instance of class Link or " + "an element of ['auto', 'identity', 'log', 'logit']; " + "got (link={0})".format(self.link)) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: raise ValueError("Penalty term must be a non-negative number;" diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 361a237f2cc9f..de0857a34fe3a 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -4,16 +4,18 @@ import scipy as sp from scipy import linalg, optimize, sparse +from sklearn.datasets import make_classification from sklearn.linear_model.glm import ( Link, IdentityLink, LogLink, + LogitLink, TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - GeneralizedHyperbolicSecant, + GeneralizedHyperbolicSecant, BinomialDistribution, GeneralizedLinearRegressor) -from sklearn.linear_model import ElasticNet, Ridge +from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.utils.testing import ( assert_equal, assert_almost_equal, @@ -26,9 +28,19 @@ def test_link_properties(link): rng = np.random.RandomState(0) x = rng.rand(100)*100 link = link() # instatiate object - assert_almost_equal(link.link(link.inverse(x)), x, decimal=10) - assert_almost_equal(link.inverse_derivative(link.link(x)), - 1/link.derivative(x), decimal=10) + decimal = 10 + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + decimal = 8 + assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + assert_almost_equal(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x), decimal=decimal) + # for LogitLink, in the following x should be between 0 and 1. 
+ # assert_almost_equal(link.inverse_derivative(link.link(x)), + # 1./link.derivative(x), decimal=decimal) @pytest.mark.parametrize( @@ -214,6 +226,12 @@ def test_glm_P2_positive_semidefinite(): with pytest.raises(ValueError): glm.fit(X, y) + P2 = sparse.csr_matrix(P2) + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + @pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) def test_glm_fit_intercept_argument(fit_intercept): @@ -331,7 +349,7 @@ def test_glm_identiy_regression(family, solver): coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - glm = GeneralizedLinearRegressor(alpha=0, family=family, + glm = GeneralizedLinearRegressor(alpha=0, family=family, link='identity', fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -350,7 +368,7 @@ def test_glm_log_regression(family, solver): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( - alpha=0, family=family, link=LogLink(), fit_intercept=False, + alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='least_squares') res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -557,3 +575,28 @@ def obj(coef): glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + +@pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) +def test_binomial_enet(alpha): + """Test elastic net regression with binomial family and LogitLink. + + Compare to LogisticRegression. + """ + l1_ratio = 0.5 + n_samples = 500 + X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, + n_informative=5, n_redundant=0, n_repeated=0, + random_state=0) + log = LogisticRegression( + penalty='elasticnet', random_state=0, fit_intercept=False, tol=1e-6, + max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), + solver='saga') + log.fit(X, y) + glm = GeneralizedLinearRegressor( + family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, + alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', + tol=1e-7) + glm.fit(X, y) + assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) + assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) From 2993e03dbfc89b068373718c82f65957639767ac Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 7 Apr 2019 15:33:27 +0200 Subject: [PATCH 049/269] More efficient sparse matrices and refactor of irls and cd solver * refactor into function _irls_solver * refactor into function _cd_solver * replace of safe_sparse_dot by matmul operator @ * more efficient handling of fisher matrix * sparse coo matrices are converted to csc or csr * sample weights don't except sparse matrices * minor doc changes --- sklearn/linear_model/glm.py | 1101 ++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 85 +- 2 files changed, 717 insertions(+), 469 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 01e40b322946c..b2de866a4b69d 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -55,13 +55,12 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y -from ..utils.extmath import safe_sparse_dot from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted, 
check_random_state def _check_weights(sample_weight, n_samples): - """Check that weights are non-negative and have the right shape.""" + """Check that sample weights are non-negative and have the right shape.""" if sample_weight is None: weights = np.ones(n_samples) elif np.isscalar(sample_weight): @@ -70,7 +69,7 @@ def _check_weights(sample_weight, n_samples): weights = sample_weight * np.ones(n_samples) else: _dtype = [np.float64, np.float32] - weights = check_array(sample_weight, accept_sparse='csr', + weights = check_array(sample_weight, accept_sparse=False, force_all_finite=True, ensure_2d=False, dtype=_dtype) if weights.ndim > 1: @@ -247,12 +246,11 @@ class ExponentialDispersionModel(metaclass=ABCMeta): deviance_derivative starting_mu + _mu_deviance_derivative _score _fisher_matrix _observed_information - _deviance - _deviance_derivative - _deviance_hessian + _eta_mu_score_fisher References ---------- @@ -280,7 +278,7 @@ def include_upper_bound(self): return self._include_upper_bound def in_y_range(self, x): - """Returns true if `x` is in the valid range of Y~EDM. + """Returns ``True`` if x is in the valid range of Y~EDM. Parameters ---------- @@ -411,7 +409,7 @@ def unit_deviance_derivative(self, y, mu): mu : array, shape (n_samples,) Predicted mean. """ - return -2*(y-mu)/self.unit_variance(mu) + return -2 * (y - mu) / self.unit_variance(mu) def deviance(self, y, mu, weights=1): r"""Compute the deviance. @@ -434,13 +432,7 @@ def deviance(self, y, mu, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return np.sum(weights*self.unit_deviance(y, mu)) - - def _deviance(self, coef, X, y, weights, link): - """Compute the deviance as a function of the coefficients and data.""" - lin_pred = safe_sparse_dot(X, coef, dense_output=True) - mu = link.inverse(lin_pred) - return self.deviance(y, mu, weights) + return np.sum(weights * self.unit_deviance(y, mu)) def deviance_derivative(self, y, mu, weights=1): """Compute the derivative of the deviance w.r.t. mu. @@ -458,7 +450,36 @@ def deviance_derivative(self, y, mu, weights=1): weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return weights*self.unit_deviance_derivative(y, mu) + return weights * self.unit_deviance_derivative(y, mu) + + def starting_mu(self, y, weights=1, ind_weight=0.5): + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. + """ + return (ind_weight * y + + (1. - ind_weight) * np.average(y, weights=weights)) + + def _mu_deviance_derivative(self, coef, X, y, weights, link): + """Compute mu, the deviance and it's derivative w.r.t coef.""" + lin_pred = X @ coef + mu = link.inverse(lin_pred) + dev = self.deviance(y, mu, weights) + d1 = link.inverse_derivative(lin_pred) + devp = X.T @ (d1 * self.deviance_derivative(y, mu, weights)) + return mu, dev, devp def _score(self, coef, phi, X, y, weights, link): r"""Compute the score function. 
@@ -476,16 +497,14 @@ def _score(self, coef, phi, X, y, weights, link): with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. + Note: The derivative of the deviance w.r.t. coef equals -2 * score. """ - n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) - d_sigma_inv = sparse.dia_matrix((sigma_inv*d, 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d_sigma_inv, (y-mu), dense_output=True) - score = safe_sparse_dot(X.T, temp, dense_output=True) + temp = sigma_inv * d * (y - mu) + score = X.T @ temp return score def _fisher_matrix(self, coef, phi, X, y, weights, link): @@ -508,14 +527,13 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): see func:`_score`. """ n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d2 = link.inverse_derivative(lin_pred)**2 d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) - fisher_matrix = safe_sparse_dot(X.T, temp, dense_output=False) + fisher_matrix = X.T @ d2_sigma_inv @ X return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): @@ -542,7 +560,7 @@ def _observed_information(self, coef, phi, X, y, weights, link): see :func:`score_` function and :func:`_fisher_matrix`. """ n_samples = X.shape[0] - lin_pred = safe_sparse_dot(X, coef, dense_output=True) + lin_pred = X @ coef mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) @@ -551,80 +569,59 @@ def _observed_information(self, coef, phi, X, y, weights, link): r = y - mu temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), shape=(n_samples, n_samples)) - temp = safe_sparse_dot(temp, X, dense_output=False) - observed_information = safe_sparse_dot(X.T, temp, dense_output=False) + observed_information = X.T @ temp @ X return observed_information - def _deviance_derivative(self, coef, X, y, weights, link): - r"""Compute the derivative of the deviance w.r.t. coef. - - The derivative of the deviance w.r.t. `coef` (:math:`w`) as a - function of the coefficients `coef` and the data. - This is equivalent to :math:`-2\phi` times the score function - :func:`_score` (derivative of the log-likelihood). - """ - score = self._score(coef=coef, phi=1, X=X, y=y, weights=weights, - link=link) - return -2*score - - def _deviance_hessian(self, coef, X, y, weights, link): - r"""Compute the hessian matrix of the deviance w.r.t. coef. - - The hessian of the deviance w.r.t. `coef` (:math:`w`) is evaluated as - a function of the coefficients `coef` and the data. - It is equivalent to :math:`+2\phi` times the observed information - matrix. - """ - info_matrix = self._observed_information(coef=coef, phi=1, X=X, y=y, - weights=weights, link=link) - return 2*info_matrix - - def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link): + def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, + diag_fisher=False): """Compute linear predictor, mean, score function and fisher matrix. 
It calculates the linear predictor, the mean, score function (derivative of log-likelihood) and Fisher information matrix all in one go as function of `coef` (:math:`w`) and the data. + + Parameters + ---------- + diag_fisher : boolean, optional (default=False) + If ``True``, returns only an array d such that + fisher = X.T @ np.diag(d) @ X. + + Returns + ------- + (eta, mu, score, fisher) : tuple with 4 elements + The 4 elements are: + + * eta: ndarray, shape (X.shape[0],) + * mu: ndarray, shape (X.shape[0],) + * score: ndarray, shape (X.shape[0],) + * fisher: + + * If diag_fisher is ``False``, the full fisher matrix, + an array of shape (X.shape[1], X.shape[1]) + * If diag_fisher is ``True`, an array of shape (X.shape[0]) """ n_samples, n_features = X.shape # eta = linear predictor - eta = safe_sparse_dot(X, coef, dense_output=True) + eta = X @ coef mu = link.inverse(eta) sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) d1 = link.inverse_derivative(eta) # = h'(eta) # Alternatively: # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g # d1 = 1./link.derivative(mu) - d1_sigma_inv = sparse.dia_matrix((sigma_inv*d1, 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d1_sigma_inv, (y-mu), dense_output=True) - score = safe_sparse_dot(X.T, temp, dense_output=True) + score = X.T @ (sigma_inv * d1 * (y - mu)) # - d2_sigma_inv = sparse.dia_matrix((sigma_inv*(d1**2), 0), - shape=(n_samples, n_samples)) - temp = safe_sparse_dot(d2_sigma_inv, X, dense_output=False) - fisher = safe_sparse_dot(X.T, temp, dense_output=False) - return eta, mu, score, fisher - - def starting_mu(self, y, weights=1, ind_weight=0.5): - """Set starting values for the mean mu. - - These may be good starting points for the (unpenalized) IRLS solver. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - - ind_weight : float (default=0.5) - Must be between 0 and 1. Specifies how much weight is given to the - individual observations instead of the mean of y. - """ - return (ind_weight * y + - (1. - ind_weight) * np.average(y, weights=weights)) + d2_sigma_inv = sigma_inv * (d1**2) + if diag_fisher: + return eta, mu, score, d2_sigma_inv + else: + if sparse.issparse(X): + d2_sigma_inv = sparse.dia_matrix((d2_sigma_inv, 0), + shape=(n_samples, n_samples)) + fisher = (X.T @ d2_sigma_inv @ X).toarray() + else: + fisher = (X.T * d2_sigma_inv) @ X + return eta, mu, score, fisher class TweedieDistribution(ExponentialDispersionModel): @@ -724,20 +721,20 @@ def unit_variance_derivative(self, mu): mu : array, shape (n_samples,) Predicted mean. 
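The note above that the derivative of the deviance w.r.t. coef equals -2 times the score can be checked numerically. A minimal sketch for the normal family with identity link (made-up data, plain numpy, not the classes from this patch)::

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.randn(40, 3)
    coef = rng.randn(3)
    y = rng.randn(40)

    # normal family, identity link: deviance D(coef) = sum((y - X @ coef)**2)
    def deviance(c):
        return np.sum((y - X @ c) ** 2)

    # score with phi=1, weights=1: sigma_inv = 1 and h'(eta) = 1 here
    score = X.T @ (y - X @ coef)

    # finite-difference check of d(deviance)/d(coef) == -2 * score
    eps = 1e-6
    num_grad = np.array([(deviance(coef + eps * e) - deviance(coef - eps * e)) / (2 * eps)
                         for e in np.eye(3)])
    assert np.allclose(num_grad, -2 * score, rtol=1e-5, atol=1e-6)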
""" - return self.power*np.power(mu, self.power-1) + return self.power * np.power(mu, self.power - 1) def unit_deviance(self, y, mu): p = self.power if p == 0: # NormalDistribution - return (y-mu)**2 + return (y - mu)**2 if p == 1: # PoissonDistribution # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 return 2 * (special.xlogy(y, y/mu) - y + mu) elif p == 2: # GammaDistribution - return 2 * (np.log(mu/y)+y/mu-1) + return 2 * (np.log(mu/y) + y/mu - 1) else: # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) @@ -784,11 +781,11 @@ def unit_variance(self, mu): return 1 + mu**2 def unit_variance_derivative(self, mu): - return 2*mu + return 2 * mu def unit_deviance(self, y, mu): - return (2*y*(np.arctan(y) - np.arctan(mu)) + - np.log((1+mu**2)/(1+y**2))) + return (2 * y * (np.arctan(y) - np.arctan(mu)) + + np.log((1 + mu**2)/(1 + y**2))) class BinomialDistribution(ExponentialDispersionModel): @@ -809,7 +806,7 @@ def unit_variance_derivative(self, mu): return 1 - 2 * mu def unit_deviance(self, y, mu): - return 2*(special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + return 2 * (special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) def _irls_step(X, W, P2, z): @@ -824,20 +821,20 @@ def _irls_step(X, W, P2, z): Parameters ---------- - X : {numpy array, sparse matrix}, shape (n_samples, n_features) + X : {ndarray, sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present) - W : numpy array, shape (n_samples,) + W : ndarray, shape (n_samples,) - P2 : {numpy array, sparse matrix}, shape (n_features, n_features) + P2 : {ndarray, sparse matrix}, shape (n_features, n_features) The L2-penalty matrix or vector (=diagonal matrix) - z : numpy array, shape (n_samples,) + z : ndarray, shape (n_samples,) Working observations Returns ------- - coef: array, shape (X.shape[1]) + coef: ndarray, shape (X.shape[1]) """ # Note: solve vs least squares, what is more appropriate? # scipy.linalg.solve seems faster, but scipy.linalg.lstsq @@ -868,27 +865,422 @@ def _irls_step(X, W, P2, z): return coef +def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): + """Solve GLM with L2 penalty by IRLS algorithm. + + Note: If X is sparse, P2 must also be sparse. + """ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: P2 must be symmetrized + # Note: ' denotes derivative, but also transpose for matrices + + # eta = linear predictor + eta = X @ coef + mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + n_iter = 0 + while n_iter < max_iter: + n_iter += 1 + # coef_old not used so far. 
+ # coef_old = coef + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = hp**2 / V + # working observations + z = eta + (y - mu) / hp + # solve A*coef = b + # A = X' W X + P2, b = X' W z + coef = _irls_step(X, W, P2, z) + # updated linear predictor + # do it here for updated values for tolerance + eta = X @ coef + mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + + # which tolerace? |coef - coef_old| or gradient? + # use gradient for compliance with newton-cg and lbfgs + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + gradient = -(X.T @ (hp*(y-mu)/V)) + if P2.ndim == 1: + gradient += P2*coef + else: + gradient += P2 @ coef + if (np.max(np.abs(gradient)) <= tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter + + +def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=1000, selection='cyclic', + random_state=None, diag_fisher=False): + """Compute inner loop of coordinate descent = cycles through features. + + Minimization of 1-d subproblems:: + + min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 + + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + Note: f'=-score and H=fisher are updated at the end of outer iteration. + """ + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 + n_samples, n_features = X.shape + B = fisher + if P2.ndim == 1: + coef_P2 = coef * P2 + if not diag_fisher: + B[np.diag_indices_from(B)] += P2 + else: + coef_P2 = P2 @ coef # P2 is symmetric, mat @ vec is usually faster + if not diag_fisher: + if sparse.issparse(P2): + B += P2.toarray() + else: + B += P2 + A = -score + coef_P2 # + d @ (H+P2) but d=0 so far + # inner loop + inner_iter = 0 + while inner_iter < max_inner_iter: + inner_iter += 1 + n_cycles += 1 + if selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, see 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + a = A[j] + if diag_fisher: + if sparse.issparse(X): + xj = X[:, j] + b = xj.transpose() @ xj.multiply(fisher[:, np.newaxis]) + b = b[0, 0] + else: + b = X[:, j] @ (fisher * X[:, j]) + + if P2.ndim == 1: + b += P2[j] + else: + b += P2[j, j] + else: + b = B[j, j] + + if b <= 0: + z = 0 + elif P1[j] == 0: + z = -a/b + elif a + P1[j] < b * (coef[j] + d[j]): + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[j] + d[j]): + z = -(a - P1[j])/b + else: + z = -(coef[j] + d[j]) + + # update direction d + d[j] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if diag_fisher: + if sparse.issparse(X): + A += (X.transpose() @ + X[:, j].multiply(fisher[:, np.newaxis]) + ).toarray().ravel() * z + else: + # A += (X.T @ (fisher * X[:, j])) * z + # same without transpose of X + A += ((fisher * X[:, j]) @ X) * z + + if P2.ndim == 1: + A[j] += P2[j] * z + elif sparse.issparse(P2): + # slice columns as P2 is csc + A += P2[:, j].toarray().ravel() * z + else: + A += P2[:, j] * z + else: + # B is symmetric, C- or F-contiguous, but never sparse + if B.flags['F_CONTIGUOUS']: + # slice columns like for sparse csc + A += B[:, j] * z + else: # B.flags['C_CONTIGUOUS'] might be true + # slice rows + A += B[j, :] * z + # end of cycle + # stopping criterion for inner loop + # sum_i(|minimum of norm of subgrad of q(d)_i|) + mn_subgrad = np.where(coef + d == 0, + np.sign(A) * np.maximum(np.abs(A) - P1, 0), + A + np.sign(coef + d) * P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. + break + # end of inner loop + return d, coef_P2, n_cycles, inner_tol + + +def _cd_solver(coef, X, y, weights, P1, P2, family, link, + max_iter=100, max_inner_iter=1000, tol=1e-4, + selection='cyclic ', random_state=None, + diag_fisher=False, copy_X=True): + """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. + + The objective beeing minimized in the coefficients w=coef is:: + + F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 + + An Improved GLMNET for L1-regularized Logistic Regression: + + 1. Find optimal descent direction d by minimizing + min_d F(w+d) = min_d F(w+d) - F(w) + 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: + q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + + ||P1*(w+d)||_1 - ||P1*w||_1 + Then minimize q(d): min_d q(d) + 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): + min_z q(d+z*e_j) + = min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + + Repeat steps 1-3 until convergence. + Note: Use Fisher matrix instead of Hessian for H. + Note: f' = -score, H = Fisher matrix + + Parameters + ---------- + coef: ndarray, shape (n_features,) + + X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present). If not sparse, + pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication. + + y : ndarray, shape (n_samples,) + Target values. + + weights: ndarray, shape (n_samples,) + Sample weights with which the deviance is weighted. The weights must + bee normalized and sum to 1. 
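The coordinate update above has the closed form z = S(b*d - a, c)/b - d with the soft-thresholding operator S(u, c) = sign(u)*max(|u| - c, 0), where d denotes w_j + d_j. A minimal sketch checking this closed form against a brute-force grid search (made-up numbers, plain numpy)::

    import numpy as np

    def soft_threshold(u, c):
        return np.sign(u) * max(abs(u) - c, 0.0)

    # 1d subproblem: min_z  a*z + 0.5*b*z**2 + c*|w + z|
    a, b, c, w = 0.7, 2.0, 0.5, -0.3
    z_closed = soft_threshold(b * w - a, c) / b - w

    # brute-force check on a fine grid
    zs = np.linspace(-5, 5, 200001)
    obj = a * zs + 0.5 * b * zs ** 2 + c * np.abs(w + zs)
    assert abs(z_closed - zs[np.argmin(obj)]) < 1e-4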
+ + P1 : {ndarray}, shape (n_features,) + The L1-penalty vector (=diagonal matrix) + + P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix). If a matrix is + passed, it must be symmetric. If X is sparse, P2 must also be sparse. + + family : ExponentialDispersionModel + + link : Link + + max_iter : int, optional (default=100) + Maximum numer of outer (Newton) iterations. + + max_inner_iter : int, optional (default=1000) + Maximum number of iterations, i.e. cycles over all features, in inner + loop. + + tol : float, optional (default=1e-4) + Covergence criterion is + sum_i(|minimum of norm of subgrad of objective_i|)<=tol. + + selection : str, optional (default='cyclic') + If 'random', randomly chose features in inner loop. + + random_state : {int, RandomState instance, None}, optional (default=None) + + diag_fisher : boolean, optional (default=False) + 'False' calculates full fisher matrix, 'True' only diagonal matrix s.t. + fisher = X.T @ diag @ X. This saves storage but needs more + matrix-vector multiplications. + + copy_X : boolean, optional (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + Returns + ------- + coef : ndarray, shape (n_features,) + + n_iter : numer of outer iterations = newton iterations + + n_cycles : number of cycles over features + + References + ---------- + Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + X = check_array(X, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if P2.ndim == 2: + P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if sparse.issparse(X): + if not sparse.isspmatrix_csc(X): + raise ValueError("If X is sparse, it must be in csc format" + "; got (format={})".format(X.format)) + if not sparse.isspmatrix_csc(P2): + raise ValueError("If X is sparse, P2 must also be sparse csc" + "format. Got P2 not sparse.") + random_state = check_random_state(random_state) + # Note: we already set P2 = l2*P2, P1 = l1*P1 + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + n_iter = 0 # number of outer iterations + n_cycles = 0 # number of (complete) cycles over features + converged = False + n_samples, n_features = X.shape + # line search parameters + (beta, sigma) = (0.5, 0.01) + # some precalculations + # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a + # 1d array representing a diagonal matrix. + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # fp_wP2 = f'(w) + w*P2 + if P2.ndim == 1: + fp_wP2 = -score + coef * P2 + else: + # Note: P2 is symmetric and matrix @ vector is faster for sparse + # matrices. 
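Both the inner and outer stopping criteria above measure the L1-norm of the minimum-norm subgradient of the objective, which vanishes exactly at an optimum. A minimal sketch on a toy problem with a known solution (plain numpy, independent of this patch)::

    import numpy as np

    def min_norm_subgrad(w, grad_smooth, P1):
        # elementwise minimum-norm subgradient of smooth(w) + ||P1 * w||_1
        return np.where(w == 0,
                        np.sign(grad_smooth) * np.maximum(np.abs(grad_smooth) - P1, 0),
                        grad_smooth + np.sign(w) * P1)

    # toy problem: min_w 0.5*||w - t||^2 + sum(P1*|w|), optimum = soft threshold of t
    t = np.array([2.0, 0.3])
    P1 = np.array([0.5, 0.5])
    w_opt = np.sign(t) * np.maximum(np.abs(t) - P1, 0)   # = [1.5, 0.0]

    subgrad = min_norm_subgrad(w_opt, w_opt - t, P1)     # grad of smooth part is w - t
    assert np.linalg.norm(subgrad, ord=1) < 1e-12        # criterion met at the optimum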
+ fp_wP2 = -score + P2 @ coef + inner_tol = np.where(coef == 0, + np.sign(fp_wP2) * np.maximum(np.abs(fp_wP2) - P1, 0), + fp_wP2 + np.sign(coef) * P1) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while n_iter < max_iter: + n_iter += 1 + # initialize search direction d (to be optimized) with zero + d.fill(0) + # inner loop = _cd_cycle + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=max_inner_iter, selection=selection, + random_state=random_state, diag_fisher=diag_fisher) + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1 * coef, ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * (-(score @ d) + coef_P2 @ d + + linalg.norm(P1 * (coef + d), ord=1) - P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * (coef_P2 @ coef) + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + coef_wd = coef + la * d + mu_wd = link.inverse(X @ coef_wd) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1 * coef_wd, ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * ((coef_wd * P2) @ coef_wd) + else: + Fwd += 0.5 * (coef_wd @ (P2 @ coef_wd)) + if Fwd - Fw <= sigma * la * bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # stopping criterion for outer loop + # sum_i(|minimum of norm of subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + if P2.ndim == 1: + fp_wP2 = -score + coef * P2 + else: + fp_wP2 = -score + P2 @ coef # P2 is symmetric, mat @ vec is faster + mn_subgrad = np.where(coef == 0, + np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), + fp_wP2 + np.sign(coef) * P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the number of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter, n_cycles + + class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean `mu=h(X*w)`. Therefore the fit minimizes - the following objective function with combined L1 and L2 priors as - regularizer:: + fitting and predicting the mean of the target y as mu=h(X*w). Therefore, + the fit minimizes the following objective function with combined L1 and L2 + priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + alpha * l1_ratio * ||P1*w||_1 + 1/2 * alpha * (1 - l1_ratio) * w*P2*w - with inverse link function `h` and s=`sample_weight` (for - ``sample_weight=None``, one has s=1 and sum(s)=`n_samples`). - For ``P1=P2='identity'`` (``P1=None``, ``P2=None``), the penalty is the - elastic net:: + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). 
+ For ``P1=P2='identity'``, the penalty is the elastic net:: alpha * l1_ratio * ||w||_1 + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 - If you are interested in controlling the L1 and L2 penalty + If you are interested in controlling the L1 and L2 penalties separately, keep in mind that this is equivalent to:: a * L1 + b * L2 @@ -897,9 +1289,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): alpha = a + b and l1_ratio = a / (a + b) - The parameter `l1_ratio` corresponds to alpha in the glmnet R package while - 'alpha' corresponds to the lambda parameter in glmnet. Specifically, - l1_ratio = 1 is the lasso penalty. + The parameter ``l1_ratio`` corresponds to alpha in the R package glmnet, + while ``alpha`` corresponds to the lambda parameter in glmnet. + Specifically, l1_ratio = 1 is the lasso penalty. Read more in the :ref:`User Guide `. @@ -948,8 +1340,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The distributional assumption of the GLM, i.e. which distribution from the EDM, specifies the loss function to be minimized. - link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, - optional (default='auto') + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, \ + optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). Option 'auto' sets the link depending on the chosen family as follows: @@ -982,7 +1374,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 'cd' Coordinate descent algorithm. It can deal with L1 as well as L2 - penalties. + penalties. Note that in order to avoid unnecessary memory + duplication of the X argument in the ``fit`` method, X should be + directly passed as a Fortran-contiguous numpy array or sparse csc + matrix. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -990,8 +1385,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` - where ``g_i`` is the i-th component of the gradient (derivative of - the objective function). + where g_i is the i-th component of the gradient (derivative) of + the objective function. For the cd solver, covergence is reached + when ``sum_i(|minimum of norm of g_i|)``, where g_i is the + subgradient of the objective. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1007,7 +1404,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 'irls' Start values of mu are calculated by family.starting_mu(..). Then, - one step of irls obtains start values for ``coef_`. This gives + one step of irls obtains start values for ``coef_``. This gives usually good results. 'least_squares' @@ -1042,6 +1439,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. + diag_fisher : boolean, (default=False) + Only relevant for solver 'cd'. If ``False``, the full Fisher matrix + (expected Hessian) is computed in each outer iteretion (Newton + iteration). If ``True``, only a diagonal matrix (stored as 1d array) is + computed, such that fisher = X.T @ diag @ X. This saves memory and + matrix-matrix multiplications, but needs more matrix-vector + multiplications. 
If you use large sparse X or if you have many + features, i.e. n_features >> n_samples, you might set this option to + ``True``. + copy_X : boolean, optional, default True If ``True``, X will be copied; else, it may be overwritten. @@ -1056,40 +1463,43 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Attributes ---------- coef_ : array, shape (n_features,) - Estimated coefficients for the linear predictor (X*coef_) in the GLM. + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. dispersion_ : float - The dispersion parameter :math:`\\phi` if fit_dispersion is set. + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. n_iter_ : int - Actual number of iterations of the solver. + Actual number of iterations used in solver. Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - (penalized) maximum likelihood which is equivalent to minimizing the - deviance. + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. - For `alpha` > 0, the feature matrix `X` should be standardized in order to + For alpha > 0, the feature matrix X should be standardized in order to penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - If the target `y` is a ratio, appropriate sample weights `s` should be + If the target y is a ratio, appropriate sample weights s should be provided. - As an example, consider Poission distributed counts `z` (integers) and - weights `s=exposure` (time, money, persons years, ...). Then you fit - `y = z/s`, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + As an example, consider Poission distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, - in this case one might say that `y` has a 'scaled' Poisson distributions. + in this case one might say that y has a 'scaled' Poisson distributions. The same holds for other distributions. 
References @@ -1104,8 +1514,8 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, start_params='irls', - selection='cyclic', random_state=None, copy_X=True, - check_input=True, verbose=0): + selection='cyclic', random_state=None, diag_fisher=False, + copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.l1_ratio = l1_ratio self.P1 = P1 @@ -1121,6 +1531,7 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', self.start_params = start_params self.selection = selection self.random_state = random_state + self.diag_fisher = diag_fisher self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -1154,11 +1565,13 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 1.1 validate arguments of fit ####################################### _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - dtype=_dtype, y_numeric=True, multi_output=False) + X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) # Without converting y to float, deviance might raise # ValueError: Integers to negative integer powers are not allowed. - y = y.astype(np.float64) + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) weights = _check_weights(sample_weight, y.shape[0]) @@ -1263,7 +1676,7 @@ def fit(self, X, y, sample_weight=None): " correct length;" " got(start_params={0})".format(start_params)) else: - start_params = check_array(start_params, accept_sparse='csr', + start_params = check_array(start_params, accept_sparse=False, force_all_finite=True, ensure_2d=False, dtype=_dtype, copy=True) if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or @@ -1274,12 +1687,14 @@ def fit(self, X, y, sample_weight=None): .format(X.shape[1] + self.fit_intercept, start_params.shape[0], start_params.ndim)) - if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " "'random'; got (selection={0})" .format(self.selection)) random_state = check_random_state(self.random_state) + if not isinstance(self.diag_fisher, bool): + raise ValueError("The argument diag_fisher must be bool;" + " got {0}".format(self.diag_fisher)) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -1303,26 +1718,34 @@ def fit(self, X, y, sample_weight=None): "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." .format(P1.shape[0], X.shape[1])) + # If X is sparse, make P2 sparse, too. 
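The P1/P2 validation here accepts feature-wise penalties: P1 as a 1d array and P2 as a 1d array (interpreted as a diagonal matrix) or a symmetric matrix, with the intercept never penalized. A hypothetical sketch that leaves the first feature unpenalized, assuming this branch is installed; all values are made up::

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor  # added by this branch

    rng = np.random.RandomState(0)
    X = rng.randn(60, 3)
    y = X @ np.array([1.0, 0.5, 0.0]) + 0.1 * rng.randn(60)

    P1 = np.array([0.0, 1.0, 1.0])   # no L1 penalty on the first coefficient
    P2 = np.array([0.0, 1.0, 1.0])   # diagonal L2 penalty, first coefficient unpenalized
    glm = GeneralizedLinearRegressor(alpha=0.5, l1_ratio=0.5, P1=P1, P2=P2,
                                     family='normal', link='identity', solver='cd')
    glm.fit(X, y)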
if isinstance(self.P2, str) and self.P2 == 'identity': - if not sparse.issparse(X): - P2 = np.ones(X.shape[1]) - else: + if sparse.issparse(X): P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), shape=(X.shape[1], X.shape[1]))).tocsr() + else: + P2 = np.ones(X.shape[1]) else: P2 = check_array(self.P2, copy=True, - accept_sparse=['csr', 'csc', 'coo'], + accept_sparse=['csr', 'csc'], dtype=_dtype, ensure_2d=False) if P2.ndim == 1: + P2 = np.asarray(P2) if P2.shape[0] != X.shape[1]: raise ValueError("P2 should be a 1d array of shape " "(n_features,) with " "n_features=X.shape[1]; " "got (P2.shape=({0},)), needed ({1},)" .format(P2.shape[0], X.shape[1])) - elif ((P2.ndim != 2) or - (P2.shape[0] != P2.shape[1]) or - (P2.shape[0] != X.shape[1])): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and + P2.shape[0] == X.shape[1]): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(X.shape[1], X.shape[1]))).tocsr() + else: raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " "n_features=X.shape[1]; " @@ -1335,7 +1758,8 @@ def fit(self, X, y, sample_weight=None): if self.fit_intercept: # Note: intercept is first column <=> coef[0] is for intecept if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X]) + Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X], + format=X.format) else: Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) P1 = np.concatenate((np.array([0]), P1)) @@ -1343,6 +1767,7 @@ def fit(self, X, y, sample_weight=None): P2 = np.concatenate((np.array([0]), P2)) elif sparse.issparse(P2): P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), + format=P2.format, dtype=P2.dtype).tocsr() else: # as of numpy 1.13 this would work: @@ -1355,7 +1780,7 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = Xnew.shape l1 = self.alpha * self.l1_ratio - l2 = self.alpha * (1-self.l1_ratio) + l2 = self.alpha * (1 - self.l1_ratio) P1 *= l1 P2 *= l2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') @@ -1484,315 +1909,114 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # algorithms for optimiation # TODO: Parallelize it? - self.n_iter_ = 0 - converged = False + # 4.1 IRLS ############################################################ - # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' - # Obj = objective function = 1/2 Dev + l2/2 w P2 w - # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 - # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) - # D2 = link.inverse_derivative(eta)^2 = D^2 - # W = D2/V(mu) - # l2 = alpha * (1 - l1_ratio) - # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w - # = -X' D (y-mu)/V(mu) + l2 P2 w - # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 - # Use Fisher matrix instead of full info matrix -X'(...) X, - # i.e. 
E[Dev''] with E[y-mu]=0: - # Obj'' ~ X' W X + l2 P2 - # (1): w = (X' W X + l2 P2)^-1 X' W z, - # with z = eta + D^-1 (y-mu) # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') - # Note: ' denotes derivative, but also transpose for matrices if solver == 'irls': - # eta = linear predictor - eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = link.inverse(eta) - # D = h'(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - while self.n_iter_ < self.max_iter: - self.n_iter_ += 1 - # coef_old not used so far. - # coef_old = coef - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = (hp**2 / V) - # working observations - z = eta + (y-mu)/hp - # solve A*coef = b - # A = X' W X + P2, b = X' W z - coef = _irls_step(Xnew, W, P2, z) - # updated linear predictor - # do it here for updated values for tolerance - eta = safe_sparse_dot(Xnew, coef, dense_output=True) - mu = link.inverse(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - - # which tolerace? |coef - coef_old| or gradient? - # use gradient for compliance with newton-cg and lbfgs - # gradient = family._deviance_derivative( - # coef=coef, X=Xnew, y=y, weights=weights, link=link) - # gradient = -X' D (y-mu)/V(mu) + l2 P2 w - gradient = -safe_sparse_dot(Xnew.T, hp*(y-mu)/V) + coef, self.n_iter_ = \ + _irls_solver(coef=coef, X=Xnew, y=y, weights=weights, P2=P2, + family=family, link=link, max_iter=self.max_iter, + tol=self.tol) + + # 4.2 L-BFGS ########################################################## + elif solver == 'lbfgs': + def func(coef, X, y, weights, P2, family, link): + mu, dev, devp = \ + family._mu_deviance_derivative(coef, X, y, weights, link) if P2.ndim == 1: - gradient += P2*coef + L2 = P2 * coef else: - gradient += safe_sparse_dot(P2, coef) - if (np.max(np.abs(gradient)) <= self.tol): - converged = True - break - - if not converged: - warnings.warn("irls failed to converge. Increase the number " - "of iterations (currently {0})" - .format(self.max_iter), ConvergenceWarning) - - # 4.2 L-BFGS and Newton-CG ############################################ - # TODO: performance: make one function return both deviance and - # gradient of deviance - elif solver in ['lbfgs', 'newton-cg']: - def func(coef, *args): + L2 = P2 @ coef + obj = 0.5 * dev + 0.5 * (coef @ L2) + objp = 0.5 * devp + L2 + return obj, objp + + args = (Xnew, y, weights, P2, family, link) + coef, loss, info = fmin_l_bfgs_b( + func, coef, fprime=None, args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) + self.n_iter_ = info['nit'] + + # 4.3 Newton-CG ####################################################### + # We use again the fisher matrix instead of the hessian. More + # precisely, expected hessian of deviance. 
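The Newton-CG branch below never forms the expected Hessian explicitly; it only needs Hessian-vector products of the form X.T @ (diag * (X @ s)). A minimal standalone sketch of that matrix-free product (made-up data, plain numpy)::

    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)
    diag = rng.rand(100)               # plays the role of d1**2 / variance
    s = rng.randn(5)

    # matrix-free product: two matrix-vector multiplications, no (p, p) matrix built
    Hs_free = X.T @ (diag * (X @ s))

    # reference: build the full expected Hessian first
    H = X.T @ (diag[:, np.newaxis] * X)
    assert np.allclose(Hs_free, H @ s)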
+ elif solver == 'newton-cg': + def func(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = safe_sparse_dot(coef.T, P2*coef) + L2 = coef @ (P2 * coef) else: - L2 = safe_sparse_dot(coef.T, safe_sparse_dot(P2, coef)) - # A[np.diag_indices_from(A)] += P2 - return 0.5*family._deviance(coef, *args) + 0.5*L2 + L2 = coef @ (P2 @ coef) + mu = link.inverse(X @ coef) + return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 - def fprime(coef, *args): + def grad(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = P2*coef + L2 = P2 * coef else: - L2 = safe_sparse_dot(P2, coef) - return 0.5*family._deviance_derivative(coef, *args) + L2 + L2 = P2 @ coef + eta = X @ coef + mu = link.inverse(eta) + d1 = link.inverse_derivative(eta) + grad = X.T @ (d1 * family.deviance_derivative(y, mu, weights)) + return 0.5 * grad + L2 - def grad_hess(coef, X, y, weights, link): + def grad_hess(coef, X, y, weights, P2, family, link): if P2.ndim == 1: - L2 = P2*coef + L2 = P2 * coef else: - L2 = safe_sparse_dot(P2, coef) - grad = 0.5*family._deviance_derivative( - coef, X, y, weights, link) + L2 - hessian = 0.5*family._deviance_hessian( - coef, X, y, weights, link) - if P2.ndim == 1: - hessian[np.diag_indices_from(hessian)] += P2 - else: - hessian = hessian + P2 + L2 = P2 @ coef + eta = X @ coef + mu = link.inverse(eta) + d1 = link.inverse_derivative(eta) + grad = 0.5 * \ + (X.T @ (d1 * family.deviance_derivative(y, mu, weights))) \ + + L2 + # expected hessian = X.T @ diag_matrix @ X + # calculate only diag_matrix + diag = d1**2 / family.variance(mu, phi=1, weights=weights) def Hs(s): - ret = safe_sparse_dot(hessian, s) + ret = 0.5 * (X.T @ (diag * (X @ s))) + if P2.ndim == 1: + ret += P2 * s + else: + ret += P2 @ s return ret + return grad, Hs - args = (Xnew, y, weights, link) - - if solver == 'lbfgs': - coef, loss, info = fmin_l_bfgs_b( - func, coef, fprime=fprime, args=args, - iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}" - .format(info["task"])) - self.n_iter_ = info['nit'] - elif solver == 'newton-cg': - coef, n_iter_i = newton_cg(grad_hess, func, fprime, coef, - args=args, maxiter=self.max_iter, - tol=self.tol) - - # 4.3 coordinate descent ############################################## - # Reference: Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - # An Improved GLMNET for L1-regularized Logistic Regression, - # Journal of Machine Learning Research 13 (2012) 1999-2030 - # Note: Use Fisher matrix instead of Hessian for H - # - # 1. find optimal descent direction d by minimizing - # min_d F(w+d) = min_d F(w+d) - F(w) - # F = f + g, f(w) = 1/2 deviance, g(w) = 1/2 w*P2*w + ||P1*w||_1 - # 2. quadrdatic approximation of F(w+d)-F(w) = q(d): - # using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives - # q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d - # + ||P1*(w+d)||_1 - ||P1*w||_1 - # min_d q(d) - # 3. 
coordinate descent by updating coordinate j (d -> d+z*e_j): - # min_z q(d+z*e_j) - # = min_z q(d+z*e_j) - q(d) - # = min_z A_j z + 1/2 B_jj z^2 - # + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 - # A = f'(w) + d*H(w) + (w+d)*P2 - # B = H+P2 - # Note: we already set P2 = l2*P2, P1 = l1*P1, see above + args = (Xnew, y, weights, P2, family, link) + coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) + + # 4.4 coordinate descent ############################################## + # Note: we already set P1 = l1*P1, see above + # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') - # Note: f' = -score, H = Fisher matrix elif solver == 'cd': - # line search parameters - (beta, sigma) = (0.5, 0.01) - # max inner loops (cycles through all features) - max_inner_iter = 1000 - # some precalculations - eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) - # set up space for search direction d for inner loop - d = np.zeros_like(coef) - # initial stopping tolerance of inner loop - # use L1-norm of minimum-norm of subgradient of F - # fp_wP2 = f'(w) + w*P2 - if P2.ndim == 1: - fp_wP2 = -score + coef*P2 - else: - fp_wP2 = -score + safe_sparse_dot(coef, P2) - inner_tol = (np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2+np.sign(coef)*P1)) - inner_tol = linalg.norm(inner_tol, ord=1) - # outer loop - while self.n_iter_ < self.max_iter: - self.n_iter_ += 1 - # initialize search direction d (to be optimized) with zero - d.fill(0) - # inner loop - # TODO: use sparsity (coefficient already 0 due to L1 penalty) - # => active set of features for featurelist, see paper - # of Improved GLMNET or Gap Safe Screening Rules - # https://arxiv.org/abs/1611.05780 - # A = f'(w) + d*H(w) + (w+d)*P2 - # B = H+P2 - # Note: f'=-score and H=fisher are updated at the end of outer - # iteration - B = fisher - if P2.ndim == 1: - coef_P2 = coef * P2 - B[np.diag_indices_from(B)] += P2 - else: - coef_P2 = safe_sparse_dot(coef, P2) - B = B + P2 - A = -score + coef_P2 # + d*(H+P2) but d=0 so far - inner_iter = 0 - while inner_iter < max_inner_iter: - inner_iter += 1 - if self.selection == 'random': - featurelist = random_state.permutation(n_features) - else: - featurelist = np.arange(n_features) - for j in featurelist: - # minimize_z: a z + 1/2 b z^2 + c |d+z| - # a = A_j - # b = B_jj > 0 - # c = |P1_j| = P1_j > 0, see 1.3 - # d = w_j + d_j - # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) - # with beta = z+d, beta_hat = d-a/b and gamma = c/b - # z = 1/b * S(bd-a,c) - d - # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding - a = A[j] - b = B[j, j] - if P1[j] == 0: - if b == 0: - z = 0 - else: - z = -a/b - elif a + P1[j] < b * (coef[j]+d[j]): - if b == 0: - z = 0 - else: - z = -(a + P1[j])/b - elif a - P1[j] > b * (coef[j]+d[j]): - if b == 0: - z = 0 - else: - z = -(a - P1[j])/b - else: - z = -(coef[j] + d[j]) - # update direction d - d[j] += z - # update A because d_j is now d_j+z - # A = f'(w) + d*H(w) + (w+d)*P2 - # => A += (H+P2)*e_j z = B_j * z - # Note: B is symmetric B = B.transpose - if sparse.issparse(B): - if sparse.isspmatrix_csc(B): - # slice columns - A += B[:, j].toarray().ravel() * z - else: - # slice rows - A += B[j, :].toarray().ravel() * z - else: - A += B[j, :] * z - # end of cycle - # stopping criterion for inner loop - # sum_i(|minimum-norm subgrad of q(d)_i|) - mn_subgrad = (np.where(coef + d == 0, - np.sign(A)*np.maximum(np.abs(A)-P1, 0), - A+np.sign(coef+d)*P1)) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= inner_tol: - if inner_iter == 1: - inner_tol = inner_tol/4. - break - # end of inner loop - # line search by sequence beta^k, k=0, 1, .. - # F(w + lambda d) - F(w) <= lambda * bound - # bound = sigma * (f'(w)*d + w*P2*d - # +||P1 (w+d)||_1 - ||P1 w||_1) - P1w_1 = linalg.norm(P1*coef, ord=1) - # Note: coef_P2 already calculated and still valid - bound = sigma * ( - safe_sparse_dot(-score, d) + - safe_sparse_dot(coef_P2, d) + - linalg.norm(P1*(coef+d), ord=1) - - P1w_1) - Fw = (0.5 * family.deviance(y, mu, weights) + - 0.5 * safe_sparse_dot(coef_P2, coef) + - P1w_1) - la = 1./beta - for k in range(20): - la *= beta # starts with la=1 - mu_wd = link.inverse(safe_sparse_dot(Xnew, coef+la*d, - dense_output=True)) - Fwd = (0.5 * family.deviance(y, mu_wd, weights) + - linalg.norm(P1*(coef+la*d), ord=1)) - if P2.ndim == 1: - Fwd += 0.5 * safe_sparse_dot((coef+la*d)*P2, coef+la*d) - else: - Fwd += 0.5 * (safe_sparse_dot(coef+la*d, - safe_sparse_dot(P2, coef+la*d))) - if Fwd-Fw <= sigma*la*bound: - break - # update coefficients - # coef_old = coef.copy() - coef += la * d - # calculate eta, mu, score, Fisher matrix for next iteration - eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=Xnew, y=y, weights=weights, link=link) - # stopping criterion for outer loop - # sum_i(|minimum-norm subgrad of F(w)_i|) - # fp_wP2 = f'(w) + w*P2 - # Note: eta, mu and score are already updated - if P2.ndim == 1: - fp_wP2 = -score + coef*P2 - else: - fp_wP2 = -score + safe_sparse_dot(coef, P2) - mn_subgrad = (np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2+np.sign(coef)*P1)) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= self.tol: - converged = True - break - # end of outer loop - if not converged: - warnings.warn("Coordinate descent failed to converge. 
Increase" - " the number of iterations (currently {0})" - .format(self.max_iter), ConvergenceWarning) + # For coordinate descent, if X is sparse, it should be csc format + # If X is sparse, P2 must also be csc + if sparse.issparse(Xnew): + Xnew = Xnew.tocsc(copy=self.copy_X) + P2 = sparse.csc_matrix(P2) + + coef, self.n_iter_, self._n_cycles = \ + _cd_solver(coef=coef, X=Xnew, y=y, weights=weights, P1=P1, + P2=P2, family=family, link=link, + max_iter=self.max_iter, tol=self.tol, + selection=self.selection, random_state=random_state, + diag_fisher=self.diag_fisher, copy_X=self.copy_X) ####################################################################### # 5. postprocessing # @@ -1828,8 +2052,7 @@ def linear_predictor(self, X): X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', copy=True, ensure_2d=True, allow_nd=False) - return safe_sparse_dot(X, self.coef_, - dense_output=True) + self.intercept_ + return X @ self.coef_ + self.intercept_ def predict(self, X, sample_weight=None): """Predict uing GLM with feature matrix X. @@ -1884,7 +2107,7 @@ def estimate_phi(self, X, y, sample_weight=None): dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape weights = _check_weights(sample_weight, n_samples) - eta = safe_sparse_dot(X, self.coef_, dense_output=True) + eta = X @ self.coef_ if self.fit_intercept is True: eta += self.intercept_ n_features += 1 @@ -1907,19 +2130,19 @@ def estimate_phi(self, X, y, sample_weight=None): # "AssertionError: -0.28014056555724598 not greater than 0.5" # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): - r"""Compute D^2, the percentage of deviance explained. + """Compute D^2, the percentage of deviance explained. D^2 is a generalization of the coefficient of determination R^2. R^2 uses squared error and D^2 deviance. Note that those two are equal for family='normal'. D^2 is defined as - :math:`D^2 = 1-\frac{D(y_{true},y_{pred})}{D_{null}}`, :math:`D_{null}` - is the null deviance, i.e. the deviance of a model with intercept - alone which corresponds to :math:`y_{pred} = \bar{y}`. The mean - :math:`\bar{y}` is averaged by sample_weight. - Best possible score is 1.0 and it can be negative (because the - model can be arbitrarily worse). + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). 
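D^2 as defined above reduces to the ordinary R^2 for the normal family, since the unit deviance is then the squared error. A minimal sketch of the computation (made-up data, plain numpy)::

    import numpy as np

    rng = np.random.RandomState(0)
    y = 10 * rng.rand(20)
    y_pred = y + rng.randn(20)

    dev = np.sum((y - y_pred) ** 2)            # deviance of the fitted values
    dev_null = np.sum((y - y.mean()) ** 2)     # intercept-only model predicts mean(y)
    d2 = 1 - dev / dev_null
    # for family='normal' this equals sklearn.metrics.r2_score(y, y_pred)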
Parameters ---------- diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index de0857a34fe3a..ae8a5f4cfc5e7 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -138,16 +138,18 @@ def test_sample_weights_validation(): glm.fit(X, y, weights) -def test_glm_family_argument(): +@pytest.mark.parametrize('f, fam', + [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution()), + ('binomial', BinomialDistribution())]) +def test_glm_family_argument(f, fam): """Test GLM family argument set as string.""" - y = np.array([1, 2]) + y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - for (f, fam) in [('normal', NormalDistribution()), - ('poisson', PoissonDistribution()), - ('gamma', GammaDistribution()), - ('inverse.gaussian', InverseGaussianDistribution())]: - glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) - assert_equal(type(glm._family_instance), type(fam)) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) @@ -155,14 +157,16 @@ def test_glm_family_argument(): glm.fit(X, y) -def test_glm_link_argument(): +@pytest.mark.parametrize('l, link', + [('identity', IdentityLink()), + ('log', LogLink()), + ('logit', LogitLink())]) +def test_glm_link_argument(l, link): """Test GLM link argument set as string.""" - y = np.array([1, 2]) + y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - for (l, link) in [('identity', IdentityLink()), - ('log', LogLink())]: - glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) - assert_equal(type(glm._link_instance), type(link)) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) glm = GeneralizedLinearRegressor(family='normal', link='not a link') with pytest.raises(ValueError): @@ -317,6 +321,16 @@ def test_glm_random_state_argument(random_state): glm.fit(X, y) +@pytest.mark.parametrize('diag_fisher', ['not bool', 1, 0, [True]]) +def test_glm_diag_fisher_argument(diag_fisher): + """Test GLM for invalid diag_fisher arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) + with pytest.raises(ValueError): + glm.fit(X, y) + + @pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) def test_glm_copy_X_argument(copy_X): """Test GLM for invalid copy_X arguments.""" @@ -453,7 +467,12 @@ def test_normal_ridge(solver): assert_array_almost_equal(glm.predict(T), ridge.predict(T)) -def test_poisson_ridge(): +@pytest.mark.parametrize('solver, decimal, tol', + [('irls', 7, 1e-8), + ('lbfgs', 5, 1e-7), + ('newton-cg', 5, 1e-7), + ('cd', 7, 1e-8)]) +def test_poisson_ridge(solver, decimal, tol): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" @@ -470,22 +489,20 @@ def test_poisson_ridge(): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - s_dec = {'irls': 7, 'lbfgs': 5, 'newton-cg': 5, 'cd': 7} - s_tol = {'irls': 1e-8, 'lbfgs': 1e-7, 'newton-cg': 1e-7, 'cd': 1e-8} - for solver in ['irls', 'lbfgs', 'newton-cg', 'cd']: - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, - fit_intercept=True, family='poisson', - link='log', tol=s_tol[solver], - solver=solver, max_iter=300, - random_state=42) - glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=s_dec[solver]) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=s_dec[solver]) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=tol, + solver=solver, max_iter=300, + random_state=42) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=decimal) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=decimal) -def test_normal_enet(): +@pytest.mark.parametrize('diag_fisher', [False, True]) +def test_normal_enet(diag_fisher): """Test elastic net regression with normal/gaussian family.""" rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 @@ -494,12 +511,14 @@ def test_normal_enet(): beta = rng.randn(n_features) y = 2 + np.dot(X, beta) + rng.randn(n_samples) + # 1. test normal enet on dense data glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, family='normal', link='identity', fit_intercept=True, tol=1e-8, max_iter=100, selection='cyclic', solver='cd', start_params='zero', - check_input=False) + check_input=False, + diag_fisher=diag_fisher) glm.fit(X, y) enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, @@ -509,6 +528,12 @@ def test_normal_enet(): assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + # 2. test normal enet on sparse data + X = sparse.csc_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + def test_poisson_enet(): """Test elastic net regression with poisson family and LogLink. From a6f9f13db9dd11d01d1dcab93819e67d5ca18b9d Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 20 Apr 2019 15:59:39 +0200 Subject: [PATCH 050/269] Treat the intercept separately, i.e. X, P1, P2 never include intercept --- sklearn/linear_model/glm.py | 637 +++++++++++++++---------- sklearn/linear_model/tests/test_glm.py | 6 +- 2 files changed, 386 insertions(+), 257 deletions(-) diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index b2de866a4b69d..2afd1ddf8c79c 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -7,8 +7,6 @@ # License: BSD 3 clause # TODO: Write more examples. -# TODO: Make option self.copy_X more meaningful. -# So far, fit uses Xnew instead of X. # TODO: Should the option `normalize` be included (like other linear models)? # So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support? @@ -39,6 +37,7 @@ # sklearn.linear_models uses w for coefficients, standard literature on # GLMs use beta for coefficients and w for (sample) weights. # So far, coefficients=w and sample weights=s. +# - The intercept term is the first index, i.e. 
coef[0] from __future__ import division @@ -86,6 +85,83 @@ def _check_weights(sample_weight, n_samples): return weights +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _safe_sandwich_dot(X, d, intercept=False): + """Compute sandwich product X.T @ diag(d) @ X. + + With ``intercept=True``, X is treated as if a column of 1 were appended as + first column of X. + X can be sparse, d must be an ndarray. Always returns a ndarray.""" + if sparse.issparse(X): + temp = (X.transpose().multiply(d) @ X).toarray() + else: + temp = (X.T * d) @ X + if intercept: + dim = X.shape[1] + 1 + if sparse.issparse(X): + order = 'F' if sparse.isspmatrix_csc(X) else 'C' + else: + order = 'F' if X.flags['F_CONTIGUOUS'] else 'C' + res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) + res[0, 0] = d.sum() + res[1:, 0] = d @ X + res[0, 1:] = res[1:, 0] + res[1:, 1:] = temp + else: + res = temp + return res + + +def _min_norm_sugrad(coef, grad, P2, P1): + """Compute the gradient of all subgradients with minimal L2-norm. + + subgrad = grad + P2 * coef + P1 * subgrad(|coef|_1) + + g_i = grad_i + (P2*coef)_i + + if coef_i > 0: g_i + P1_i + if coef_i < 0: g_i - P1_i + if coef_i = 0: sign(g_i) * max(|g_i|-P1_i, 0) + + Parameters + ---------- + coef : ndarray + coef[0] may be intercept. + + grad : ndarray, shape=coef.shape + + P2 : {1d or 2d array, None} + always without intercept, ``None`` means P2 = 0 + + P1 : ndarray + always without intercept + """ + intercept = (coef.size == P1.size + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + # compute grad + coef @ P2 without intercept + grad_wP2 = grad[idx:].copy() + if P2 is None: + pass + elif P2.ndim == 1: + grad_wP2 += coef[idx:] * P2 + else: + grad_wP2 += coef[idx:] @ P2 + res = np.where(coef[idx:] == 0, + np.sign(grad_wP2) * np.maximum(np.abs(grad_wP2) - P1, 0), + grad_wP2 + np.sign(coef[idx:]) * P1) + if intercept: + return np.concatenate(([grad[0]], res)) + else: + return res + + class Link(metaclass=ABCMeta): """Abstract base class for Link funtions.""" @@ -473,13 +549,16 @@ def starting_mu(self, y, weights=1, ind_weight=0.5): (1. - ind_weight) * np.average(y, weights=weights)) def _mu_deviance_derivative(self, coef, X, y, weights, link): - """Compute mu, the deviance and it's derivative w.r.t coef.""" - lin_pred = X @ coef + """Compute mu and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) - dev = self.deviance(y, mu, weights) d1 = link.inverse_derivative(lin_pred) - devp = X.T @ (d1 * self.deviance_derivative(y, mu, weights)) - return mu, dev, devp + temp = d1 * self.deviance_derivative(y, mu, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # sampe as X.T @ temp + return mu, devp def _score(self, coef, phi, X, y, weights, link): r"""Compute the score function. @@ -499,12 +578,15 @@ def _score(self, coef, phi, X, y, weights, link): :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. Note: The derivative of the deviance w.r.t. coef equals -2 * score. 
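As a reading aid (not part of the patch): the helpers above keep the intercept out of X, with coef[0] holding the intercept. The throwaway numpy check below shows that the concatenated gradient [temp.sum(), temp @ X] matches the usual formulation with an explicit column of ones; all names here are illustrative.

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(8, 3)
temp = rng.randn(8)                 # e.g. d1 * deviance_derivative
coef = rng.randn(4)                 # coef[0] is the intercept

# what _safe_lin_pred and the concatenated gradient compute
lin_pred = X @ coef[1:] + coef[0]
devp = np.concatenate(([temp.sum()], temp @ X))

# same result with an explicit column of ones prepended to X
X1 = np.hstack([np.ones((X.shape[0], 1)), X])
assert np.allclose(lin_pred, X1 @ coef)
assert np.allclose(devp, X1.T @ temp)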
""" - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) d = link.inverse_derivative(lin_pred) temp = sigma_inv * d * (y - mu) - score = X.T @ temp + if coef.size == X.shape[1] + 1: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp return score def _fisher_matrix(self, coef, phi, X, y, weights, link): @@ -526,14 +608,14 @@ def _fisher_matrix(self, coef, phi, X, y, weights, link): with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, see func:`_score`. """ - n_samples = X.shape[0] - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) - d2 = link.inverse_derivative(lin_pred)**2 - d2_sigma_inv = sparse.dia_matrix((sigma_inv*d2, 0), - shape=(n_samples, n_samples)) - fisher_matrix = X.T @ d2_sigma_inv @ X + d = link.inverse_derivative(lin_pred) + d2_sigma_inv = sigma_inv * d * d + intercept = (coef.size == X.shape[1] + 1) + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) return fisher_matrix def _observed_information(self, coef, phi, X, y, weights, link): @@ -559,17 +641,17 @@ def _observed_information(self, coef, phi, X, y, weights, link): \right)`, see :func:`score_` function and :func:`_fisher_matrix`. """ - n_samples = X.shape[0] - lin_pred = X @ coef + lin_pred = _safe_lin_pred(X, coef) mu = link.inverse(lin_pred) sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) dp = link.inverse_derivative2(lin_pred) d2 = link.inverse_derivative(lin_pred)**2 v = self.unit_variance_derivative(mu)/self.unit_variance(mu) r = y - mu - temp = sparse.dia_matrix((sigma_inv*(-dp*r+d2*v*r+d2), 0), - shape=(n_samples, n_samples)) - observed_information = X.T @ temp @ X + temp = sigma_inv * (-dp * r + d2 * v * r + d2) + intercept = (coef.size == X.shape[1] + 1) + observed_information = _safe_sandwich_dot(X, temp, + intercept=intercept) return observed_information def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, @@ -600,28 +682,29 @@ def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, an array of shape (X.shape[1], X.shape[1]) * If diag_fisher is ``True`, an array of shape (X.shape[0]) """ - n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) # eta = linear predictor - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) d1 = link.inverse_derivative(eta) # = h'(eta) # Alternatively: # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g # d1 = 1./link.derivative(mu) - score = X.T @ (sigma_inv * d1 * (y - mu)) - # - d2_sigma_inv = sigma_inv * (d1**2) + d1_sigma_inv = d1 * sigma_inv + temp = d1_sigma_inv * (y - mu) + if intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X + + d2_sigma_inv = d1 * d1_sigma_inv if diag_fisher: - return eta, mu, score, d2_sigma_inv + fisher_matrix = d2_sigma_inv else: - if sparse.issparse(X): - d2_sigma_inv = sparse.dia_matrix((d2_sigma_inv, 0), - shape=(n_samples, n_samples)) - fisher = (X.T @ d2_sigma_inv @ X).toarray() - else: - fisher = (X.T * d2_sigma_inv) @ X - return eta, mu, score, fisher + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) + return eta, mu, score, fisher_matrix class TweedieDistribution(ExponentialDispersionModel): @@ -809,7 +892,7 @@ def unit_deviance(self, y, mu): return 2 * (special.xlogy(y, y/mu) 
+ special.xlogy(1-y, (1-y)/(1-mu))) -def _irls_step(X, W, P2, z): +def _irls_step(X, W, P2, z, fit_intercept=True): """Compute one step in iteratively reweighted least squares. Solve A w = b for w with @@ -829,43 +912,57 @@ def _irls_step(X, W, P2, z): P2 : {ndarray, sparse matrix}, shape (n_features, n_features) The L2-penalty matrix or vector (=diagonal matrix) - z : ndarray, shape (n_samples,) + z : ndarray, shape (n_samples,) Working observations + fit_intercept : boolean, optional (default=True) + Returns ------- - coef: ndarray, shape (X.shape[1]) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. """ # Note: solve vs least squares, what is more appropriate? # scipy.linalg.solve seems faster, but scipy.linalg.lstsq # is more robust. - n_samples, n_features = X.shape - if sparse.issparse(X): - W = sparse.dia_matrix((W, 0), shape=(n_samples, n_samples)).tocsr() + # Note: X.T @ W @ X is not sparse, even when X is sparse. + # Sparse solver would splinalg.spsolve(A, b) or splinalg.lsmr(A, b) + if fit_intercept: + Wz = W * z + if sparse.issparse(X): + b = np.concatenate(([Wz.sum()], X.transpose() @ Wz)) + else: + b = np.concatenate(([Wz.sum()], X.T @ Wz)) + A = _safe_sandwich_dot(X, W, intercept=fit_intercept) if P2.ndim == 1: - L2 = (sparse.dia_matrix((P2, 0), shape=(n_features, n_features)) - ).tocsr() + idx = np.arange(start=1, stop=A.shape[0]) + A[(idx, idx)] += P2 # add to diag elements without intercept + elif sparse.issparse(P2): + A[1:, 1:] += P2.toarray() else: - L2 = sparse.csr_matrix(P2) - XtW = X.transpose() * W - A = XtW * X + L2 - b = XtW * z - # coef = splinalg.spsolve(A, b) - coef, *_ = splinalg.lsmr(A, b) + A[1:, 1:] += P2 else: - XtW = (X.T * W) - A = XtW.dot(X) + if sparse.issparse(X): + XtW = X.transpose().multiply(W) + A = (XtW @ X).toarray() + else: + XtW = (X.T * W) + A = XtW @ X + b = XtW @ z if P2.ndim == 1: A[np.diag_indices_from(A)] += P2 + elif sparse.issparse(P2): + A += P2.toarray() else: A += P2 - b = XtW.dot(z) - # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) - coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) return coef -def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): +def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, + max_iter, tol): """Solve GLM with L2 penalty by IRLS algorithm. Note: If X is sparse, P2 must also be sparse. @@ -889,7 +986,7 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): # Note: ' denotes derivative, but also transpose for matrices # eta = linear predictor - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) # D = h'(eta) hp = link.inverse_derivative(eta) @@ -906,10 +1003,10 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): z = eta + (y - mu) / hp # solve A*coef = b # A = X' W X + P2, b = X' W z - coef = _irls_step(X, W, P2, z) + coef = _irls_step(X, W, P2, z, fit_intercept=fit_intercept) # updated linear predictor # do it here for updated values for tolerance - eta = X @ coef + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) hp = link.inverse_derivative(eta) V = family.variance(mu, phi=1, weights=weights) @@ -917,11 +1014,18 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): # which tolerace? |coef - coef_old| or gradient? 
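For orientation, a minimal standalone sketch of the working weights W and working observations z used above, written for a Poisson GLM with log link and a tiny ridge penalty. Unlike the solver above it uses an explicit intercept column for brevity; it is illustrative only and not the code path of _irls_solver.

import numpy as np

X = np.array([[1., 0.], [1., 1.], [1., 2.], [1., 3.]])  # explicit intercept column
y = np.array([0., 1., 1., 2.])
P2 = np.diag([0., 1e-3])                                 # no penalty on the intercept
coef = np.zeros(2)

for _ in range(20):
    eta = X @ coef
    mu = np.exp(eta)            # inverse of the log link
    hp = mu                     # h'(eta) = exp(eta)
    W = hp**2 / mu              # unit variance of Poisson is mu
    z = eta + (y - mu) / hp
    A = X.T @ (W[:, None] * X) + P2
    b = X.T @ (W * z)
    coef_new = np.linalg.solve(A, b)
    if np.max(np.abs(coef_new - coef)) < 1e-10:
        coef = coef_new
        break
    coef = coef_new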
# use gradient for compliance with newton-cg and lbfgs # gradient = -X' D (y-mu)/V(mu) + l2 P2 w - gradient = -(X.T @ (hp*(y-mu)/V)) + temp = hp * (y - mu) / V + if sparse.issparse(X): + gradient = -(X.transpose() @ temp) + else: + gradient = -(X.T @ temp) + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - gradient += P2*coef + gradient += P2 * coef[idx:] else: - gradient += P2 @ coef + gradient += P2 @ coef[idx:] + if fit_intercept: + gradient = np.concatenate(([-temp.sum()], gradient)) if (np.max(np.abs(gradient)) <= tol): converged = True break @@ -937,7 +1041,7 @@ def _irls_solver(coef, X, y, weights, P2, family, link, max_iter, tol): def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, max_inner_iter=1000, selection='cyclic', random_state=None, diag_fisher=False): - """Compute inner loop of coordinate descent = cycles through features. + """Compute inner loop of coordinate descent, i.e. cycles through features. Minimization of 1-d subproblems:: @@ -953,24 +1057,31 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, # of Improved GLMNET or Gap Safe Screening Rules # https://arxiv.org/abs/1611.05780 n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept B = fisher if P2.ndim == 1: - coef_P2 = coef * P2 + coef_P2 = coef[idx:] * P2 if not diag_fisher: - B[np.diag_indices_from(B)] += P2 + idiag = np.arange(start=idx, stop=B.shape[0]) + # B[np.diag_indices_from(B)] += P2 + B[(idiag, idiag)] += P2 else: - coef_P2 = P2 @ coef # P2 is symmetric, mat @ vec is usually faster + coef_P2 = coef[idx:] @ P2 if not diag_fisher: if sparse.issparse(P2): - B += P2.toarray() + B[idx:, idx:] += P2.toarray() else: - B += P2 - A = -score + coef_P2 # + d @ (H+P2) but d=0 so far + B[idx:, idx:] += P2 + # A = -score + coef_P2 + A = -score + A[idx:] += coef_P2 + # A += d @ (H+P2) but so far d=0 # inner loop - inner_iter = 0 - while inner_iter < max_inner_iter: + for inner_iter in range(1, max_inner_iter+1): inner_iter += 1 n_cycles += 1 + # cycle through features, update intercept separately at the end if selection == 'random': featurelist = random_state.permutation(n_features) else: @@ -985,70 +1096,85 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, # with beta = z+d, beta_hat = d-a/b and gamma = c/b # z = 1/b * S(bd-a,c) - d # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding - a = A[j] + jdx = j+idx # index for arrays containing entries for intercept + a = A[jdx] if diag_fisher: + # Note: fisher is ndarray of shape (n_samples,) => no idx + # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway + Bj = np.zeros_like(A) + if intercept: + Bj[0] = fisher.sum() if sparse.issparse(X): - xj = X[:, j] - b = xj.transpose() @ xj.multiply(fisher[:, np.newaxis]) - b = b[0, 0] + Bj[idx:] = (X[:, j].transpose().multiply(fisher) @ X + ).toarray().ravel() else: - b = X[:, j] @ (fisher * X[:, j]) + Bj[idx:] = (fisher * X[:, j]) @ X if P2.ndim == 1: - b += P2[j] + Bj[idx:] += P2[j] else: - b += P2[j, j] + if sparse.issparse(P2): + # slice columns as P2 is csc + Bj[idx:] += P2[:, j].toarray().ravel() + else: + Bj[idx:] += P2[:, j] + b = Bj[jdx] else: - b = B[j, j] + b = B[jdx, jdx] + # those ten lines aree what it is all about if b <= 0: z = 0 elif P1[j] == 0: z = -a/b - elif a + P1[j] < b * (coef[j] + d[j]): + elif a + P1[j] < b * (coef[jdx] + d[jdx]): z = -(a + P1[j])/b - elif a - P1[j] > b * (coef[j] + d[j]): + elif a - P1[j] > b * 
(coef[jdx] + d[jdx]): z = -(a - P1[j])/b else: - z = -(coef[j] + d[j]) + z = -(coef[jdx] + d[jdx]) # update direction d - d[j] += z + d[jdx] += z # update A because d_j is now d_j+z # A = f'(w) + d*H(w) + (w+d)*P2 # => A += (H+P2)*e_j z = B_j * z # Note: B is symmetric B = B.transpose if diag_fisher: - if sparse.issparse(X): - A += (X.transpose() @ - X[:, j].multiply(fisher[:, np.newaxis]) - ).toarray().ravel() * z - else: - # A += (X.T @ (fisher * X[:, j])) * z - # same without transpose of X - A += ((fisher * X[:, j]) @ X) * z - - if P2.ndim == 1: - A[j] += P2[j] * z - elif sparse.issparse(P2): - # slice columns as P2 is csc - A += P2[:, j].toarray().ravel() * z - else: - A += P2[:, j] * z + # Bj = B[:, j] calculated above, still valid + A += Bj * z else: # B is symmetric, C- or F-contiguous, but never sparse if B.flags['F_CONTIGUOUS']: # slice columns like for sparse csc - A += B[:, j] * z + A += B[:, jdx] * z else: # B.flags['C_CONTIGUOUS'] might be true # slice rows - A += B[j, :] * z - # end of cycle + A += B[jdx, :] * z + # end of cycle over features + # update intercept + if intercept: + if diag_fisher: + Bj = np.zeros_like(A) + Bj[0] = fisher.sum() + Bj[1:] = fisher @ X + b = Bj[0] + else: + b = B[0, 0] + z = 0 if b <= 0 else -A[0]/b + d[0] += z + if diag_fisher: + A += Bj * z + else: + if B.flags['F_CONTIGUOUS']: + A += B[:, 0] * z + else: + A += B[0, :] * z + # end of complete cycle # stopping criterion for inner loop # sum_i(|minimum of norm of subgrad of q(d)_i|) - mn_subgrad = np.where(coef + d == 0, - np.sign(A) * np.maximum(np.abs(A) - P1, 0), - A + np.sign(coef + d) * P1) + # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 + mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= inner_tol: if inner_iter == 1: @@ -1058,7 +1184,7 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, return d, coef_P2, n_cycles, inner_tol -def _cd_solver(coef, X, y, weights, P1, P2, family, link, +def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, max_iter=100, max_inner_iter=1000, tol=1e-4, selection='cyclic ', random_state=None, diag_fisher=False, copy_X=True): @@ -1083,7 +1209,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 A = f'(w) + d*H(w) + (w+d)*P2 - B = H+P2 + B = H + P2 Repeat steps 1-3 until convergence. Note: Use Fisher matrix instead of Hessian for H. @@ -1091,7 +1217,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Parameters ---------- - coef: ndarray, shape (n_features,) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present). If not sparse, @@ -1112,6 +1240,10 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, The L2-penalty matrix or vector (=diagonal matrix). If a matrix is passed, it must be symmetric. If X is sparse, P2 must also be sparse. + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + family : ExponentialDispersionModel link : Link @@ -1120,8 +1252,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Maximum numer of outer (Newton) iterations. max_inner_iter : int, optional (default=1000) - Maximum number of iterations, i.e. 
cycles over all features, in inner - loop. + Maximum number of iterations in each inner loop, i.e. max number of + cycles over all features per inner loop. tol : float, optional (default=1e-4) Covergence criterion is @@ -1133,8 +1265,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, random_state : {int, RandomState instance, None}, optional (default=None) diag_fisher : boolean, optional (default=False) - 'False' calculates full fisher matrix, 'True' only diagonal matrix s.t. - fisher = X.T @ diag @ X. This saves storage but needs more + ``False`` calculates full fisher matrix, ``True`` only diagonal matrix + s.t. fisher = X.T @ diag @ X. This saves storage but needs more matrix-vector multiplications. copy_X : boolean, optional (default=True) @@ -1142,7 +1274,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, Returns ------- - coef : ndarray, shape (n_features,) + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. n_iter : numer of outer iterations = newton iterations @@ -1174,6 +1308,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, n_cycles = 0 # number of (complete) cycles over features converged = False n_samples, n_features = X.shape + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept # line search parameters (beta, sigma) = (0.5, 0.01) # some precalculations @@ -1186,16 +1321,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, d = np.zeros_like(coef) # initial stopping tolerance of inner loop # use L1-norm of minimum of norm of subgradient of F - # fp_wP2 = f'(w) + w*P2 - if P2.ndim == 1: - fp_wP2 = -score + coef * P2 - else: - # Note: P2 is symmetric and matrix @ vector is faster for sparse - # matrices. - fp_wP2 = -score + P2 @ coef - inner_tol = np.where(coef == 0, - np.sign(fp_wP2) * np.maximum(np.abs(fp_wP2) - P1, 0), - fp_wP2 + np.sign(coef) * P1) + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) inner_tol = linalg.norm(inner_tol, ord=1) # outer loop while n_iter < max_iter: @@ -1211,23 +1337,23 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, # F(w + lambda d) - F(w) <= lambda * bound # bound = sigma * (f'(w)*d + w*P2*d # +||P1 (w+d)||_1 - ||P1 w||_1) - P1w_1 = linalg.norm(P1 * coef, ord=1) + P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) + P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) # Note: coef_P2 already calculated and still valid - bound = sigma * (-(score @ d) + coef_P2 @ d + - linalg.norm(P1 * (coef + d), ord=1) - P1w_1) + bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) Fw = (0.5 * family.deviance(y, mu, weights) + - 0.5 * (coef_P2 @ coef) + P1w_1) + 0.5 * (coef_P2 @ coef[idx:]) + P1w_1) la = 1./beta for k in range(20): la *= beta # starts with la=1 coef_wd = coef + la * d - mu_wd = link.inverse(X @ coef_wd) + mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) Fwd = (0.5 * family.deviance(y, mu_wd, weights) + - linalg.norm(P1 * coef_wd, ord=1)) + linalg.norm(P1 * coef_wd[idx:], ord=1)) if P2.ndim == 1: - Fwd += 0.5 * ((coef_wd * P2) @ coef_wd) + Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) else: - Fwd += 0.5 * (coef_wd @ (P2 @ coef_wd)) + Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) if Fwd - Fw <= sigma * la * bound: break # update coefficients @@ -1238,16 +1364,10 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, coef=coef, phi=1, X=X, y=y, weights=weights, link=link, diag_fisher=diag_fisher) # stopping criterion for outer loop - # sum_i(|minimum of norm of 
subgrad of F(w)_i|) + # sum_i(|minimum-norm of subgrad of F(w)_i|) # fp_wP2 = f'(w) + w*P2 # Note: eta, mu and score are already updated - if P2.ndim == 1: - fp_wP2 = -score + coef * P2 - else: - fp_wP2 = -score + P2 @ coef # P2 is symmetric, mat @ vec is faster - mn_subgrad = np.where(coef == 0, - np.sign(fp_wP2)*np.maximum(np.abs(fp_wP2)-P1, 0), - fp_wP2 + np.sign(coef) * P1) + mn_subgrad = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) mn_subgrad = linalg.norm(mn_subgrad, ord=1) if mn_subgrad <= tol: converged = True @@ -1255,8 +1375,8 @@ def _cd_solver(coef, X, y, weights, P1, P2, family, link, # end of outer loop if not converged: warnings.warn("Coordinate descent failed to converge. Increase" - " the number of iterations (currently {0})" - .format(max_iter), ConvergenceWarning) + " the maximum number of iterations max_iter" + " (currently {0})".format(max_iter), ConvergenceWarning) return coef, n_iter, n_cycles @@ -1387,8 +1507,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where g_i is the i-th component of the gradient (derivative) of the objective function. For the cd solver, covergence is reached - when ``sum_i(|minimum of norm of g_i|)``, where g_i is the - subgradient of the objective. + when ``sum_i(|minimum-norm of g_i|)``, where g_i is the + subgradient of the objective and minimum-norm of g_i is the element of + the subgradient g_i with the smallest L2-norm. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1563,19 +1684,7 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 1. input validation # ####################################################################### - # 1.1 validate arguments of fit ####################################### - _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], - dtype=_dtype, y_numeric=True, multi_output=False, - copy=self.copy_X) - # Without converting y to float, deviance might raise - # ValueError: Integers to negative integer powers are not allowed. - # Also, y must not be sparse. - y = np.asarray(y, dtype=np.float64) - - weights = _check_weights(sample_weight, y.shape[0]) - - # 1.2 validate arguments of __init__ ################################## + # 1.1 validate arguments of __init__ ################################## # Guarantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): @@ -1668,25 +1777,6 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.warm_start, bool): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) - start_params = self.start_params - if isinstance(start_params, str): - if start_params not in ['irls', 'least_squares', 'zero']: - raise ValueError("The argument start_params must be 'irls', " - "'least-squares', 'zero' or an array of " - " correct length;" - " got(start_params={0})".format(start_params)) - else: - start_params = check_array(start_params, accept_sparse=False, - force_all_finite=True, ensure_2d=False, - dtype=_dtype, copy=True) - if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or - (start_params.ndim != 1)): - raise ValueError("Start values for parameters must have the" - "right length and dimension; required (length" - "={0}, ndim=1); got (length={1}, ndim={2})." 
- .format(X.shape[1] + self.fit_intercept, - start_params.shape[0], - start_params.ndim)) if self.selection not in ['cyclic', 'random']: raise ValueError("The argument selection must be 'cyclic' or " "'random'; got (selection={0})" @@ -1702,36 +1792,59 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument check_input must be bool; got " "(check_input={0})".format(self.check_input)) + family = self._family_instance + link = self._link_instance + + # 1.2 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + if solver == 'cd': + _stype = ['csc'] + else: + _stype = ['csc', 'csr'] + X, y = check_X_y(X, y, accept_sparse=_stype, + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) + + weights = _check_weights(sample_weight, y.shape[0]) + + n_samples, n_features = X.shape + + # 1.3 arguments to take special care ################################## + # P1, P2, start_params if isinstance(self.P1, str) and self.P1 == 'identity': - P1 = np.ones(X.shape[1]) + P1 = np.ones(n_features) else: P1 = np.atleast_1d(self.P1) try: - P1 = P1.astype(np.float64, casting='safe', copy=True) + P1 = P1.astype(np.float64, casting='safe', copy=False) except TypeError: raise TypeError("The given P1 cannot be converted to a numeric" "array; got (P1.dtype={0})." .format(P1.dtype)) - if (P1.ndim != 1) or (P1.shape[0] != X.shape[1]): + if (P1.ndim != 1) or (P1.shape[0] != n_features): raise ValueError("P1 must be either 'identity' or a 1d array " "with the length of X.shape[1]; " "got (P1.shape[0]={0}), " "needed (X.shape[1]={1})." - .format(P1.shape[0], X.shape[1])) + .format(P1.shape[0], n_features)) # If X is sparse, make P2 sparse, too. 
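For readers skimming the P1/P2 handling in this hunk: with the defaults P1='identity' and P2='identity', the penalties reduce to a plain elastic net on the non-intercept coefficients. A hedged sketch of the resulting objective, using the normal family with identity link for concreteness (function and variable names are hypothetical):

import numpy as np

def objective(coef, X, y, alpha, l1_ratio):
    """0.5 * mean deviance (normal family) + elastic net penalty on coef[1:]."""
    mu = coef[0] + X @ coef[1:]                        # identity link, coef[0] = intercept
    dev = np.sum((y - mu) ** 2)                        # normal deviance
    P1 = alpha * l1_ratio * np.ones(X.shape[1])        # 'identity' P1 scaled by l1
    P2 = alpha * (1 - l1_ratio) * np.ones(X.shape[1])  # 'identity' P2 (diagonal) scaled by l2
    return (0.5 * dev / X.shape[0]
            + 0.5 * coef[1:] @ (P2 * coef[1:])
            + np.sum(P1 * np.abs(coef[1:])))

X = np.array([[0., 1.], [1., 1.], [2., 0.]])
y = np.array([0.5, 1.0, 1.5])
print(objective(np.array([0.1, 0.4, -0.2]), X, y, alpha=0.1, l1_ratio=0.5))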
if isinstance(self.P2, str) and self.P2 == 'identity': if sparse.issparse(X): - P2 = (sparse.dia_matrix((np.ones(X.shape[1]), 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + P2 = (sparse.dia_matrix((np.ones(n_features), 0), + shape=(n_features, n_features))).tocsc() else: - P2 = np.ones(X.shape[1]) + P2 = np.ones(n_features) else: P2 = check_array(self.P2, copy=True, - accept_sparse=['csr', 'csc'], + accept_sparse=_stype, dtype=_dtype, ensure_2d=False) if P2.ndim == 1: P2 = np.asarray(P2) - if P2.shape[0] != X.shape[1]: + if P2.shape[0] != n_features: raise ValueError("P2 should be a 1d array of shape " "(n_features,) with " "n_features=X.shape[1]; " @@ -1739,12 +1852,12 @@ def fit(self, X, y, sample_weight=None): .format(P2.shape[0], X.shape[1])) if sparse.issparse(X): P2 = (sparse.dia_matrix((P2, 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + shape=(n_features, n_features))).tocsc() elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and P2.shape[0] == X.shape[1]): if sparse.issparse(X): P2 = (sparse.dia_matrix((P2, 0), - shape=(X.shape[1], X.shape[1]))).tocsr() + shape=(n_features, n_features))).tocsc() else: raise ValueError("P2 must be either None or an array of shape " "(n_features, n_features) with " @@ -1752,37 +1865,31 @@ def fit(self, X, y, sample_weight=None): "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) - family = self._family_instance - link = self._link_instance - - if self.fit_intercept: - # Note: intercept is first column <=> coef[0] is for intecept - if sparse.issparse(X): - Xnew = sparse.hstack([np.ones([X.shape[0], 1]), X], - format=X.format) - else: - Xnew = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1) - P1 = np.concatenate((np.array([0]), P1)) - if P2.ndim == 1: - P2 = np.concatenate((np.array([0]), P2)) - elif sparse.issparse(P2): - P2 = sparse.block_diag((sparse.dia_matrix((1, 1)), P2), - format=P2.format, - dtype=P2.dtype).tocsr() - else: - # as of numpy 1.13 this would work: - # P2 = np.block([[np.zeros((1, 1)), np.zeros((1, X.shape[1]))], - # [np.zeros((X.shape[1], 1)), P2]]) - P2 = np.hstack((np.zeros((X.shape[1], 1)), P2)) - P2 = np.vstack((np.zeros((1, X.shape[1]+1)), P2)) + start_params = self.start_params + if isinstance(start_params, str): + if start_params not in ['irls', 'least_squares', 'zero']: + raise ValueError("The argument start_params must be 'irls', " + "'least-squares', 'zero' or an array of " + " correct length;" + " got(start_params={0})".format(start_params)) else: - Xnew = X + start_params = check_array(start_params, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): + raise ValueError("Start values for parameters must have the" + "right length and dimension; required (length" + "={0}, ndim=1); got (length={1}, ndim={2})." + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0], + start_params.ndim)) - n_samples, n_features = Xnew.shape l1 = self.alpha * self.l1_ratio l2 = self.alpha * (1 - self.l1_ratio) - P1 *= l1 - P2 *= l2 + # P1 and P2 are now for sure copies + P1 = l1 * P1 + P2 = l2 * P2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') # reason: w' P2 w = (w' P2 w)', i.e. 
it is symmetric if P2.ndim == 2: @@ -1791,14 +1898,12 @@ def fit(self, X, y, sample_weight=None): else: P2 = 0.5 * (P2 + P2.T) - # 1.3 additional validations ########################################## + # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) - if not np.all(weights >= 0): - raise ValueError("Sample weights must be non-negative.") # check if P1 has only non-negative values, negative values might # indicate group lasso in the future. if not isinstance(self.P1, str): # if self.P1 != 'identity': @@ -1830,7 +1935,7 @@ def fit(self, X, y, sample_weight=None): else: if not np.all(linalg.eigvalsh(P2) >= epsneg): raise ValueError("P2 must be positive semi-definite.") - # TODO: if alpha=0 check that Xnew is not rank deficient + # TODO: if alpha=0 check that X is not rank deficient # TODO: what else to check? ####################################################################### @@ -1874,13 +1979,14 @@ def fit(self, X, y, sample_weight=None): z = eta + (y-mu)/hp # solve A*coef = b # A = X' W X + l2 P2, b = X' W z - coef = _irls_step(Xnew, W, P2, z) + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) elif start_params == 'least_squares': # less restrictive tolerance for finding start values tol = np.max([self.tol, np.sqrt(self.tol)]) if self.alpha == 0: reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ elif self.l1_ratio <= 0.01: # ElasticNet says l1_ratio <= 0.01 is not reliable @@ -1888,19 +1994,21 @@ def fit(self, X, y, sample_weight=None): # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 reg = Ridge(copy_X=True, fit_intercept=False, alpha=self.alpha*n_samples, tol=tol) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ else: # TODO: Does this make sense at all? 
reg = ElasticNet(copy_X=True, fit_intercept=False, alpha=self.alpha, l1_ratio=self.l1_ratio, tol=tol) - reg.fit(Xnew, link.link(y)) + reg.fit(X, link.link(y)) coef = reg.coef_ else: # start_params == 'zero' - coef = np.zeros(n_features) if self.fit_intercept: + coef = np.zeros(n_features+1) coef[0] = link.link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) else: # assign given array as start values coef = start_params @@ -1915,24 +2023,28 @@ def fit(self, X, y, sample_weight=None): # Note: we already symmetriezed P2 = 1/2 (P2 + P2') if solver == 'irls': coef, self.n_iter_ = \ - _irls_solver(coef=coef, X=Xnew, y=y, weights=weights, P2=P2, - family=family, link=link, max_iter=self.max_iter, - tol=self.tol) + _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, + fit_intercept=self.fit_intercept, family=family, + link=link, max_iter=self.max_iter, tol=self.tol) # 4.2 L-BFGS ########################################################## elif solver == 'lbfgs': def func(coef, X, y, weights, P2, family, link): - mu, dev, devp = \ + mu, devp = \ family._mu_deviance_derivative(coef, X, y, weights, link) + dev = family.deviance(y, mu, weights) + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - obj = 0.5 * dev + 0.5 * (coef @ L2) - objp = 0.5 * devp + L2 + L2 = P2 @ coef[idx:] + obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) + objp = 0.5 * devp + objp[idx:] += L2 return obj, objp - args = (Xnew, y, weights, P2, family, link) + args = (X, y, weights, P2, family, link) coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=None, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, @@ -1952,50 +2064,66 @@ def func(coef, X, y, weights, P2, family, link): # precisely, expected hessian of deviance. 
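The func used by the lbfgs branch above returns half the deviance plus half the quadratic penalty together with its gradient, with the intercept excluded from the penalty. A small self-contained check of that pattern, assuming the normal family with identity link so that the deviance is a plain sum of squares (threshold and names are illustrative):

import numpy as np
from scipy.optimize import check_grad

rng = np.random.RandomState(0)
X = rng.randn(20, 3)
y = rng.randn(20)
P2 = np.array([1.0, 2.0, 3.0])      # diagonal L2 penalty, no intercept entry

def obj(coef):
    mu = coef[0] + X @ coef[1:]
    return 0.5 * np.sum((y - mu) ** 2) + 0.5 * coef[1:] @ (P2 * coef[1:])

def grad(coef):
    mu = coef[0] + X @ coef[1:]
    devp = -2 * np.concatenate(([np.sum(y - mu)], (y - mu) @ X))
    g = 0.5 * devp
    g[1:] += P2 * coef[1:]
    return g

assert check_grad(obj, grad, rng.randn(4)) < 1e-4   # analytic and numeric gradients agree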
elif solver == 'newton-cg': def func(coef, X, y, weights, P2, family, link): + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = coef @ (P2 * coef) + L2 = coef[idx:] @ (P2 * coef[idx:]) else: - L2 = coef @ (P2 @ coef) - mu = link.inverse(X @ coef) + L2 = coef[idx:] @ (P2 @ coef[idx:]) + mu = link.inverse(_safe_lin_pred(X, coef)) return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 def grad(coef, X, y, weights, P2, family, link): + mu, devp = \ + family._mu_deviance_derivative(coef, X, y, weights, link) + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - eta = X @ coef - mu = link.inverse(eta) - d1 = link.inverse_derivative(eta) - grad = X.T @ (d1 * family.deviance_derivative(y, mu, weights)) - return 0.5 * grad + L2 + L2 = P2 @ coef[idx:] + objp = 0.5 * devp + objp[idx:] += L2 + return objp def grad_hess(coef, X, y, weights, P2, family, link): + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept if P2.ndim == 1: - L2 = P2 * coef + L2 = P2 * coef[idx:] else: - L2 = P2 @ coef - eta = X @ coef + L2 = P2 @ coef[idx:] + eta = _safe_lin_pred(X, coef) mu = link.inverse(eta) d1 = link.inverse_derivative(eta) - grad = 0.5 * \ - (X.T @ (d1 * family.deviance_derivative(y, mu, weights))) \ - + L2 - # expected hessian = X.T @ diag_matrix @ X + temp = d1 * family.deviance_derivative(y, mu, weights) + if intercept: + grad = np.concatenate(([0.5 * temp.sum()], + 0.5 * temp @ X + L2)) + else: + grad = 0.5 * temp @ X + L2 # sampe as 0.5* X.T @ temp + L2 + + # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix diag = d1**2 / family.variance(mu, phi=1, weights=weights) - def Hs(s): - ret = 0.5 * (X.T @ (diag * (X @ s))) + def Hs(coef): + # return (0.5 * fisher + P2) @ coef + # ret = 0.5 * (X.T @ (diag * (X @ coef))) + ret = 0.5 * ((diag * (X @ coef[idx:])) @ X) if P2.ndim == 1: - ret += P2 * s + ret += P2 * coef[idx:] else: - ret += P2 @ s + ret += P2 @ coef[idx:] + if intercept: + h0i = np.concatenate(([diag.sum()], diag @ X)) + ret = np.concatenate(([0.5 * (h0i @ coef)], + ret + 0.5 * coef[0] * h0i[1:])) return ret return grad, Hs - args = (Xnew, y, weights, P2, family, link) + args = (X, y, weights, P2, family, link) coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, args=args, maxiter=self.max_iter, tol=self.tol) @@ -2007,13 +2135,14 @@ def Hs(s): elif solver == 'cd': # For coordinate descent, if X is sparse, it should be csc format # If X is sparse, P2 must also be csc - if sparse.issparse(Xnew): - Xnew = Xnew.tocsc(copy=self.copy_X) + if sparse.issparse(X): + X = X.tocsc(copy=self.copy_X) P2 = sparse.csc_matrix(P2) coef, self.n_iter_, self._n_cycles = \ - _cd_solver(coef=coef, X=Xnew, y=y, weights=weights, P1=P1, - P2=P2, family=family, link=link, + _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, + P2=P2, fit_intercept=self.fit_intercept, + family=family, link=link, max_iter=self.max_iter, tol=self.tol, selection=self.selection, random_state=random_state, diag_fisher=self.diag_fisher, copy_X=self.copy_X) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index ae8a5f4cfc5e7..cdac151b77de6 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -566,7 +566,7 @@ def obj(coef): pd = PoissonDistribution() link = 
LogLink() N = y.shape[0] - mu = link.inverse(X @ coef[1:]+coef[0]) + mu = link.inverse(X @ coef[1:] + coef[0]) alpha, l1_ratio = (1, 0.5) return 1./(2.*N) * pd.deviance(y, mu) \ + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ @@ -587,9 +587,9 @@ def obj(coef): assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) - # start_params='least_squares' with different alpha + # check warm_start, therefore start with different alpha glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, - family='poisson', + family='poisson', max_iter=300, link='log', solver='cd', tol=1e-5, start_params='zero') glm.fit(X, y) From c9a7a95e89deaadc40b92edd8e1208d550998a72 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 21 Apr 2019 17:03:07 +0200 Subject: [PATCH 051/269] Revised option start_params * renamed option irls into guess * removed option least_squares * updated tests --- doc/modules/linear_model.rst | 12 +- sklearn/linear_model/glm.py | 223 ++++++++++++++----------- sklearn/linear_model/tests/test_glm.py | 140 ++++++++-------- 3 files changed, 207 insertions(+), 168 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index e60e9e84a4747..4bede17af581a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -925,12 +925,12 @@ follows: >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, - check_input=True, copy_X=True, family='poisson', - fit_dispersion=None, fit_intercept=True, l1_ratio=0, - link='log', max_iter=100, random_state=None, - selection='cyclic', solver='auto', - start_params='irls', tol=0.0001, verbose=0, - warm_start=False) + check_input=True, copy_X=True, diag_fisher=False, + family='poisson', fit_dispersion=None, + fit_intercept=True, l1_ratio=0, link='log', + max_iter=100, random_state=None, selection='cyclic', + solver='auto', start_params='guess', tol=0.0001, + verbose=0, warm_start=False) >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE array([0.24630169, 0.43373464]) >>> reg.intercept_ #doctest: +ELLIPSIS diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index 2afd1ddf8c79c..a53cc39ecd307 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -48,9 +48,6 @@ import scipy.sparse.linalg as splinalg from scipy.optimize import fmin_l_bfgs_b import warnings -from .base import LinearRegression -from .coordinate_descent import ElasticNet -from .ridge import Ridge from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y @@ -93,6 +90,14 @@ def _safe_lin_pred(X, coef): return X @ coef +def _safe_toarray(X): + """Returns a numpy array.""" + if sparse.issparse(X): + return X.toarray() + else: + return np.asarray(X) + + def _safe_sandwich_dot(X, d, intercept=False): """Compute sandwich product X.T @ diag(d) @ X. @@ -100,7 +105,9 @@ def _safe_sandwich_dot(X, d, intercept=False): first column of X. X can be sparse, d must be an ndarray. 
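A throwaway check (not part of the patch) that the dense and sparse formulations of this sandwich product agree; on older scipy/numpy the sparse product may come back as np.matrix, which is why the result is funnelled through _safe_toarray in the code:

import numpy as np
from scipy import sparse

rng = np.random.RandomState(0)
X = rng.randn(10, 4)
d = rng.rand(10)

dense = (X.T * d) @ X                                  # dense path
Xs = sparse.csc_matrix(X)
sp = Xs.transpose() @ Xs.multiply(d[:, np.newaxis])    # sparse path
sp = sp.toarray() if sparse.issparse(sp) else np.asarray(sp)
assert np.allclose(dense, sp)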
Always returns a ndarray.""" if sparse.issparse(X): - temp = (X.transpose().multiply(d) @ X).toarray() + temp = (X.transpose() @ X.multiply(d[:, np.newaxis])) + # for older versions of numpy and scipy, temp may be a np.matrix + temp = _safe_toarray(temp) else: temp = (X.T * d) @ X if intercept: @@ -945,7 +952,8 @@ def _irls_step(X, W, P2, z, fit_intercept=True): else: if sparse.issparse(X): XtW = X.transpose().multiply(W) - A = (XtW @ X).toarray() + # for older versions of numpy and scipy, A may be a np.matrix + A = _safe_toarray(XtW @ X) else: XtW = (X.T * W) A = XtW @ X @@ -1105,8 +1113,9 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, if intercept: Bj[0] = fisher.sum() if sparse.issparse(X): - Bj[idx:] = (X[:, j].transpose().multiply(fisher) @ X - ).toarray().ravel() + Bj[idx:] = _safe_toarray(X[:, j].transpose() @ + X.multiply(fisher[:, np.newaxis]) + ).ravel() else: Bj[idx:] = (fisher * X[:, j]) @ X @@ -1477,27 +1486,32 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statisic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'newton-cg', 'lbfgs', 'cd'}, \ + solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ optional (default='auto') Algorithm to use in the optimization problem: 'auto' Sets 'irls' if l1_ratio equals 0, else 'cd'. + 'cd' + Coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. Note that in order to avoid unnecessary memory + duplication of X in the ``fit`` method, X should be directly passed + as a Fortran-contiguous numpy array or sparse csc matrix. + 'irls' - Iterated reweighted least squares (with Fisher scoring). + Iterated reweighted least squares. It is the standard algorithm for GLMs. It cannot deal with L1 penalties. + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. It cannot deal with L1 penalties. + 'newton-cg', 'lbfgs' - Cannot deal with L1 penalties. + Newton conjugate gradient algorithm cannot deal with L1 penalties. - 'cd' - Coordinate descent algorithm. It can deal with L1 as well as L2 - penalties. Note that in order to avoid unnecessary memory - duplication of the X argument in the ``fit`` method, X should be - directly passed as a Fortran-contiguous numpy array or sparse csc - matrix. + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -1505,11 +1519,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, optional (default=1e-4) Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` - where g_i is the i-th component of the gradient (derivative) of + where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. For the cd solver, covergence is reached - when ``sum_i(|minimum-norm of g_i|)``, where g_i is the - subgradient of the objective and minimum-norm of g_i is the element of - the subgradient g_i with the smallest L2-norm. + when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the + subgradient of the objective and minimum-norm of ``g_i`` is the element + of the subgradient ``g_i`` with the smallest L2-norm. 
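The "minimum-norm of g_i" in the tol description refers to the element of the subgradient with the smallest absolute value, which is what _min_norm_sugrad computes. A compact sketch without the P2 term (names hypothetical) showing that it vanishes at the optimum of a one-dimensional lasso problem:

import numpy as np

def min_norm_subgrad(coef, grad, P1):
    """Element-wise minimum-norm subgradient of f(w) + ||P1 * w||_1."""
    return np.where(coef == 0,
                    np.sign(grad) * np.maximum(np.abs(grad) - P1, 0),
                    grad + np.sign(coef) * P1)

# f(w) = 0.5 * (w - 3)**2 with penalty P1 = 1: optimum is w* = 2, where f'(w*) = -1
P1 = np.array([1.0])
w_opt = np.array([2.0])
print(min_norm_subgrad(w_opt, w_opt - 3.0, P1))   # [0.] -> stopping criterion satisfied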
warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` @@ -1518,23 +1532,21 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): does not exit (first call to ``fit``), option ``start_params`` sets the start values for ``coef_`` and ``intercept_``. - start_params : {'irls', 'least_squares', 'zero', array of shape \ - (n_features*, )}, optional (default='irls') + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') Relevant only if ``warm_start=False`` or if fit is called the first time (``self.coef_`` does not yet exist). - 'irls' + 'guess' Start values of mu are calculated by family.starting_mu(..). Then, - one step of irls obtains start values for ``coef_``. This gives - usually good results. - - 'least_squares' - Start values for ``coef_`` are obtained by a least squares fit in the - link space (y is transformed to the space of the linear predictor). + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step, else the Newton step is + calculated by the cd solver. + This gives usually good starting values. 'zero' All coefficients are set to zero. If ``fit_intercept=True``, the - start value for the intercept is obtained by the average of y. + start value for the intercept is obtained by the weighted average of y. array The array of size n_features* is directly used as start values @@ -1560,17 +1572,17 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): RandomState instance used by `np.random`. Used when ``selection`` == 'random'. - diag_fisher : boolean, (default=False) - Only relevant for solver 'cd'. If ``False``, the full Fisher matrix - (expected Hessian) is computed in each outer iteretion (Newton - iteration). If ``True``, only a diagonal matrix (stored as 1d array) is - computed, such that fisher = X.T @ diag @ X. This saves memory and - matrix-matrix multiplications, but needs more matrix-vector - multiplications. If you use large sparse X or if you have many - features, i.e. n_features >> n_samples, you might set this option to - ``True``. - - copy_X : boolean, optional, default True + diag_fisher : boolean, optional, (default=False) + Only relevant for solver 'cd' (see also ``start_params='guess'``). + If ``False``, the full Fisher matrix (expected Hessian) is computed in + each outer iteration (Newton iteration). If ``True``, only a diagonal + matrix (stored as 1d array) is computed, such that + fisher = X.T @ diag @ X. This saves memory and matrix-matrix + multiplications, but needs more matrix-vector multiplications. If you + use large sparse X or if you have many features, + i.e. n_features >> n_samples, you might set this option to ``True``. + + copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. 
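To illustrate the diag_fisher trade-off described a few lines above: keeping only the diagonal d is enough for Fisher-vector products, at the cost of extra matrix-vector multiplications instead of one stored matrix. A minimal standalone check (not the estimator's code path):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(50, 6)
d = rng.rand(50)          # diagonal such that fisher = X.T @ diag(d) @ X
v = rng.randn(6)

full = (X.T * d) @ X                                 # full (6, 6) fisher matrix
assert np.allclose(full @ v, X.T @ (d * (X @ v)))    # same product without forming it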
check_input : boolean, optional (default=True) @@ -1634,7 +1646,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params='irls', + tol=1e-4, warm_start=False, start_params='guess', selection='cyclic', random_state=None, diag_fisher=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha @@ -1867,11 +1879,10 @@ def fit(self, X, y, sample_weight=None): start_params = self.start_params if isinstance(start_params, str): - if start_params not in ['irls', 'least_squares', 'zero']: - raise ValueError("The argument start_params must be 'irls', " - "'least-squares', 'zero' or an array of " - " correct length;" - " got(start_params={0})".format(start_params)) + if start_params not in ['guess', 'zero']: + raise ValueError("The argument start_params must be 'guess', " + "'zero' or an array of correct length; " + "got(start_params={0})".format(start_params)) else: start_params = check_array(start_params, accept_sparse=False, force_all_finite=True, ensure_2d=False, @@ -1894,10 +1905,17 @@ def fit(self, X, y, sample_weight=None): # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric if P2.ndim == 2: if sparse.issparse(P2): - P2 = 0.5 * (P2 + P2.transpose()) + if sparse.isspmatrix_csc(P2): + P2 = 0.5 * (P2 + P2.transpose()).tocsc() + else: + P2 = 0.5 * (P2 + P2.transpose()).tocsr() else: P2 = 0.5 * (P2 + P2.T) + # For coordinate descent, if X is sparse, P2 must also be csc + if solver == 'cd' and sparse.issparse(X): + P2 = sparse.csc_matrix(P2) + # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): @@ -1964,45 +1982,63 @@ def fit(self, X, y, sample_weight=None): else: coef = self.coef_ elif isinstance(start_params, str): - if start_params == 'irls': - # See 3.1 IRLS - # Use mu_start and apply one irls step to calculate coef + if start_params == 'guess': + # Set mu=starting_mu of the family and do one Newton step + # If solver=cd use cd, else irls mu = family.starting_mu(y, weights=weights) - # linear predictor - eta = link.link(mu) - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = (hp**2 / family.variance(mu, phi=1, weights=weights)) - # working observations - z = eta + (y-mu)/hp - # solve A*coef = b - # A = X' W X + l2 P2, b = X' W z - coef = _irls_step(X, W, P2, z, - fit_intercept=self.fit_intercept) - elif start_params == 'least_squares': - # less restrictive tolerance for finding start values - tol = np.max([self.tol, np.sqrt(self.tol)]) - if self.alpha == 0: - reg = LinearRegression(copy_X=True, fit_intercept=False) - reg.fit(X, link.link(y)) - coef = reg.coef_ - elif self.l1_ratio <= 0.01: - # ElasticNet says l1_ratio <= 0.01 is not reliable - # => use Ridge - # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - reg = Ridge(copy_X=True, fit_intercept=False, - alpha=self.alpha*n_samples, tol=tol) - reg.fit(X, link.link(y)) - coef = reg.coef_ + eta = link.link(mu) # linear predictor + if solver in ['cd', 'lbfgs', 'newton-cg']: + # see function _cd_solver + sigma_inv = 1/family.variance(mu, phi=1, weights=weights) + d1 = link.inverse_derivative(eta) + temp = sigma_inv * d1 * (y - mu) + if self.fit_intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as 
X.T @ temp + + d2_sigma_inv = d1 * d1 * sigma_inv + diag_fisher = self.diag_fisher + if diag_fisher: + fisher = d2_sigma_inv + else: + fisher = \ + _safe_sandwich_dot(X, d2_sigma_inv, + intercept=self.fit_intercept) + # set up space for search direction d for inner loop + if self.fit_intercept: + coef = np.zeros(n_features+1) + else: + coef = np.zeros(n_features) + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # use less restrictive tolerance for initial guess + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, + P1=P1) + inner_tol = 4 * linalg.norm(inner_tol, ord=1) + # just one outer loop = Newton step + n_cycles = 0 + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, + inner_tol, max_inner_iter=1000, + selection=self.selection, + random_state=self.random_state, + diag_fisher=self.diag_fisher) + coef += d # for simplicity no line search here else: - # TODO: Does this make sense at all? - reg = ElasticNet(copy_X=True, fit_intercept=False, - alpha=self.alpha, l1_ratio=self.l1_ratio, - tol=tol) - reg.fit(X, link.link(y)) - coef = reg.coef_ + # See _irls_solver + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) else: # start_params == 'zero' if self.fit_intercept: coef = np.zeros(n_features+1) @@ -2048,7 +2084,7 @@ def func(coef, X, y, weights, P2, family, link): coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=None, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter) + maxiter=self.max_iter, factr=1e3) if self.verbose > 0: if info["warnflag"] == 1: warnings.warn("lbfgs failed to converge." 
@@ -2106,6 +2142,8 @@ def grad_hess(coef, X, y, weights, P2, family, link): # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix diag = d1**2 / family.variance(mu, phi=1, weights=weights) + if intercept: + h0i = np.concatenate(([diag.sum()], diag @ X)) def Hs(coef): # return (0.5 * fisher + P2) @ coef @@ -2116,7 +2154,6 @@ def Hs(coef): else: ret += P2 @ coef[idx:] if intercept: - h0i = np.concatenate(([diag.sum()], diag @ X)) ret = np.concatenate(([0.5 * (h0i @ coef)], ret + 0.5 * coef[0] * h0i[1:])) return ret @@ -2124,21 +2161,15 @@ def Hs(coef): return grad, Hs args = (X, y, weights, P2, family, link) - coef, n_iter_i = newton_cg(grad_hess, func, grad, coef, - args=args, maxiter=self.max_iter, - tol=self.tol) + coef, self.n_iter_ = newton_cg(grad_hess, func, grad, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) # 4.4 coordinate descent ############################################## # Note: we already set P1 = l1*P1, see above # Note: we already set P2 = l2*P2, see above # Note: we already symmetriezed P2 = 1/2 (P2 + P2') elif solver == 'cd': - # For coordinate descent, if X is sparse, it should be csc format - # If X is sparse, P2 must also be csc - if sparse.issparse(X): - X = X.tocsc(copy=self.copy_X) - P2 = sparse.csc_matrix(P2) - coef, self.n_iter_, self._n_cycles = \ _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, P2=P2, fit_intercept=self.fit_intercept, diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index cdac151b77de6..17535c067bbb6 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -4,7 +4,7 @@ import scipy as sp from scipy import linalg, optimize, sparse -from sklearn.datasets import make_classification +from sklearn.datasets import make_classification, make_regression from sklearn.linear_model.glm import ( Link, IdentityLink, @@ -22,10 +22,12 @@ assert_array_equal, assert_array_almost_equal) +rng = np.random.RandomState(42) + + @pytest.mark.parametrize('link', Link.__subclasses__()) def test_link_properties(link): """Test link inverse and derivative.""" - rng = np.random.RandomState(0) x = rng.rand(100)*100 link = link() # instatiate object decimal = 10 @@ -86,7 +88,6 @@ def test_deviance_zero(family, chk_values): def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. 
Trick: Use numerical differentiation with y = mu""" - rng = np.random.RandomState(0) coef = np.array([-2, 1, 0, 1, 2.5]) phi = 0.5 X = rng.randn(10, 5) @@ -218,7 +219,6 @@ def test_glm_P2_argument(P2): def test_glm_P2_positive_semidefinite(): """Test GLM for a positive semi-definite P2 argument.""" n_samples, n_features = 10, 5 - rng = np.random.RandomState(42) y = np.arange(n_samples) X = np.zeros((n_samples, n_features)) P2 = np.diag([100, 10, 5, 0, -1E-5]) @@ -351,20 +351,15 @@ def test_glm_check_input_argument(check_input): glm.fit(X, y) -@pytest.mark.parametrize( - 'family', - [NormalDistribution(), PoissonDistribution(), - GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecant()]) @pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_identiy_regression(family, solver): +def test_glm_identiy_regression(solver): """Test GLM regression with identity link on a simple dataset.""" coef = [1, 2] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) - glm = GeneralizedLinearRegressor(alpha=0, family=family, link='identity', - fit_intercept=False, solver=solver) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=False, solver=solver, + start_params='zero', tol=1e-7) res = glm.fit(X, y) assert_array_almost_equal(res.coef_, coef) @@ -375,34 +370,42 @@ def test_glm_identiy_regression(family, solver): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecant()]) -@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_log_regression(family, solver): +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 6), + ('newton-cg', 1e-7, 6), + ('cd', 1e-7, 6)]) +def test_glm_log_regression(family, solver, tol, dec): """Test GLM regression with log link on a simple dataset.""" - coef = [1, 2] + coef = [0.2, -0.1] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( alpha=0, family=family, link='log', fit_intercept=False, - solver=solver, start_params='least_squares') + solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + assert_array_almost_equal(res.coef_, coef, decimal=dec) @pytest.mark.filterwarnings('ignore::DeprecationWarning') -@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_normal_ridge(solver): +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 5), + ('newton-cg', 1e-6, 5), + ('cd', 1e-6, 6)]) +def test_normal_ridge(solver, tol, dec): """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. """ - rng = np.random.RandomState(0) alpha = 1.0 # 1. 
With more samples than features - n_samples, n_features, n_predict = 10, 5, 10 - y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - T = rng.randn(n_predict, n_features) + n_samples, n_features, n_predict = 100, 7, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, @@ -410,69 +413,74 @@ def test_normal_ridge(solver): ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, - tol=1e-6, max_iter=100, solver=solver, - random_state=42) + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=False, - tol=1e-6, max_iter=100, solver=solver, - random_state=42, fit_dispersion='chisqr') + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng, + fit_dispersion='chisqr') glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) mu = glm.predict(X) assert_almost_equal(glm.dispersion_, np.sum((y-mu)**2/(n_samples-n_features))) # 2. 
With more features than samples and sparse - n_samples, n_features, n_predict = 5, 10, 10 - y = rng.randn(n_samples) - X = sparse.csr_matrix(rng.randn(n_samples, n_features)) - T = sparse.csr_matrix(rng.randn(n_predict, n_features)) + n_samples, n_features, n_predict = 10, 100, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, solver='sag', normalize=False, max_iter=100000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-8, - family='normal', link='identity', - fit_intercept=True, solver=solver, - max_iter=300, random_state=42) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=300, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=5) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=5) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=5) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, solver='sag', normalize=False, max_iter=1000) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, tol=1e-7, - family='normal', link='identity', - fit_intercept=False, solver=solver) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol*2, max_iter=300, solver=solver, + check_input=False, random_state=rng) glm.fit(X, y) assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_) - assert_array_almost_equal(glm.predict(T), ridge.predict(T)) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) -@pytest.mark.parametrize('solver, decimal, tol', - [('irls', 7, 1e-8), - ('lbfgs', 5, 1e-7), - ('newton-cg', 5, 1e-7), - ('cd', 7, 1e-8)]) -def test_poisson_ridge(solver, decimal, tol): +@pytest.mark.parametrize('solver, tol, dec', + [('irls', 1e-7, 6), + ('lbfgs', 1e-7, 5), + ('newton-cg', 1e-7, 5), + ('cd', 1e-7, 7)]) +def test_poisson_ridge(solver, tol, dec): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" @@ -493,18 +501,17 @@ def test_poisson_ridge(solver, decimal, tol): fit_intercept=True, family='poisson', link='log', tol=tol, solver=solver, max_iter=300, - random_state=42) + random_state=rng) glm.fit(X, y) assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=decimal) + decimal=dec) assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=decimal) + decimal=dec) @pytest.mark.parametrize('diag_fisher', [False, True]) def test_normal_enet(diag_fisher): """Test elastic net regression with normal/gaussian family.""" - rng = np.random.RandomState(0) alpha, l1_ratio = 0.3, 0.7 n_samples, n_features = 20, 2 X = rng.randn(n_samples, n_features).copy(order='F') @@ -556,7 +563,8 @@ def test_poisson_enet(): y = np.array([0, 1, 1, 2]) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', link='log', solver='cd', tol=1e-8, - selection='random', random_state=42) + selection='random', random_state=rng, + start_params='guess') glm.fit(X, y) assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) @@ -591,7 +599,7 @@ def obj(coef): glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, family='poisson', max_iter=300, link='log', solver='cd', tol=1e-5, - start_params='zero') + selection='cyclic', start_params='zero') glm.fit(X, y) # warm start with original alpha and use of sparse matrices glm.warm_start = True @@ -612,9 +620,9 @@ def test_binomial_enet(alpha): n_samples = 500 X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, n_informative=5, n_redundant=0, n_repeated=0, - random_state=0) + random_state=rng) log = LogisticRegression( - penalty='elasticnet', random_state=0, fit_intercept=False, tol=1e-6, + penalty='elasticnet', random_state=rng, fit_intercept=False, tol=1e-6, max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), solver='saga') log.fit(X, y) From a7755de2cdc5022b6d41285730ed12bd41628e66 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 17:34:11 +0200 Subject: [PATCH 052/269] Fix a few typos --- doc/modules/linear_model.rst | 2 +- .../plot_poisson_spline_regression.py | 2 +- sklearn/linear_model/glm.py | 53 ++++++++++--------- sklearn/linear_model/tests/test_glm.py | 4 +- 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 4bede17af581a..8f17c67d950de 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -887,7 +887,7 @@ combination of the input variables :math:`X` via an inverse link function .. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [11]_. The objective function beeing minimized +exponential dispersion model (EDM) [11]_. The objective function being minimized becomes .. 
math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py index fce85fae1ea8c..30b5881bba1f5 100644 --- a/examples/linear_model/plot_poisson_spline_regression.py +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -5,7 +5,7 @@ As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` example, a Poisson regression with penalized B-splines (P-splines) [1]_ is -fitted on slightly different sinusodial, Poisson distributed data and +fitted on slightly different sinusoidal, Poisson distributed data and compared to an AdaBoost model with decision trees. One can see, that this is a hard problem for both estimators. diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py index a53cc39ecd307..a35c68828de81 100644 --- a/sklearn/linear_model/glm.py +++ b/sklearn/linear_model/glm.py @@ -29,7 +29,7 @@ # L2: w*P2*w with P2 a (semi-) positive definite matrix, e.g. P2 could be # a 1st or 2nd order difference matrix (compare B-spline penalties and # Tikhonov regularization). -# - The link funtion (instance of class Link) is necessary for the evaluation +# - The link function (instance of class Link) is necessary for the evaluation # of deviance, score, Fisher and Hessian matrix as functions of the # coefficients, which is needed by optimizers. # Solution: link as argument in those functions @@ -170,7 +170,7 @@ def _min_norm_sugrad(coef, grad, P2, P1): class Link(metaclass=ABCMeta): - """Abstract base class for Link funtions.""" + """Abstract base class for Link functions.""" @abstractmethod def link(self, mu): @@ -201,7 +201,7 @@ def derivative(self, mu): def inverse(self, lin_pred): """Compute the inverse link function h(lin_pred). - Gives the inverse relationship between linkear predictor and the mean + Gives the inverse relationship between linker predictor and the mean mu=E[Y], i.e. h(linear predictor) = mu. Parameters @@ -357,7 +357,7 @@ def include_lower_bound(self): @property def include_upper_bound(self): - """Get True if upper bound for y is includede: y <= upper_bound.""" + """Get True if upper bound for y is included: y <= upper_bound.""" return self._include_upper_bound def in_y_range(self, x): @@ -859,7 +859,7 @@ def __init__(self): class GeneralizedHyperbolicSecant(ExponentialDispersionModel): """A class for the Generalized Hyperbolic Secant (GHS) distribution. - The GHS distribution is for tagets y in (-inf, inf). + The GHS distribution is for targets y in (-inf, inf). """ def __init__(self): self._lower_bound = -np.Inf @@ -881,7 +881,7 @@ def unit_deviance(self, y, mu): class BinomialDistribution(ExponentialDispersionModel): """A class for the Binomial distribution. - The Binomial distribution is for tagets y in [0, 1]. + The Binomial distribution is for targets y in [0, 1]. """ def __init__(self): self._lower_bound = 0 @@ -1131,7 +1131,7 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, else: b = B[jdx, jdx] - # those ten lines aree what it is all about + # those ten lines are what it is all about if b <= 0: z = 0 elif P1[j] == 0: @@ -1199,7 +1199,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, diag_fisher=False, copy_X=True): """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. 
- The objective beeing minimized in the coefficients w=coef is:: + The objective being minimized in the coefficients w=coef is:: F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 @@ -1207,7 +1207,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, 1. Find optimal descent direction d by minimizing min_d F(w+d) = min_d F(w+d) - F(w) - 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + 2. Quadratic approximation of F(w+d)-F(w) = q(d): using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + ||P1*(w+d)||_1 - ||P1*w||_1 @@ -1228,7 +1228,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, ---------- coef : ndarray, shape (c,) If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shapee[1] + 1. + If fit_intercept=True, then c=X.shape[1] + 1. X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) Training data (with intercept included if present). If not sparse, @@ -1265,7 +1265,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, cycles over all features per inner loop. tol : float, optional (default=1e-4) - Covergence criterion is + Convergence criterion is sum_i(|minimum of norm of subgrad of objective_i|)<=tol. selection : str, optional (default='cyclic') @@ -1285,9 +1285,9 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, ------- coef : ndarray, shape (c,) If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shapee[1] + 1. + If fit_intercept=True, then c=X.shape[1] + 1. - n_iter : numer of outer iterations = newton iterations + n_iter : number of outer iterations = newton iterations n_cycles : number of cycles over features @@ -1312,7 +1312,7 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, "format. Got P2 not sparse.") random_state = check_random_state(random_state) # Note: we already set P2 = l2*P2, P1 = l1*P1 - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') n_iter = 0 # number of outer iterations n_cycles = 0 # number of (complete) cycles over features converged = False @@ -1427,7 +1427,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Parameters ---------- alpha : float, optional (default=1) - Constant that multiplies the penalty terms und thus determines the + Constant that multiplies the penalty terms and thus determines the regularization strength. See the notes for the exact mathematical meaning of this parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this @@ -1481,9 +1481,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'logit' for family 'binomial' - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statisic or the deviance statistic. If None, the + the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ @@ -1520,7 +1520,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of - the objective function. 
For the cd solver, covergence is reached + the objective function. For the cd solver, convergence is reached when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the subgradient of the objective and minimum-norm of ``g_i`` is the element of the subgradient ``g_i`` with the smallest L2-norm. @@ -1626,7 +1626,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If the target y is a ratio, appropriate sample weights s should be provided. - As an example, consider Poission distributed counts z (integers) and + As an example, consider Poisson distributed counts z (integers) and weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite @@ -1681,7 +1681,7 @@ def fit(self, X, y, sample_weight=None): Target values. sample_weight : {None, array-like}, shape (n_samples,),\ - optinal (default=None) + optional (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has Var[Y_i]=phi/w_i * v(mu). @@ -1995,7 +1995,7 @@ def fit(self, X, y, sample_weight=None): if self.fit_intercept: score = np.concatenate(([temp.sum()], temp @ X)) else: - score = temp @ X # sampe as X.T @ temp + score = temp @ X # same as X.T @ temp d2_sigma_inv = d1 * d1 * sigma_inv diag_fisher = self.diag_fisher @@ -2051,12 +2051,12 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 4. fit # ####################################################################### - # algorithms for optimiation + # algorithms for optimization # TODO: Parallelize it? # 4.1 IRLS ############################################################ # Note: we already set P2 = l2*P2, see above - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') if solver == 'irls': coef, self.n_iter_ = \ _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, @@ -2137,7 +2137,7 @@ def grad_hess(coef, X, y, weights, P2, family, link): grad = np.concatenate(([0.5 * temp.sum()], 0.5 * temp @ X + L2)) else: - grad = 0.5 * temp @ X + L2 # sampe as 0.5* X.T @ temp + L2 + grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2 # expected hessian = fisher = X.T @ diag_matrix @ X # calculate only diag_matrix @@ -2168,7 +2168,7 @@ def Hs(coef): # 4.4 coordinate descent ############################################## # Note: we already set P1 = l1*P1, see above # Note: we already set P2 = l2*P2, see above - # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + # Note: we already symmetrized P2 = 1/2 (P2 + P2') elif solver == 'cd': coef, self.n_iter_, self._n_cycles = \ _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, @@ -2215,7 +2215,8 @@ def linear_predictor(self, X): return X @ self.coef_ + self.intercept_ def predict(self, X, sample_weight=None): - """Predict uing GLM with feature matrix X. + """Predict using GLM with feature matrix X. + If sample_weight is given, returns prediction*sample_weight. 
Parameters diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 17535c067bbb6..6172824cf1b79 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -29,7 +29,7 @@ def test_link_properties(link): """Test link inverse and derivative.""" x = rng.rand(100)*100 - link = link() # instatiate object + link = link() # instantiate object decimal = 10 if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 @@ -133,7 +133,7 @@ def test_sample_weights_validation(): with pytest.raises(ValueError): glm.fit(X, y, weights) - # 5. 1d but weith a negative value + # 5. 1d but with a negative value weights = [2, -1] with pytest.raises(ValueError): glm.fit(X, y, weights) From 9aa1fc41d726e5ac9007d9c867546aa70ea79ba7 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 17:40:08 +0200 Subject: [PATCH 053/269] Make module private --- sklearn/linear_model/__init__.py | 4 ++-- sklearn/linear_model/{glm.py => _glm.py} | 0 sklearn/linear_model/tests/test_glm.py | 9 +++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) rename sklearn/linear_model/{glm.py => _glm.py} (100%) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 0c5840f343a3a..0f7856fcc2046 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,8 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from .glm import (TweedieDistribution, - GeneralizedLinearRegressor) +from ._glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/_glm.py similarity index 100% rename from sklearn/linear_model/glm.py rename to sklearn/linear_model/_glm.py diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 6172824cf1b79..a148d11cb2632 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -1,3 +1,7 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause + import numpy as np from numpy.testing import assert_allclose import pytest @@ -5,7 +9,8 @@ from scipy import linalg, optimize, sparse from sklearn.datasets import make_classification, make_regression -from sklearn.linear_model.glm import ( +from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import ( Link, IdentityLink, LogLink, @@ -14,7 +19,7 @@ NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, GeneralizedHyperbolicSecant, BinomialDistribution, - GeneralizedLinearRegressor) +) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.utils.testing import ( From ca3eae24b4db9931eeac9fb925dba4f48199976c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 4 Jun 2019 21:35:11 +0200 Subject: [PATCH 054/269] Working on tests --- sklearn/linear_model/tests/test_glm.py | 52 +++++++++++++------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index a148d11cb2632..2c8a9c3d2c72c 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -23,7 +23,7 @@ from sklearn.linear_model import ElasticNet, 
LogisticRegression, Ridge from sklearn.utils.testing import ( - assert_equal, assert_almost_equal, + assert_almost_equal, assert_array_equal, assert_array_almost_equal) @@ -35,16 +35,14 @@ def test_link_properties(link): """Test link inverse and derivative.""" x = rng.rand(100)*100 link = link() # instantiate object - decimal = 10 if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 # limit max eta to 15 x = x / 100 * 15 - decimal = 8 - assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + assert_allclose(link.link(link.inverse(x)), x) # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) - assert_almost_equal(link.derivative(link.inverse(x)), - 1./link.inverse_derivative(x), decimal=decimal) + assert_allclose(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x)) # for LogitLink, in the following x should be between 0 and 1. # assert_almost_equal(link.inverse_derivative(link.link(x)), # 1./link.derivative(x), decimal=decimal) @@ -79,7 +77,7 @@ def test_family_bounds(family, expected): def test_deviance_zero(family, chk_values): """Test deviance(y,y) = 0 for different families.""" for x in chk_values: - assert_almost_equal(family.deviance(x, x), 0, decimal=10) + assert_allclose(family.deviance(x, x), 0, atol=1e-9) @pytest.mark.parametrize( @@ -155,7 +153,7 @@ def test_glm_family_argument(f, fam): y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) - assert_equal(type(glm._family_instance), type(fam)) + assert isinstance(glm._family_instance, fam.__class__) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) @@ -172,7 +170,7 @@ def test_glm_link_argument(l, link): y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) - assert_equal(type(glm._link_instance), type(link)) + assert isinstance(glm._link_instance, link.__class__) glm = GeneralizedLinearRegressor(family='normal', link='not a link') with pytest.raises(ValueError): @@ -366,7 +364,7 @@ def test_glm_identiy_regression(solver): fit_intercept=False, solver=solver, start_params='zero', tol=1e-7) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef) + assert_allclose(res.coef_, coef) @pytest.mark.parametrize( @@ -375,11 +373,11 @@ def test_glm_identiy_regression(solver): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), GeneralizedHyperbolicSecant()]) -@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), - ('lbfgs', 1e-6, 6), - ('newton-cg', 1e-7, 6), - ('cd', 1e-7, 6)]) -def test_glm_log_regression(family, solver, tol, dec): +@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), + ('lbfgs', 1e-6), + ('newton-cg', 1e-7), + ('cd', 1e-7)]) +def test_glm_log_regression(family, solver, tol): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T @@ -388,7 +386,7 @@ def test_glm_log_regression(family, solver, tol, dec): alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_array_almost_equal(res.coef_, coef, decimal=dec) + assert_allclose(res.coef_, coef) @pytest.mark.filterwarnings('ignore::DeprecationWarning') @@ -421,10 +419,10 @@ def test_normal_ridge(solver, tol, dec): tol=tol, max_iter=100, solver=solver, check_input=False, random_state=rng) 
glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert glm.coef_.shape == (X.shape[1], ) + assert_allclose(glm.coef_, ridge.coef_) + assert glm.intercept_ == pytest.approx(ridge.intercept_) + assert_allclose(glm.predict(T), ridge.predict(T)) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) @@ -435,13 +433,13 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng, fit_dispersion='chisqr') glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert glm.coef_.shape == (X.shape[1], ) + assert_allclose(glm.coef_, ridge.coef_) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert_allclose(glm.predict(T), ridge.predict(T)) mu = glm.predict(X) - assert_almost_equal(glm.dispersion_, - np.sum((y-mu)**2/(n_samples-n_features))) + assert_allclose(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) # 2. With more features than samples and sparse n_samples, n_features, n_predict = 10, 100, 10 @@ -461,7 +459,7 @@ def test_normal_ridge(solver, tol, dec): tol=tol, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) + assert glm.coef_.shape == (X.shape[1], ) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) @@ -474,7 +472,7 @@ def test_normal_ridge(solver, tol, dec): tol=tol*2, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) - assert_equal(glm.coef_.shape, (X.shape[1], )) + assert glm.coef_.shape == (X.shape[1], ) assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) From 61bc6b8e2b1e227f539656744e9b4a4fa9f514f2 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 5 Jun 2019 18:45:20 +0200 Subject: [PATCH 055/269] Improve tests --- sklearn/linear_model/tests/test_glm.py | 82 +++++++++++++------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 2c8a9c3d2c72c..7d747c23ae441 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -22,9 +22,7 @@ ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge -from sklearn.utils.testing import ( - assert_almost_equal, - assert_array_equal, assert_array_almost_equal) +from sklearn.utils.testing import assert_array_equal rng = np.random.RandomState(42) @@ -355,16 +353,16 @@ def test_glm_check_input_argument(check_input): @pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) -def test_glm_identiy_regression(solver): +def test_glm_identity_regression(solver): """Test GLM regression with identity link on a simple dataset.""" - coef = [1, 2] + coef = [1., 2.] 
X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', fit_intercept=False, solver=solver, start_params='zero', tol=1e-7) res = glm.fit(X, y) - assert_allclose(res.coef_, coef) + assert_allclose(res.coef_, coef, rtol=1e-6) @pytest.mark.parametrize( @@ -386,7 +384,7 @@ def test_glm_log_regression(family, solver, tol): alpha=0, family=family, link='log', fit_intercept=False, solver=solver, start_params='guess', tol=tol) res = glm.fit(X, y) - assert_allclose(res.coef_, coef) + assert_allclose(res.coef_, coef, rtol=5e-6) @pytest.mark.filterwarnings('ignore::DeprecationWarning') @@ -420,9 +418,9 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_) - assert glm.intercept_ == pytest.approx(ridge.intercept_) - assert_allclose(glm.predict(T), ridge.predict(T)) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, solver='svd', normalize=False) @@ -434,9 +432,9 @@ def test_normal_ridge(solver, tol, dec): fit_dispersion='chisqr') glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_allclose(glm.predict(T), ridge.predict(T)) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-5) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) mu = glm.predict(X) assert_allclose(glm.dispersion_, np.sum((y-mu)**2/(n_samples-n_features))) @@ -452,7 +450,8 @@ def test_normal_ridge(solver, tol, dec): # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, - solver='sag', normalize=False, max_iter=100000) + solver='sag', normalize=False, max_iter=100000, + random_state=42) ridge.fit(X, y) glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, @@ -460,22 +459,24 @@ def test_normal_ridge(solver, tol, dec): check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, - solver='sag', normalize=False, max_iter=1000) + solver='sag', normalize=False, max_iter=1000, + random_state=42) ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=False, tol=tol*2, max_iter=300, solver=solver, check_input=False, random_state=rng) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) - assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) - assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) - assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) + assert_allclose(glm.coef_, ridge.coef_, rtol=1e-4) + assert_allclose(glm.intercept_, 
ridge.intercept_, rtol=1e-5) + assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) @pytest.mark.parametrize('solver, tol, dec', @@ -506,10 +507,8 @@ def test_poisson_ridge(solver, tol, dec): solver=solver, max_iter=300, random_state=rng) glm.fit(X, y) - assert_almost_equal(glm.intercept_, -0.12889386979, - decimal=dec) - assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], - decimal=dec) + assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) + assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-6) @pytest.mark.parametrize('diag_fisher', [False, True]) @@ -535,14 +534,14 @@ def test_normal_enet(diag_fisher): normalize=False, tol=1e-8, copy_X=True) enet.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) - assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) + assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) # 2. test normal enet on sparse data X = sparse.csc_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) - assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) + assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) def test_poisson_enet(): @@ -569,8 +568,8 @@ def test_poisson_enet(): selection='random', random_state=rng, start_params='guess') glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=2e-6) + assert_allclose(glm.coef_, glmnet_coef, rtol=2e-7) # test results with general optimization procedure def obj(coef): @@ -584,10 +583,10 @@ def obj(coef): + alpha * l1_ratio * np.sum(np.abs(coef[1:])) res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, options={'maxiter': 1000, 'disp': False}) - assert_almost_equal(glm.intercept_, res.x[0], decimal=5) - assert_almost_equal(glm.coef_, res.x[1:], decimal=5) - assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), - res.fun, decimal=8) + assert_allclose(glm.intercept_, res.x[0], rtol=1e-5) + assert_allclose(glm.coef_, res.x[1:], rtol=1e-5, atol=1e-9) + assert_allclose(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, rtol=1e-8) # same for start_params='zero' and selection='cyclic' # with reduced precision @@ -595,8 +594,8 @@ def obj(coef): link='log', solver='cd', tol=1e-5, selection='cyclic', start_params='zero') glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) + assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) # check warm_start, therefore start with different alpha glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, @@ -609,8 +608,8 @@ def obj(coef): glm.alpha = 1 X = sparse.csr_matrix(X) glm.fit(X, y) - assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) - assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) + assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) @pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) @@ -629,10 +628,11 @@ def test_binomial_enet(alpha): max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), solver='saga') log.fit(X, y) + glm = GeneralizedLinearRegressor( family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, 
alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', tol=1e-7) glm.fit(X, y) - assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) - assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) + assert_allclose(log.intercept_[0], glm.intercept_, rtol=1e-6) + assert_allclose(log.coef_[0, :], glm.coef_, rtol=2e-6) From b24a7cab9fc7d9f5dcec5b9d7657fee2d0a94283 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 5 Jun 2019 18:53:58 +0200 Subject: [PATCH 056/269] Remove unused dec parameter in tests --- sklearn/linear_model/tests/test_glm.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 7d747c23ae441..a3e943403a7a7 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -388,11 +388,11 @@ def test_glm_log_regression(family, solver, tol): @pytest.mark.filterwarnings('ignore::DeprecationWarning') -@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), - ('lbfgs', 1e-6, 5), - ('newton-cg', 1e-6, 5), - ('cd', 1e-6, 6)]) -def test_normal_ridge(solver, tol, dec): +@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), + ('lbfgs', 1e-6), + ('newton-cg', 1e-6), + ('cd', 1e-6)]) +def test_normal_ridge(solver, tol): """Test ridge regression for Normal distributions. Compare to test_ridge in test_ridge.py. @@ -479,12 +479,12 @@ def test_normal_ridge(solver, tol, dec): assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) -@pytest.mark.parametrize('solver, tol, dec', - [('irls', 1e-7, 6), - ('lbfgs', 1e-7, 5), - ('newton-cg', 1e-7, 5), - ('cd', 1e-7, 7)]) -def test_poisson_ridge(solver, tol, dec): +@pytest.mark.parametrize('solver, tol', + [('irls', 1e-7), + ('lbfgs', 1e-7), + ('newton-cg', 1e-7), + ('cd', 1e-7)]) +def test_poisson_ridge(solver, tol): """Test ridge regression with poisson family and LogLink. 
Compare to R's glmnet""" From f95b390c04fd4c0a333f08a649d9450ec44f0395 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 18 Jul 2017 21:50:10 +0200 Subject: [PATCH 057/269] ENH: add Generalized Linear Models, issue #5975 * new estimator GeneralizedLinearRegressor * loss functions for Tweedie family and Binomial * elasitc net penalties * control of penalties by matrix P2 and vector P1 * new solvers: coordinate descent, irls * tests * documentation * example for Poisson regression --- doc/modules/classes.rst | 1 + doc/modules/linear_model.rst | 129 + .../plot_poisson_spline_regression.py | 85 + sklearn/linear_model/__init__.py | 6 +- sklearn/linear_model/glm.py | 2331 +++++++++++++++++ sklearn/linear_model/tests/test_glm.py | 640 +++++ 6 files changed, 3191 insertions(+), 1 deletion(-) create mode 100644 examples/linear_model/plot_poisson_spline_regression.py create mode 100644 sklearn/linear_model/glm.py create mode 100644 sklearn/linear_model/tests/test_glm.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 2dcf582a6ab39..4158e34e8bb8c 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -727,6 +727,7 @@ Kernels: linear_model.BayesianRidge linear_model.ElasticNet linear_model.ElasticNetCV + linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars linear_model.LarsCV diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 02f406f629e04..888566fab3601 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -893,6 +893,135 @@ to warm-starting (see :term:`Glossary `). .. [9] `"Performance Evaluation of Lbfgs vs other solvers" `_ +.. _Generalized_linear_regression: + +Generalized Linear Regression +============================= + +:class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two +ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear +combination of the input variables :math:`X` via an inverse link function +:math:`h` as + +.. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). + +Secondly, the squared loss function is replaced by the deviance :math:`D` of an +exponential dispersion model (EDM) [11]_. The objective function beeing minimized +becomes + +.. math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 + +\frac{\alpha(1-\rho)}{2} w^T P_2 w + +with sample weights :math:`s`. +:math:`P_1` (diagonal matrix) can be used to exclude some of the coefficients in +the L1 penalty, the matrix :math:`P_2` (must be positive semi-definite) allows +for a more versatile L2 penalty. + +Use cases, where a loss different from the squared loss might be appropriate, +are the following: + + * If the target values :math:`y` are counts (non-negative integer valued) or + frequencies (non-negative), you might use a Poisson deviance with log-link. + + * If the target values are positive valued and skewed, you might try a + Gamma deviance with log-link. + + * If the target values seem to be heavier tailed than a Gamma distribution, + you might try an Inverse Gaussian deviance (or even higher variance powers + of the Tweedie family). + +Since the linear predictor :math:`Xw` can be negative and +Poisson, Gamma and Inverse Gaussian distributions don't support negative values, +it is convenient to apply a link function different from the identity link +:math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(Xw)=\exp(Xw)`. 
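For concreteness, the penalized objective written out above can be evaluated directly for the Poisson family with log link. The following sketch is illustrative only and is not part of this patch or of the estimator's code; the array names ``X, y, s, w, P1, P2`` and the scalars ``alpha, rho`` simply mirror the symbols in the formula and are assumptions of the example::

    # Minimal sketch: evaluate the objective
    #   1/(2*sum(s)) * D(y, yhat; s) + alpha*rho*||P1*w||_1
    #   + alpha*(1-rho)/2 * w' P2 w
    # for a Poisson GLM with log link (no intercept), using the Poisson
    # unit deviance d(y, mu) = 2*(y*log(y/mu) - y + mu).
    import numpy as np
    from scipy import special

    def poisson_objective(w, X, y, s, alpha, rho, P1, P2):
        mu = np.exp(X @ w)                      # inverse log link: h(Xw) = exp(Xw)
        # xlogy handles y = 0 (counts) by returning 0 for y*log(y/mu)
        unit_dev = 2 * (special.xlogy(y, y / mu) - y + mu)
        D = np.sum(s * unit_dev)                # weighted deviance D(y, mu; s)
        l1 = alpha * rho * np.sum(np.abs(P1 * w))          # ||P1*w||_1 term
        l2 = 0.5 * alpha * (1 - rho) * (w @ P2 @ w)        # w' P2 w term
        return D / (2 * np.sum(s)) + l1 + l2

With ``alpha=0`` this reduces to half the mean Poisson deviance, which is the quantity the solvers minimize in the unpenalized case.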
+ +Note that the feature matrix `X` should be standardized before fitting. This +ensures that the penalty treats features equally. The estimator can be used as +follows: + + >>> from sklearn.linear_model import GeneralizedLinearRegressor + >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE + GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, + check_input=True, copy_X=True, diag_fisher=False, + family='poisson', fit_dispersion=None, + fit_intercept=True, l1_ratio=0, link='log', + max_iter=100, random_state=None, selection='cyclic', + solver='auto', start_params='guess', tol=0.0001, + verbose=0, warm_start=False) + >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + array([0.24630169, 0.43373464]) + >>> reg.intercept_ #doctest: +ELLIPSIS + -0.76383633... + + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_spline_regression.py` + +Mathematical formulation +------------------------ + +In the unpenalized case, the assumptions are the following: + + * The target values :math:`y_i` are realizations of random variables + :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` + with expectation :math:`\mu_i=\mathrm{E}[Y]`, dispersion parameter + :math:`\phi` and sample weights :math:`s_i`. + * The aim is to predict the expectation :math:`\mu_i` with + :math:`\hat{y_i} = h(\eta_i)`, linear predictor + :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. + +Note that the first assumption implies +:math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance +function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the +same as specifying a unit variance function (they are one-to-one). + +Including penalties helps to avoid overfitting or, in case of L1 penalty, to +obtain sparse solutions. But there are also other motivations to include them, +e.g. accounting for the dependence structure of :math:`y`. + +The objective function, which is independent of :math:`\phi`, is minimized with +respect to the coefficients :math:`w`. + +The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +likelihood as + +.. math:: d(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) + - loglike(y,y,\phi)\right) \\ + D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) + +===================================== =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` +===================================== =============================== ================================= ============================================ +Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` +Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +===================================== =============================== ================================= ============================================ + +Two remarks: + +* The deviances for at least Normal, Poisson and Gamma distributions are + strictly consistent scoring functions for the mean :math:`\mu`, see Eq. + (19)-(20) in [12]_. 
+ +* If you want to model a frequency, i.e. counts per exposure (time, volume, ...) + you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together + with :math:`s=\mathrm{exposure}` as sample weights. + + +.. topic:: References: + + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. + See also `Exponential dispersion model. `_ + + .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ Stochastic Gradient Descent - SGD ================================= diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py new file mode 100644 index 0000000000000..fce85fae1ea8c --- /dev/null +++ b/examples/linear_model/plot_poisson_spline_regression.py @@ -0,0 +1,85 @@ +""" +================================= +Poisson Regression with B-Splines +================================= + +As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` +example, a Poisson regression with penalized B-splines (P-splines) [1]_ is +fitted on slightly different sinusodial, Poisson distributed data and +compared to an AdaBoost model with decision trees. +One can see, that this is a hard problem for both estimators. + +.. [1] Eilers, Paul H. C.; Marx, Brian D. "Flexible smoothing with B -splines + and penalties". Statist. Sci. 11 (1996), no. 2, 89--121. + `doi:10.1214/ss/1038425655 + `_ + +""" +print(__doc__) + +# Author: Christian Lorentzen +# based on the AdaBoost regression example from Noel Dawe +# License: BSD 3 clause + +# importing necessary libraries +import numpy as np +from scipy.linalg import toeplitz +# from scipy.interpolate import BSpline +from scipy.interpolate import splev +import matplotlib.pyplot as plt +from sklearn.tree import DecisionTreeRegressor +from sklearn.ensemble import AdaBoostRegressor +from sklearn.linear_model import GeneralizedLinearRegressor + + +# Create the dataset +xmin, xmax = 0, 6 +rng = np.random.RandomState(1) +X = np.linspace(xmin, xmax, 500)[:, np.newaxis] +y_true = 0.5 * (2.1 + np.sin(X).ravel() + np.sin(6 * X).ravel()) +y = rng.poisson(y_true, X.shape[0]) + +# b-spline basis +nknots, degree = 40, 3 +ns = nknots - degree - 1 # number of base spline functions +dx = (xmax - xmin) / (nknots - 1 - 2 * degree) +knots = np.linspace(xmin - degree * dx, 6 + degree * dx, nknots) +coef = np.zeros(ns) +splineBasis = np.empty((X.shape[0], ns), dtype=float) +for i in range(ns): + coef[i] = 1 +# splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ +# .ravel() + splineBasis[:, i] = splev(X, (knots, coef, degree)).ravel() + coef[i] = 0 + +# second order difference matrix +P2 = toeplitz([2, -1] + [0] * (ns - 2)).astype(float) +P2[0, 0] = P2[-1, -1] = 1 + +# Fit regression model +regr_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), + n_estimators=10, random_state=rng) + +regr_2 = GeneralizedLinearRegressor(family='poisson', link='log', + fit_intercept=True, alpha=0.02, + l1_ratio=0.1, P2=P2) + +regr_1.fit(X, y) +regr_2.fit(splineBasis, y) + +# Predict +y_1 = regr_1.predict(X) +y_2 = regr_2.predict(splineBasis) + +# Plot the results +plt.figure() +plt.plot(X, y_true, c="b", label="true mean") +plt.scatter(X, y, c="k", marker='.', label="training samples") +plt.plot(X, y_1, 
c="g", label="AdaBoost n_estimator=10", linewidth=2) +plt.plot(X, y_2, c="r", label="Poisson GLM with B-splines", linewidth=2) +plt.xlabel("data") +plt.ylabel("target") +plt.title("Regression Comparison") +plt.legend() +plt.show() diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 770a1a49b600e..cbb2ad8826358 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,6 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) +from .glm import (TweedieDistribution, + GeneralizedLinearRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -78,4 +80,6 @@ 'orthogonal_mp', 'orthogonal_mp_gram', 'ridge_regression', - 'RANSACRegressor'] + 'RANSACRegressor', + 'GeneralizedLinearRegressor', + 'TweedieDistribution'] diff --git a/sklearn/linear_model/glm.py b/sklearn/linear_model/glm.py new file mode 100644 index 0000000000000..ac0007c1789a8 --- /dev/null +++ b/sklearn/linear_model/glm.py @@ -0,0 +1,2331 @@ +""" +Generalized Linear Models with Exponential Dispersion Family +""" + +# Author: Christian Lorentzen +# some parts and tricks stolen from other sklearn files. +# License: BSD 3 clause + +# TODO: Should the option `normalize` be included (like other linear models)? +# So far, it is not included. User must pass a normalized X. +# TODO: Add cross validation support, e.g. GCV? +# TODO: Should GeneralizedLinearRegressor inherit from LinearModel? +# So far, it does not. +# TODO: Include further classes in class.rst? ExponentialDispersionModel? +# TweedieDistribution? +# TODO: Negative values in P1 are not allowed so far. They could be used +# for group lasso. + +# Design Decisions: +# - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. +# Estimators in sklearn are either regressors or classifiers. A GLM can do +# both depending on the distr (Normal => regressor, Binomial => classifier). +# Solution: GeneralizedLinearRegressor since this is the focus. +# - Allow for finer control of penalty terms: +# L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude +# factors from the L1 penalty. +# L2: w*P2*w with P2 a positive (semi-) definite matrix, e.g. P2 could be +# a 1st or 2nd order difference matrix (compare B-spline penalties and +# Tikhonov regularization). +# - The link funtion (instance of class Link) is necessary for the evaluation +# of deviance, score, Fisher and Hessian matrix as a functions of the +# coefficients, which is needed by optimizers. +# Solution: link as argument in those functions +# - Which name/symbol for sample_weight in docu? +# sklearn.linear_models uses w for coefficients, standard literature on +# GLMs use beta for coefficients and w for (sample) weights. +# So far, coefficients=w and sample weights=s. +# - The intercept term is the first index, i.e. 
coef[0] + + +from __future__ import division +from abc import ABCMeta, abstractmethod +import numbers +import numpy as np +from scipy import linalg, sparse, special +import scipy.sparse.linalg as splinalg +from scipy.optimize import fmin_l_bfgs_b +import warnings +from ..base import BaseEstimator, RegressorMixin +from ..exceptions import ConvergenceWarning +from ..utils import check_array, check_X_y +from ..utils.optimize import newton_cg +from ..utils.validation import check_is_fitted, check_random_state + + +def _check_weights(sample_weight, n_samples): + """Check that sample weights are non-negative and have the right shape.""" + if sample_weight is None: + weights = np.ones(n_samples) + elif np.isscalar(sample_weight): + if sample_weight <= 0: + raise ValueError("Sample weights must be non-negative.") + weights = sample_weight * np.ones(n_samples) + else: + _dtype = [np.float64, np.float32] + weights = check_array(sample_weight, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != n_samples: + raise ValueError("Sample weights must have the same length as " + "y") + if not np.all(weights >= 0): + raise ValueError("Sample weights must be non-negative.") + elif not np.sum(weights) > 0: + raise ValueError("Sample weights must have at least one positive " + "element.") + + return weights + + +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _safe_toarray(X): + """Returns a numpy array.""" + if sparse.issparse(X): + return X.toarray() + else: + return np.asarray(X) + + +def _safe_sandwich_dot(X, d, intercept=False): + """Compute sandwich product X.T @ diag(d) @ X. + + With ``intercept=True``, X is treated as if a column of 1 were appended as + first column of X. + X can be sparse, d must be an ndarray. Always returns a ndarray.""" + if sparse.issparse(X): + temp = (X.transpose() @ X.multiply(d[:, np.newaxis])) + # for older versions of numpy and scipy, temp may be a np.matrix + temp = _safe_toarray(temp) + else: + temp = (X.T * d) @ X + if intercept: + dim = X.shape[1] + 1 + if sparse.issparse(X): + order = 'F' if sparse.isspmatrix_csc(X) else 'C' + else: + order = 'F' if X.flags['F_CONTIGUOUS'] else 'C' + res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) + res[0, 0] = d.sum() + res[1:, 0] = d @ X + res[0, 1:] = res[1:, 0] + res[1:, 1:] = temp + else: + res = temp + return res + + +def _min_norm_sugrad(coef, grad, P2, P1): + """Compute the gradient of all subgradients with minimal L2-norm. + + subgrad = grad + P2 * coef + P1 * subgrad(|coef|_1) + + g_i = grad_i + (P2*coef)_i + + if coef_i > 0: g_i + P1_i + if coef_i < 0: g_i - P1_i + if coef_i = 0: sign(g_i) * max(|g_i|-P1_i, 0) + + Parameters + ---------- + coef : ndarray + coef[0] may be intercept. 
+ + grad : ndarray, shape=coef.shape + + P2 : {1d or 2d array, None} + always without intercept, ``None`` means P2 = 0 + + P1 : ndarray + always without intercept + """ + intercept = (coef.size == P1.size + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + # compute grad + coef @ P2 without intercept + grad_wP2 = grad[idx:].copy() + if P2 is None: + pass + elif P2.ndim == 1: + grad_wP2 += coef[idx:] * P2 + else: + grad_wP2 += coef[idx:] @ P2 + res = np.where(coef[idx:] == 0, + np.sign(grad_wP2) * np.maximum(np.abs(grad_wP2) - P1, 0), + grad_wP2 + np.sign(coef[idx:]) * P1) + if intercept: + return np.concatenate(([grad[0]], res)) + else: + return res + + +class Link(metaclass=ABCMeta): + """Abstract base class for Link funtions.""" + + @abstractmethod + def link(self, mu): + """Compute the link function g(mu). + + The link function links the mean mu=E[Y] to the so called linear + predictor (X*w), i.e. g(mu) = linear predictor. + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + raise NotImplementedError + + @abstractmethod + def derivative(self, mu): + """Compute the derivative of the link g'(mu). + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + raise NotImplementedError + + @abstractmethod + def inverse(self, lin_pred): + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linkear predictor and the mean + mu=E[Y], i.e. h(linear predictor) = mu. + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Compute the derivative of the inverse link function h'(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + @abstractmethod + def inverse_derivative2(self, lin_pred): + """Compute 2nd derivative of the inverse link function h''(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + raise NotImplementedError + + +class IdentityLink(Link): + """The identity link function g(x)=x.""" + + def link(self, mu): + return mu + + def derivative(self, mu): + return np.ones_like(mu) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.zeros_like(lin_pred) + + +class LogLink(Link): + """The log link function g(x)=log(x).""" + + def link(self, mu): + return np.log(mu) + + def derivative(self, mu): + return 1./mu + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.exp(lin_pred) + + +class LogitLink(Link): + """The logit link function g(x)=logit(x).""" + + def link(self, mu): + return special.logit(mu) + + def derivative(self, mu): + return 1. / (mu * (1 - mu)) + + def inverse(self, lin_pred): + return special.expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = special.expit(lin_pred) + return ep * (1. - ep) + + def inverse_derivative2(self, lin_pred): + ep = special.expit(lin_pred) + ep = special.expit(lin_pred) + return ep * (1. - ep) * (1. 
- 2 * ep) + + +class ExponentialDispersionModel(metaclass=ABCMeta): + r"""Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. + + Attributes + ---------- + lower_bound + upper_bound + include_lower_bound + include_upper_bound + + Methods + ------- + in_y_range + unit_variance + unit_variance_derivative + variance + variance_derivative + unit_deviance + unit_deviance_derivative + deviance + deviance_derivative + starting_mu + + _mu_deviance_derivative + _score + _fisher_matrix + _observed_information + _eta_mu_score_fisher + + References + ---------- + + https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + @property + def lower_bound(self): + """Get the lower bound of values for Y~EDM.""" + return self._lower_bound + + @property + def upper_bound(self): + """Get the upper bound of values for Y~EDM.""" + return self._upper_bound + + @property + def include_lower_bound(self): + """Get True if lower bound for y is included: y >= lower_bound.""" + return self._include_lower_bound + + @property + def include_upper_bound(self): + """Get True if upper bound for y is includede: y <= upper_bound.""" + return self._include_upper_bound + + def in_y_range(self, x): + """Returns ``True`` if x is in the valid range of Y~EDM. + + Parameters + ---------- + x : array, shape (n_samples,) + Target values. + """ + if self.include_lower_bound: + if self.include_upper_bound: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less(x, self.upper_bound)) + else: + if self.include_upper_bound: + return np.logical_and(np.greater(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater(x, self.lower_bound), + np.less(x, self.upper_bound)) + + @abstractmethod + def unit_variance(self, mu): + r"""Compute the unit variance function. + + The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as + + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + raise NotImplementedError() + + @abstractmethod + def unit_variance_derivative(self, mu): + r"""Compute the derivative of the unit variance w.r.t. mu. + + Return :math:`v'(\mu)`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Target values. + """ + raise NotImplementedError() + + def variance(self, mu, phi=1, weights=1): + r"""Compute the variance function. + + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, + with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. 
+ + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance(mu) + + def variance_derivative(self, mu, phi=1, weights=1): + r"""Compute the derivative of the variance w.r.t. mu. + + Returns + :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] + =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` + and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance_derivative(mu) + + @abstractmethod + def unit_deviance(self, y, mu): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + raise NotImplementedError() + + def unit_deviance_derivative(self, y, mu): + r"""Compute the derivative of the unit deviance w.r.t. mu. + + The derivative of the unit deviance is given by + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + return -2 * (y - mu) / self.unit_variance(mu) + + def deviance(self, y, mu, weights=1): + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return np.sum(weights * self.unit_deviance(y, mu)) + + def deviance_derivative(self, y, mu, weights=1): + """Compute the derivative of the deviance w.r.t. mu. + + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return weights * self.unit_deviance_derivative(y, mu) + + def starting_mu(self, y, weights=1, ind_weight=0.5): + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. + """ + return (ind_weight * y + + (1. 
- ind_weight) * np.average(y, weights=weights)) + + def _mu_deviance_derivative(self, coef, X, y, weights, link): + """Compute mu and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + d1 = link.inverse_derivative(lin_pred) + temp = d1 * self.deviance_derivative(y, mu, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # sampe as X.T @ temp + return mu, devp + + def _score(self, coef, phi, X, y, weights, link): + r"""Compute the score function. + + The score function is the derivative of the + log-likelihood w.r.t. `coef` (:math:`w`). + It is given by + + .. math: + + \mathbf{score}(\boldsymbol{w}) + = \frac{\partial loglike}{\partial\boldsymbol{w}} + = \mathbf{X}^T \mathbf{D} + \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, + + with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. + Note: The derivative of the deviance w.r.t. coef equals -2 * score. + """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + d = link.inverse_derivative(lin_pred) + temp = sigma_inv * d * (y - mu) + if coef.size == X.shape[1] + 1: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp + return score + + def _fisher_matrix(self, coef, phi, X, y, weights, link): + r"""Compute the Fisher information matrix. + + The Fisher information matrix, also known as expected information + matrix is given by + + .. math: + + \mathbf{F}(\boldsymbol{w}) = + \mathrm{E}\left[-\frac{\partial\mathbf{score}}{\partial + \boldsymbol{w}} \right] + = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T}\right] + = \mathbf{X}^T W \mathbf{X} \,, + + with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, + see func:`_score`. + """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + d = link.inverse_derivative(lin_pred) + d2_sigma_inv = sigma_inv * d * d + intercept = (coef.size == X.shape[1] + 1) + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) + return fisher_matrix + + def _observed_information(self, coef, phi, X, y, weights, link): + r"""Compute the observed information matrix. + + The observed information matrix, also known as the negative of + the Hessian matrix of the log-likelihood, is given by + + .. math: + + \mathbf{H}(\boldsymbol{w}) = + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T} + = \mathbf{X}^T \left[ + - \mathbf{D}' \mathbf{R} + + \mathbf{D}^2 \mathbf{V} \mathbf{R} + + \mathbf{D}^2 + \right] \boldsymbol{\Sigma}^{-1} \mathbf{X} \,, + + with :math:`\mathbf{R} = \mathrm{diag}(y_i - \mu_i)`, + :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ + v(\mu_i)} + \right)`, + see :func:`score_` function and :func:`_fisher_matrix`. 
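+
+        Note: for a canonical link (e.g. the log link for the Poisson
+        distribution) one has :math:`h'(\eta_i) = v(\mu_i)`, so the two terms
+        involving :math:`\mathbf{R}` cancel exactly and the observed
+        information coincides with the Fisher matrix of :func:`_fisher_matrix`.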
+ """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + dp = link.inverse_derivative2(lin_pred) + d2 = link.inverse_derivative(lin_pred)**2 + v = self.unit_variance_derivative(mu)/self.unit_variance(mu) + r = y - mu + temp = sigma_inv * (-dp * r + d2 * v * r + d2) + intercept = (coef.size == X.shape[1] + 1) + observed_information = _safe_sandwich_dot(X, temp, + intercept=intercept) + return observed_information + + def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, + diag_fisher=False): + """Compute linear predictor, mean, score function and fisher matrix. + + It calculates the linear predictor, the mean, score function + (derivative of log-likelihood) and Fisher information matrix + all in one go as function of `coef` (:math:`w`) and the data. + + Parameters + ---------- + diag_fisher : boolean, optional (default=False) + If ``True``, returns only an array d such that + fisher = X.T @ np.diag(d) @ X. + + Returns + ------- + (eta, mu, score, fisher) : tuple with 4 elements + The 4 elements are: + + * eta: ndarray, shape (X.shape[0],) + * mu: ndarray, shape (X.shape[0],) + * score: ndarray, shape (X.shape[0],) + * fisher: + + * If diag_fisher is ``False``, the full fisher matrix, + an array of shape (X.shape[1], X.shape[1]) + * If diag_fisher is ``True`, an array of shape (X.shape[0]) + """ + intercept = (coef.size == X.shape[1] + 1) + # eta = linear predictor + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) + d1 = link.inverse_derivative(eta) # = h'(eta) + # Alternatively: + # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g + # d1 = 1./link.derivative(mu) + d1_sigma_inv = d1 * sigma_inv + temp = d1_sigma_inv * (y - mu) + if intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X + + d2_sigma_inv = d1 * d1_sigma_inv + if diag_fisher: + fisher_matrix = d2_sigma_inv + else: + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) + return eta, mu, score, fisher_matrix + + +class TweedieDistribution(ExponentialDispersionModel): + r"""A class for the Tweedie distribution. + + A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely + defined by it's mean-variance relationship + :math:`\mathrm{Var}[Y] \propto \mu^power`. + + Special cases are: + + ===== ================ + Power Distribution + ===== ================ + 0 Normal + 1 Poisson + (0,1) Compound Poisson + 2 Gamma + 3 Inverse Gaussian + + Parameters + ---------- + power : float (default=0) + The variance power of the `unit_variance` + :math:`v(\mu) = \mu^{power}`. + For ``0 0) and (power < 1): + raise ValueError('For 0 1) and (power < 2): + # Compound Poisson + self._lower_bound = 0 + self._include_lower_bound = True + elif power == 2: + # GammaDistribution + self._lower_bound = 0 + self._include_lower_bound = False + elif (power > 2) and (power < 3): + # Positive Stable + self._lower_bound = 0 + self._include_lower_bound = False + elif power == 3: + # InverseGaussianDistribution + self._lower_bound = 0 + self._include_lower_bound = False + elif power > 3: + # Positive Stable + self._lower_bound = 0 + self._include_lower_bound = False + else: + raise ValueError('The power must be a float, i.e. 
real number, ' + 'got (power={})'.format(power)) + + @property + def power(self): + return self._power + + @power.setter + def power(self, power): + if not isinstance(power, numbers.Real): + raise TypeError('power must be a real number, input was {0}' + .format(power)) + self._power = power + + def unit_variance(self, mu): + """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + return np.power(mu, self.power) + + def unit_variance_derivative(self, mu): + """Compute the derivative of the unit variance of a Tweedie + distribution v(mu)=power*mu**(power-1). + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + return self.power * np.power(mu, self.power - 1) + + def unit_deviance(self, y, mu): + p = self.power + if p == 0: + # NormalDistribution + return (y - mu)**2 + if p == 1: + # PoissonDistribution + # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 + return 2 * (special.xlogy(y, y/mu) - y + mu) + elif p == 2: + # GammaDistribution + return 2 * (np.log(mu/y) + y/mu - 1) + else: + # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) + return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - + y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super(NormalDistribution, self).__init__(power=0) + + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def __init__(self): + super(PoissonDistribution, self).__init__(power=1) + + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super(GammaDistribution, self).__init__(power=2) + + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super(InverseGaussianDistribution, self).__init__(power=3) + + +class GeneralizedHyperbolicSecant(ExponentialDispersionModel): + """A class for the Generalized Hyperbolic Secant (GHS) distribution. + + The GHS distribution is for tagets y in (-inf, inf). + """ + def __init__(self): + self._lower_bound = -np.Inf + self._upper_bound = np.Inf + self._include_lower_bound = False + self._include_upper_bound = False + + def unit_variance(self, mu): + return 1 + mu**2 + + def unit_variance_derivative(self, mu): + return 2 * mu + + def unit_deviance(self, y, mu): + return (2 * y * (np.arctan(y) - np.arctan(mu)) + + np.log((1 + mu**2)/(1 + y**2))) + + +class BinomialDistribution(ExponentialDispersionModel): + """A class for the Binomial distribution. + + The Binomial distribution is for tagets y in [0, 1]. + """ + def __init__(self): + self._lower_bound = 0 + self._upper_bound = 1 + self._include_lower_bound = True + self._include_upper_bound = True + + def unit_variance(self, mu): + return mu * (1 - mu) + + def unit_variance_derivative(self, mu): + return 1 - 2 * mu + + def unit_deviance(self, y, mu): + return 2 * (special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + + +def _irls_step(X, W, P2, z, fit_intercept=True): + """Compute one step in iteratively reweighted least squares. + + Solve A w = b for w with + A = (X' W X + P2) + b = X' W z + z = eta + D^-1 (y-mu) + + See also fit method of :class:`GeneralizedLinearRegressor`. 
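+
+    Here ``W = h'(eta)**2 / v(mu)`` are the working weights (with ``phi=1``)
+    and ``z`` are the working observations; see the notes in
+    :func:`_irls_solver` for the derivation of this Fisher scoring step.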
+ + Parameters + ---------- + X : {ndarray, sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present) + + W : ndarray, shape (n_samples,) + + P2 : {ndarray, sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix) + + z : ndarray, shape (n_samples,) + Working observations + + fit_intercept : boolean, optional (default=True) + + Returns + ------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + """ + # Note: solve vs least squares, what is more appropriate? + # scipy.linalg.solve seems faster, but scipy.linalg.lstsq + # is more robust. + # Note: X.T @ W @ X is not sparse, even when X is sparse. + # Sparse solver would splinalg.spsolve(A, b) or splinalg.lsmr(A, b) + if fit_intercept: + Wz = W * z + if sparse.issparse(X): + b = np.concatenate(([Wz.sum()], X.transpose() @ Wz)) + else: + b = np.concatenate(([Wz.sum()], X.T @ Wz)) + A = _safe_sandwich_dot(X, W, intercept=fit_intercept) + if P2.ndim == 1: + idx = np.arange(start=1, stop=A.shape[0]) + A[(idx, idx)] += P2 # add to diag elements without intercept + elif sparse.issparse(P2): + A[1:, 1:] += P2.toarray() + else: + A[1:, 1:] += P2 + else: + if sparse.issparse(X): + XtW = X.transpose().multiply(W) + # for older versions of numpy and scipy, A may be a np.matrix + A = _safe_toarray(XtW @ X) + else: + XtW = (X.T * W) + A = XtW @ X + b = XtW @ z + if P2.ndim == 1: + A[np.diag_indices_from(A)] += P2 + elif sparse.issparse(P2): + A += P2.toarray() + else: + A += P2 + # coef = linalg.solve(A, b, overwrite_a=True, overwrite_b=True) + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) + return coef + + +def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, + max_iter, tol): + """Solve GLM with L2 penalty by IRLS algorithm. + + Note: If X is sparse, P2 must also be sparse. + """ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: P2 must be symmetrized + # Note: ' denotes derivative, but also transpose for matrices + + # eta = linear predictor + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + n_iter = 0 + while n_iter < max_iter: + n_iter += 1 + # coef_old not used so far. + # coef_old = coef + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = hp**2 / V + # working observations + z = eta + (y - mu) / hp + # solve A*coef = b + # A = X' W X + P2, b = X' W z + coef = _irls_step(X, W, P2, z, fit_intercept=fit_intercept) + # updated linear predictor + # do it here for updated values for tolerance + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + + # which tolerace? |coef - coef_old| or gradient? 
+ # use gradient for compliance with newton-cg and lbfgs + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + temp = hp * (y - mu) / V + if sparse.issparse(X): + gradient = -(X.transpose() @ temp) + else: + gradient = -(X.T @ temp) + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept + if P2.ndim == 1: + gradient += P2 * coef[idx:] + else: + gradient += P2 @ coef[idx:] + if fit_intercept: + gradient = np.concatenate(([-temp.sum()], gradient)) + if (np.max(np.abs(gradient)) <= tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter + + +def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=1000, selection='cyclic', + random_state=None, diag_fisher=False): + """Compute inner loop of coordinate descent, i.e. cycles through features. + + Minimization of 1-d subproblems:: + + min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 + + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + Note: f'=-score and H=fisher are updated at the end of outer iteration. + """ + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 + n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + B = fisher + if P2.ndim == 1: + coef_P2 = coef[idx:] * P2 + if not diag_fisher: + idiag = np.arange(start=idx, stop=B.shape[0]) + # B[np.diag_indices_from(B)] += P2 + B[(idiag, idiag)] += P2 + else: + coef_P2 = coef[idx:] @ P2 + if not diag_fisher: + if sparse.issparse(P2): + B[idx:, idx:] += P2.toarray() + else: + B[idx:, idx:] += P2 + # A = -score + coef_P2 + A = -score + A[idx:] += coef_P2 + # A += d @ (H+P2) but so far d=0 + # inner loop + for inner_iter in range(1, max_inner_iter+1): + inner_iter += 1 + n_cycles += 1 + # cycle through features, update intercept separately at the end + if selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, see 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + jdx = j+idx # index for arrays containing entries for intercept + a = A[jdx] + if diag_fisher: + # Note: fisher is ndarray of shape (n_samples,) => no idx + # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway + Bj = np.zeros_like(A) + if intercept: + Bj[0] = fisher.sum() + if sparse.issparse(X): + Bj[idx:] = _safe_toarray(X[:, j].transpose() @ + X.multiply(fisher[:, np.newaxis]) + ).ravel() + else: + Bj[idx:] = (fisher * X[:, j]) @ X + + if P2.ndim == 1: + Bj[idx:] += P2[j] + else: + if sparse.issparse(P2): + # slice columns as P2 is csc + Bj[idx:] += P2[:, j].toarray().ravel() + else: + Bj[idx:] += P2[:, j] + b = Bj[jdx] + else: + b = B[jdx, jdx] + + # those ten lines aree what it is all about + if b <= 0: + z = 0 + elif P1[j] == 0: + z = -a/b + elif a + P1[j] < b * (coef[jdx] + d[jdx]): + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[jdx] + d[jdx]): + z = -(a - P1[j])/b + else: + z = -(coef[jdx] + d[jdx]) + + # update direction d + d[jdx] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if diag_fisher: + # Bj = B[:, j] calculated above, still valid + A += Bj * z + else: + # B is symmetric, C- or F-contiguous, but never sparse + if B.flags['F_CONTIGUOUS']: + # slice columns like for sparse csc + A += B[:, jdx] * z + else: # B.flags['C_CONTIGUOUS'] might be true + # slice rows + A += B[jdx, :] * z + # end of cycle over features + # update intercept + if intercept: + if diag_fisher: + Bj = np.zeros_like(A) + Bj[0] = fisher.sum() + Bj[1:] = fisher @ X + b = Bj[0] + else: + b = B[0, 0] + z = 0 if b <= 0 else -A[0]/b + d[0] += z + if diag_fisher: + A += Bj * z + else: + if B.flags['F_CONTIGUOUS']: + A += B[:, 0] * z + else: + A += B[0, :] * z + # end of complete cycle + # stopping criterion for inner loop + # sum_i(|minimum of norm of subgrad of q(d)_i|) + # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 + mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. + break + # end of inner loop + return d, coef_P2, n_cycles, inner_tol + + +def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, + max_iter=100, max_inner_iter=1000, tol=1e-4, + selection='cyclic ', random_state=None, + diag_fisher=False, copy_X=True): + """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. + + The objective beeing minimized in the coefficients w=coef is:: + + F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 + + An Improved GLMNET for L1-regularized Logistic Regression: + + 1. Find optimal descent direction d by minimizing + min_d F(w+d) = min_d F(w+d) - F(w) + 2. Quadrdatic approximation of F(w+d)-F(w) = q(d): + using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: + q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + + ||P1*(w+d)||_1 - ||P1*w||_1 + Then minimize q(d): min_d q(d) + 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): + min_z q(d+z*e_j) + = min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + A = f'(w) + d*H(w) + (w+d)*P2 + B = H + P2 + + Repeat steps 1-3 until convergence. + Note: Use Fisher matrix instead of Hessian for H. 
+ Note: f' = -score, H = Fisher matrix + + Parameters + ---------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + + X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present). If not sparse, + pass directly as Fortran-contiguous data to avoid + unnecessary memory duplication. + + y : ndarray, shape (n_samples,) + Target values. + + weights: ndarray, shape (n_samples,) + Sample weights with which the deviance is weighted. The weights must + bee normalized and sum to 1. + + P1 : {ndarray}, shape (n_features,) + The L1-penalty vector (=diagonal matrix) + + P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix). If a matrix is + passed, it must be symmetric. If X is sparse, P2 must also be sparse. + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + family : ExponentialDispersionModel + + link : Link + + max_iter : int, optional (default=100) + Maximum numer of outer (Newton) iterations. + + max_inner_iter : int, optional (default=1000) + Maximum number of iterations in each inner loop, i.e. max number of + cycles over all features per inner loop. + + tol : float, optional (default=1e-4) + Covergence criterion is + sum_i(|minimum of norm of subgrad of objective_i|)<=tol. + + selection : str, optional (default='cyclic') + If 'random', randomly chose features in inner loop. + + random_state : {int, RandomState instance, None}, optional (default=None) + + diag_fisher : boolean, optional (default=False) + ``False`` calculates full fisher matrix, ``True`` only diagonal matrix + s.t. fisher = X.T @ diag @ X. This saves storage but needs more + matrix-vector multiplications. + + copy_X : boolean, optional (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + Returns + ------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + + n_iter : numer of outer iterations = newton iterations + + n_cycles : number of cycles over features + + References + ---------- + Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + X = check_array(X, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if P2.ndim == 2: + P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], + order='F', copy=copy_X) + if sparse.issparse(X): + if not sparse.isspmatrix_csc(X): + raise ValueError("If X is sparse, it must be in csc format" + "; got (format={})".format(X.format)) + if not sparse.isspmatrix_csc(P2): + raise ValueError("If X is sparse, P2 must also be sparse csc" + "format. 
Got P2 not sparse.") + random_state = check_random_state(random_state) + # Note: we already set P2 = l2*P2, P1 = l1*P1 + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + n_iter = 0 # number of outer iterations + n_cycles = 0 # number of (complete) cycles over features + converged = False + n_samples, n_features = X.shape + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept + # line search parameters + (beta, sigma) = (0.5, 0.01) + # some precalculations + # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a + # 1d array representing a diagonal matrix. + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while n_iter < max_iter: + n_iter += 1 + # initialize search direction d (to be optimized) with zero + d.fill(0) + # inner loop = _cd_cycle + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=max_inner_iter, selection=selection, + random_state=random_state, diag_fisher=diag_fisher) + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) + P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * (coef_P2 @ coef[idx:]) + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + coef_wd = coef + la * d + mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1 * coef_wd[idx:], ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) + else: + Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) + if Fwd - Fw <= sigma * la * bound: + break + # update coefficients + # coef_old = coef.copy() + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # stopping criterion for outer loop + # sum_i(|minimum-norm of subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + mn_subgrad = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the maximum number of iterations max_iter" + " (currently {0})".format(max_iter), ConvergenceWarning) + + return coef, n_iter, n_cycles + + +class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): + """Regression via a Generalized Linear Model (GLM) with penalties. + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). 
Therefore, + the fit minimizes the following objective function with combined L1 and L2 + priors as regularizer:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + + alpha * l1_ratio * ||P1*w||_1 + + 1/2 * alpha * (1 - l1_ratio) * w*P2*w + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + For ``P1=P2='identity'``, the penalty is the elastic net:: + + alpha * l1_ratio * ||w||_1 + + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 + + If you are interested in controlling the L1 and L2 penalties + separately, keep in mind that this is equivalent to:: + + a * L1 + b * L2 + + where:: + + alpha = a + b and l1_ratio = a / (a + b) + + The parameter ``l1_ratio`` corresponds to alpha in the R package glmnet, + while ``alpha`` corresponds to the lambda parameter in glmnet. + Specifically, l1_ratio = 1 is the lasso penalty. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms und thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + l1_ratio : float, optional (default=0) + The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + + P1 : {'identity', array-like}, shape (n_features,), optional \ + (default='identity') + With this array, you can exclude coefficients from the L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``'identity'`` is the same as a 1d array of ones. + Note that n_features = X.shape[1]. + + P2 : {'identity', array-like, sparse matrix}, shape \ + (n_features,) or (n_features, n_features), optional \ + (default='identity') + With this option, you can set the P2 matrix in the L2 penalty `w*P2*w`. + This gives a fine control over this penalty (Tikhonov regularization). + A 2d array is directly used as the square matrix P2. A 1d array is + interpreted as diagonal (square) matrix. The default 'identity' sets + the identity matrix, which gives the usual squared L2-norm. If you just + want to exclude certain coefficients, pass a 1d array filled with 1, + and 0 for the coefficients to be excluded. + Note that P2 must be positive semi-definite. + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian', 'binomial'} \ + or an instance of class ExponentialDispersionModel, \ + optional(default='normal') + The distributional assumption of the GLM, i.e. which distribution from + the EDM, specifies the loss function to be minimized. + + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, \ + optional (default='auto') + The link function of the GLM, i.e. mapping from linear predictor + (X*coef) to expectation (mu). 
Option 'auto' sets the link depending on + the chosen family as follows: + + - 'identity' for family 'normal' + + - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' + + - 'logit' for family 'binomial' + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (defaul=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statisic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ + optional (default='auto') + Algorithm to use in the optimization problem: + + 'auto' + Sets 'irls' if l1_ratio equals 0, else 'cd'. + + 'cd' + Coordinate descent algorithm. It can deal with L1 as well as L2 + penalties. Note that in order to avoid unnecessary memory + duplication of X in the ``fit`` method, X should be directly passed + as a Fortran-contiguous numpy array or sparse csc matrix. + + 'irls' + Iterated reweighted least squares. + It is the standard algorithm for GLMs. It cannot deal with + L1 penalties. + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. It cannot deal with L1 penalties. + + 'newton-cg', 'lbfgs' + Newton conjugate gradient algorithm cannot deal with L1 penalties. + + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. + + max_iter : int, optional (default=100) + The maximal number of iterations for solver algorithms. + + tol : float, optional (default=1e-4) + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative) of + the objective function. For the cd solver, covergence is reached + when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the + subgradient of the objective and minimum-norm of ``g_i`` is the element + of the subgradient ``g_i`` with the smallest L2-norm. + + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to ``fit``), option ``start_params`` sets the + start values for ``coef_`` and ``intercept_``. + + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'guess' + Start values of mu are calculated by family.starting_mu(..). Then, + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step, else the Newton step is + calculated by the cd solver. + This gives usually good starting values. + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the weighted average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. + + selection : str, optional (default='cyclic') + For the solver 'cd' (coordinate descent), the coordinates (features) + can be updated in either cyclic or random order. 
+ If set to 'random', a random coefficient is updated every iteration + rather than looping over features sequentially in the same order. This + (setting to 'random') often leads to significantly faster convergence + especially when tol is higher than 1e-4. + + random_state : {int, RandomState instance, None}, optional (default=None) + The seed of the pseudo random number generator that selects a random + feature to be updated for solver 'cd' (coordinate descent). + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + + diag_fisher : boolean, optional, (default=False) + Only relevant for solver 'cd' (see also ``start_params='guess'``). + If ``False``, the full Fisher matrix (expected Hessian) is computed in + each outer iteration (Newton iteration). If ``True``, only a diagonal + matrix (stored as 1d array) is computed, such that + fisher = X.T @ diag @ X. This saves memory and matrix-matrix + multiplications, but needs more matrix-vector multiplications. If you + use large sparse X or if you have many features, + i.e. n_features >> n_samples, you might set this option to ``True``. + + copy_X : boolean, optional, (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + check_input : boolean, optional (default=True) + Allow to bypass several checks on input: y values in range of family, + sample_weight non-negative, P2 positive semi-definite. + Don't use this parameter unless you know what you do. + + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. + + n_iter_ : int + Actual number of iterations used in solver. + + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. + + For alpha > 0, the feature matrix X should be standardized in order to + penalize features equally strong. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + + If the target y is a ratio, appropriate sample weights s should be + provided. + As an example, consider Poission distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, + sample_weight=s)``. The weights are necessary for the right (finite + sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distributions. + The same holds for other distributions. 
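+
+    An illustrative sketch of this weighted Poisson setup (editorial example
+    with made-up numbers; the public import path is assumed and not part of
+    the original patch)::
+
+        >>> import numpy as np
+        >>> from sklearn.linear_model import GeneralizedLinearRegressor
+        >>> X = np.array([[1.], [2.], [3.], [4.]])
+        >>> exposure = np.array([10., 20., 30., 40.])  # weights s
+        >>> counts = np.array([3., 11., 14., 27.])     # Poisson counts z
+        >>> y = counts / exposure                      # y = z/s
+        >>> reg = GeneralizedLinearRegressor(family='poisson')
+        >>> reg.fit(X, y, sample_weight=exposure)  # doctest: +SKIP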
+ + References + ---------- + For the coordinate descent implementation: + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', + fit_intercept=True, family='normal', link='auto', + fit_dispersion=None, solver='auto', max_iter=100, + tol=1e-4, warm_start=False, start_params='guess', + selection='cyclic', random_state=None, diag_fisher=False, + copy_X=True, check_input=True, verbose=0): + self.alpha = alpha + self.l1_ratio = l1_ratio + self.P1 = P1 + self.P2 = P2 + self.fit_intercept = fit_intercept + self.family = family + self.link = link + self.fit_dispersion = fit_dispersion + self.solver = solver + self.max_iter = max_iter + self.tol = tol + self.warm_start = warm_start + self.start_params = start_params + self.selection = selection + self.random_state = random_state + self.diag_fisher = diag_fisher + self.copy_X = copy_X + self.check_input = check_input + self.verbose = verbose + + def fit(self, X, y, sample_weight=None): + """Fit a Generalized Linear Model. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + + y : array-like, shape (n_samples,) + Target values. + + sample_weight : {None, array-like}, shape (n_samples,),\ + optinal (default=None) + Individual weights w_i for each sample. Note that for an + Exponential Dispersion Model (EDM), one has + Var[Y_i]=phi/w_i * v(mu). + If Y_i ~ EDM(mu, phi/w_i), then + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + weighted average with weights=sample_weight. + + Returns + ------- + self : returns an instance of self. + """ + ####################################################################### + # 1. 
input validation # + ####################################################################### + # 1.1 validate arguments of __init__ ################################## + # Guarantee that self._family_instance is an instance of class + # ExponentialDispersionModel + if isinstance(self.family, ExponentialDispersionModel): + self._family_instance = self.family + else: + if self.family == 'normal': + self._family_instance = NormalDistribution() + elif self.family == 'poisson': + self._family_instance = PoissonDistribution() + elif self.family == 'gamma': + self._family_instance = GammaDistribution() + elif self.family == 'inverse.gaussian': + self._family_instance = InverseGaussianDistribution() + elif self.family == 'binomial': + self._family_instance = BinomialDistribution() + else: + raise ValueError( + "The family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " + "'binomial']; got (family={0})".format(self.family)) + + # Guarantee that self._link_instance is set to an instance of + # class Link + if isinstance(self.link, Link): + self._link_instance = self.link + else: + if self.link == 'auto': + if isinstance(self._family_instance, TweedieDistribution): + if self._family_instance.power <= 0: + self._link_instance = IdentityLink() + if self._family_instance.power >= 1: + self._link_instance = LogLink() + elif isinstance(self._family_instance, + GeneralizedHyperbolicSecant): + self._link_instance = IdentityLink() + elif isinstance(self._family_instance, BinomialDistribution): + self._link_instance = LogitLink() + else: + raise ValueError("No default link known for the " + "specified distribution family. Please " + "set link manually, i.e. not to 'auto'; " + "got (link='auto', family={}" + .format(self.family)) + elif self.link == 'identity': + self._link_instance = IdentityLink() + elif self.link == 'log': + self._link_instance = LogLink() + elif self.link == 'logit': + self._link_instance = LogitLink() + else: + raise ValueError( + "The link must be an instance of class Link or " + "an element of ['auto', 'identity', 'log', 'logit']; " + "got (link={0})".format(self.link)) + + # validate further arguments + if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: + raise ValueError("Penalty term must be a non-negative number;" + " got (alpha={0})".format(self.alpha)) + if (not isinstance(self.l1_ratio, numbers.Number) or + self.l1_ratio < 0 or self.l1_ratio > 1): + raise ValueError("l1_ratio must be a number in interval [0, 1];" + " got (l1_ratio={0})".format(self.l1_ratio)) + if not isinstance(self.fit_intercept, bool): + raise ValueError("The argument fit_intercept must be bool;" + " got {0}".format(self.fit_intercept)) + if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: + raise ValueError("GeneralizedLinearRegressor supports only solvers" + " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" + " got {0}".format(self.solver)) + solver = self.solver + if self.solver == 'auto': + if self.l1_ratio == 0: + solver = 'irls' + else: + solver = 'cd' + if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) + if (not isinstance(self.max_iter, int) + or self.max_iter <= 0): + raise ValueError("Maximum number of iteration must be a positive " + "integer;" + " got (max_iter={0!r})".format(self.max_iter)) + if not isinstance(self.tol, numbers.Number) or self.tol <= 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol={0!r})".format(self.tol)) + if not isinstance(self.warm_start, bool): + raise ValueError("The argument warm_start must be bool;" + " got {0}".format(self.warm_start)) + if self.selection not in ['cyclic', 'random']: + raise ValueError("The argument selection must be 'cyclic' or " + "'random'; got (selection={0})" + .format(self.selection)) + random_state = check_random_state(self.random_state) + if not isinstance(self.diag_fisher, bool): + raise ValueError("The argument diag_fisher must be bool;" + " got {0}".format(self.diag_fisher)) + if not isinstance(self.copy_X, bool): + raise ValueError("The argument copy_X must be bool;" + " got {0}".format(self.copy_X)) + if not isinstance(self.check_input, bool): + raise ValueError("The argument check_input must be bool; got " + "(check_input={0})".format(self.check_input)) + + family = self._family_instance + link = self._link_instance + + # 1.2 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + if solver == 'cd': + _stype = ['csc'] + else: + _stype = ['csc', 'csr'] + X, y = check_X_y(X, y, accept_sparse=_stype, + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) + + weights = _check_weights(sample_weight, y.shape[0]) + + n_samples, n_features = X.shape + + # 1.3 arguments to take special care ################################## + # P1, P2, start_params + if isinstance(self.P1, str) and self.P1 == 'identity': + P1 = np.ones(n_features) + else: + P1 = np.atleast_1d(self.P1) + try: + P1 = P1.astype(np.float64, casting='safe', copy=False) + except TypeError: + raise TypeError("The given P1 cannot be converted to a numeric" + "array; got (P1.dtype={0})." + .format(P1.dtype)) + if (P1.ndim != 1) or (P1.shape[0] != n_features): + raise ValueError("P1 must be either 'identity' or a 1d array " + "with the length of X.shape[1]; " + "got (P1.shape[0]={0}), " + "needed (X.shape[1]={1})." + .format(P1.shape[0], n_features)) + # If X is sparse, make P2 sparse, too. 
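+        # Illustration (editorial, not part of the original patch): with
+        # n_features == 3, P1 = np.array([1., 1., 0.]) removes the L1 penalty
+        # from the third coefficient only, and a 1d P2 with the same entries
+        # likewise removes it from the quadratic term w*P2*w.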
+ if isinstance(self.P2, str) and self.P2 == 'identity': + if sparse.issparse(X): + P2 = (sparse.dia_matrix((np.ones(n_features), 0), + shape=(n_features, n_features))).tocsc() + else: + P2 = np.ones(n_features) + else: + P2 = check_array(self.P2, copy=True, + accept_sparse=_stype, + dtype=_dtype, ensure_2d=False) + if P2.ndim == 1: + P2 = np.asarray(P2) + if P2.shape[0] != n_features: + raise ValueError("P2 should be a 1d array of shape " + "(n_features,) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},)), needed ({1},)" + .format(P2.shape[0], X.shape[1])) + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(n_features, n_features))).tocsc() + elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and + P2.shape[0] == X.shape[1]): + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(n_features, n_features))).tocsc() + else: + raise ValueError("P2 must be either None or an array of shape " + "(n_features, n_features) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0}, {1})), needed ({2}, {2})" + .format(P2.shape[0], P2.shape[1], X.shape[1])) + + start_params = self.start_params + if isinstance(start_params, str): + if start_params not in ['guess', 'zero']: + raise ValueError("The argument start_params must be 'guess', " + "'zero' or an array of correct length; " + "got(start_params={0})".format(start_params)) + else: + start_params = check_array(start_params, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): + raise ValueError("Start values for parameters must have the" + "right length and dimension; required (length" + "={0}, ndim=1); got (length={1}, ndim={2})." + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0], + start_params.ndim)) + + l1 = self.alpha * self.l1_ratio + l2 = self.alpha * (1 - self.l1_ratio) + # P1 and P2 are now for sure copies + P1 = l1 * P1 + P2 = l2 * P2 + # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') + # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric + if P2.ndim == 2: + if sparse.issparse(P2): + if sparse.isspmatrix_csc(P2): + P2 = 0.5 * (P2 + P2.transpose()).tocsc() + else: + P2 = 0.5 * (P2 + P2.transpose()).tocsr() + else: + P2 = 0.5 * (P2 + P2.T) + + # For coordinate descent, if X is sparse, P2 must also be csc + if solver == 'cd' and sparse.issparse(X): + P2 = sparse.csc_matrix(P2) + + # 1.4 additional validations ########################################## + if self.check_input: + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # check if P1 has only non-negative values, negative values might + # indicate group lasso in the future. 
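+            # (editorial note: negative entries in P1 would reward large
+            # coefficients and make the penalty term non-convex, hence the
+            # check below)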
+            if not isinstance(self.P1, str):  # if self.P1 != 'identity':
+                if not np.all(P1 >= 0):
+                    raise ValueError("P1 must not have negative values.")
+            # check if P2 is positive semidefinite
+            # np.linalg.cholesky(P2) 'only' asserts positive definite
+            if not isinstance(self.P2, str):  # self.P2 != 'identity'
+                # due to numerical precision, we allow eigenvalues to be a
+                # tiny bit negative
+                epsneg = -10 * np.finfo(P2.dtype).epsneg
+                if P2.ndim == 1 or P2.shape[0] == 1:
+                    p2 = P2
+                    if sparse.issparse(P2):
+                        p2 = P2.toarray()
+                    if not np.all(p2 >= 0):
+                        raise ValueError("1d array P2 must not have negative "
+                                         "values.")
+                elif sparse.issparse(P2):
+                    # for sparse matrices, not all eigenvals can be computed
+                    # efficiently, use only half of n_features
+                    # k = how many eigenvals to compute
+                    k = np.min([10, n_features // 10 + 1])
+                    sigma = 0  # start searching near this value
+                    which = 'SA'  # find smallest algebraic eigenvalues first
+                    # return_eigenvectors=False so that eigsh returns only the
+                    # eigenvalues; otherwise it returns a (values, vectors)
+                    # tuple and the comparison below would fail
+                    if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma,
+                                                 which=which,
+                                                 return_eigenvectors=False
+                                                 ) >= epsneg):
+                        raise ValueError("P2 must be positive semi-definite.")
+                else:
+                    if not np.all(linalg.eigvalsh(P2) >= epsneg):
+                        raise ValueError("P2 must be positive semi-definite.")
+            # TODO: if alpha=0 check that X is not rank deficient
+            # TODO: what else to check?
+
+        #######################################################################
+        # 2. rescaling of weights (sample_weight)                             #
+        #######################################################################
+        # IMPORTANT NOTE: Since we want to minimize
+        # 1/(2*sum(sample_weight)) * deviance + L1 + L2,
+        # deviance = sum(sample_weight * unit_deviance),
+        # we rescale weights such that sum(weights) = 1 and this becomes
+        # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance)
+        weights_sum = np.sum(weights)
+        weights = weights/weights_sum
+
+        #######################################################################
+        # 3. initialization of coef = (intercept_, coef_)                     #
+        #######################################################################
+        # Note: Since phi=self.dispersion_ does not enter the estimation
+        # of mu_i=E[y_i], set it to 1.
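+        # Summary of the initialization branches below (editorial):
+        #   * warm_start with an existing coef_: reuse the previous solution
+        #   * start_params='guess': mu = family.starting_mu(y), then one
+        #     IRLS step (solver='irls') or one cd/Newton step otherwise
+        #   * start_params='zero': coef = 0 and, if fit_intercept, the
+        #     intercept is set to link(weighted average of y)
+        #   * array-like start_params: used directly as start values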
+ + # set start values for coef + coef = None + if self.warm_start and hasattr(self, 'coef_'): + if self.fit_intercept: + coef = np.concatenate((np.array([self.intercept_]), + self.coef_)) + else: + coef = self.coef_ + elif isinstance(start_params, str): + if start_params == 'guess': + # Set mu=starting_mu of the family and do one Newton step + # If solver=cd use cd, else irls + mu = family.starting_mu(y, weights=weights) + eta = link.link(mu) # linear predictor + if solver in ['cd', 'lbfgs', 'newton-cg']: + # see function _cd_solver + sigma_inv = 1/family.variance(mu, phi=1, weights=weights) + d1 = link.inverse_derivative(eta) + temp = sigma_inv * d1 * (y - mu) + if self.fit_intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp + + d2_sigma_inv = d1 * d1 * sigma_inv + diag_fisher = self.diag_fisher + if diag_fisher: + fisher = d2_sigma_inv + else: + fisher = \ + _safe_sandwich_dot(X, d2_sigma_inv, + intercept=self.fit_intercept) + # set up space for search direction d for inner loop + if self.fit_intercept: + coef = np.zeros(n_features+1) + else: + coef = np.zeros(n_features) + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # use less restrictive tolerance for initial guess + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, + P1=P1) + inner_tol = 4 * linalg.norm(inner_tol, ord=1) + # just one outer loop = Newton step + n_cycles = 0 + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, + inner_tol, max_inner_iter=1000, + selection=self.selection, + random_state=self.random_state, + diag_fisher=self.diag_fisher) + coef += d # for simplicity no line search here + else: + # See _irls_solver + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) + else: # start_params == 'zero' + if self.fit_intercept: + coef = np.zeros(n_features+1) + coef[0] = link.link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) + else: # assign given array as start values + coef = start_params + + ####################################################################### + # 4. fit # + ####################################################################### + # algorithms for optimiation + # TODO: Parallelize it? 
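+        # Solver dispatch in sections 4.1-4.4 below (editorial summary):
+        #   * 'irls'     : iteratively reweighted least squares, L2 penalty only
+        #   * 'lbfgs'    : scipy's fmin_l_bfgs_b on obj = 1/2 dev + 1/2 w*P2*w
+        #   * 'newton-cg': same objective; Hessian-vector products use the
+        #                  Fisher matrix (expected Hessian)
+        #   * 'cd'       : coordinate descent (Improved GLMNET), the only
+        #                  solver that also handles the L1 term P1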
+ + # 4.1 IRLS ############################################################ + # Note: we already set P2 = l2*P2, see above + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + if solver == 'irls': + coef, self.n_iter_ = \ + _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, + fit_intercept=self.fit_intercept, family=family, + link=link, max_iter=self.max_iter, tol=self.tol) + + # 4.2 L-BFGS ########################################################## + elif solver == 'lbfgs': + def func(coef, X, y, weights, P2, family, link): + mu, devp = \ + family._mu_deviance_derivative(coef, X, y, weights, link) + dev = family.deviance(y, mu, weights) + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + if P2.ndim == 1: + L2 = P2 * coef[idx:] + else: + L2 = P2 @ coef[idx:] + obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) + objp = 0.5 * devp + objp[idx:] += L2 + return obj, objp + + args = (X, y, weights, P2, family, link) + coef, loss, info = fmin_l_bfgs_b( + func, coef, fprime=None, args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter, factr=1e3) + if self.verbose > 0: + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) + self.n_iter_ = info['nit'] + + # 4.3 Newton-CG ####################################################### + # We use again the fisher matrix instead of the hessian. More + # precisely, expected hessian of deviance. + elif solver == 'newton-cg': + def func(coef, X, y, weights, P2, family, link): + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + if P2.ndim == 1: + L2 = coef[idx:] @ (P2 * coef[idx:]) + else: + L2 = coef[idx:] @ (P2 @ coef[idx:]) + mu = link.inverse(_safe_lin_pred(X, coef)) + return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 + + def grad(coef, X, y, weights, P2, family, link): + mu, devp = \ + family._mu_deviance_derivative(coef, X, y, weights, link) + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + if P2.ndim == 1: + L2 = P2 * coef[idx:] + else: + L2 = P2 @ coef[idx:] + objp = 0.5 * devp + objp[idx:] += L2 + return objp + + def grad_hess(coef, X, y, weights, P2, family, link): + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + if P2.ndim == 1: + L2 = P2 * coef[idx:] + else: + L2 = P2 @ coef[idx:] + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + d1 = link.inverse_derivative(eta) + temp = d1 * family.deviance_derivative(y, mu, weights) + if intercept: + grad = np.concatenate(([0.5 * temp.sum()], + 0.5 * temp @ X + L2)) + else: + grad = 0.5 * temp @ X + L2 # sampe as 0.5* X.T @ temp + L2 + + # expected hessian = fisher = X.T @ diag_matrix @ X + # calculate only diag_matrix + diag = d1**2 / family.variance(mu, phi=1, weights=weights) + if intercept: + h0i = np.concatenate(([diag.sum()], diag @ X)) + + def Hs(coef): + # return (0.5 * fisher + P2) @ coef + # ret = 0.5 * (X.T @ (diag * (X @ coef))) + ret = 0.5 * ((diag * (X @ coef[idx:])) @ X) + if P2.ndim == 1: + ret += P2 * coef[idx:] + else: + ret += P2 @ coef[idx:] + if intercept: + ret = np.concatenate(([0.5 * (h0i @ coef)], + ret + 0.5 * coef[0] * h0i[1:])) + return ret + + return grad, Hs + + args = (X, y, weights, P2, family, link) + coef, self.n_iter_ = 
newton_cg(grad_hess, func, grad, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) + + # 4.4 coordinate descent ############################################## + # Note: we already set P1 = l1*P1, see above + # Note: we already set P2 = l2*P2, see above + # Note: we already symmetriezed P2 = 1/2 (P2 + P2') + elif solver == 'cd': + coef, self.n_iter_, self._n_cycles = \ + _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, + P2=P2, fit_intercept=self.fit_intercept, + family=family, link=link, + max_iter=self.max_iter, tol=self.tol, + selection=self.selection, random_state=random_state, + diag_fisher=self.diag_fisher, copy_X=self.copy_X) + + ####################################################################### + # 5. postprocessing # + ####################################################################### + if self.fit_intercept: + self.intercept_ = coef[0] + self.coef_ = coef[1:] + else: + # set intercept to zero as the other linear models do + self.intercept_ = 0. + self.coef_ = coef + + if self.fit_dispersion in ['chisqr', 'deviance']: + # attention because of rescaling of weights + self.dispersion_ = self.estimate_phi(X, y, weights)*weights_sum + + return self + + def linear_predictor(self, X): + """Compute the linear_predictor = X*coef_ + intercept_. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Samples. + + Returns + ------- + C : array, shape (n_samples,) + Returns predicted values of linear predictor. + """ + check_is_fitted(self, "coef_") + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype='numeric', copy=True, ensure_2d=True, + allow_nd=False) + return X @ self.coef_ + self.intercept_ + + def predict(self, X, sample_weight=None): + """Predict uing GLM with feature matrix X. + If sample_weight is given, returns prediction*sample_weight. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Samples. + + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) + + Returns + ------- + C : array, shape (n_samples,) + Returns predicted values times sample_weight. + """ + # TODO: Is copy=True necessary? + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype='numeric', copy=True, ensure_2d=True, + allow_nd=False) + eta = self.linear_predictor(X) + mu = self._link_instance.inverse(eta) + weights = _check_weights(sample_weight, X.shape[0]) + + return mu*weights + + def estimate_phi(self, X, y, sample_weight=None): + """Estimate/fit the dispersion parameter phi. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + + y : array-like, shape (n_samples,) + Target values. + + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) + Sample weights. + + Returns + ------- + phi : float + Dispersion parameter. + """ + check_is_fitted(self, "coef_") + _dtype = [np.float64, np.float32] + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=_dtype, y_numeric=True, multi_output=False) + n_samples, n_features = X.shape + weights = _check_weights(sample_weight, n_samples) + eta = X @ self.coef_ + if self.fit_intercept is True: + eta += self.intercept_ + n_features += 1 + if n_samples <= n_features: + raise ValueError("Estimation of dispersion parameter phi requires" + " more samples than features, got" + " samples=X.shape[0]={0} and" + " n_features=X.shape[1]+fit_intercept={1}." 
+ .format(n_samples, n_features)) + mu = self._link_instance.inverse(eta) + if self.fit_dispersion == 'chisqr': + chisq = np.sum(weights*(y-mu)**2 / + self._family_instance.unit_variance(mu)) + return chisq/(n_samples - n_features) + elif self.fit_dispersion == 'deviance': + dev = self._family_instance.deviance(y, mu, weights) + return dev/(n_samples - n_features) + + # Note: check_estimator(GeneralizedLinearRegressor) might raise + # "AssertionError: -0.28014056555724598 not greater than 0.5" + # unless GeneralizedLinearRegressor has a score which passes the test. + def score(self, X, y, sample_weight=None): + """Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 deviance. Note that those two are equal + for family='normal'. + + D^2 is defined as + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Test samples. + + y : array-like, shape (n_samples,) + True values of target. + + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) w.r.t. y. + """ + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + weights = _check_weights(sample_weight, y.shape[0]) + mu = self.predict(X) + dev = self._family_instance.deviance(y, mu, weights=weights) + y_mean = np.average(y, weights=weights) + dev_null = self._family_instance.deviance(y, y_mean, weights=weights) + return 1. 
- dev / dev_null diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py new file mode 100644 index 0000000000000..7cb3f4a5b5969 --- /dev/null +++ b/sklearn/linear_model/tests/test_glm.py @@ -0,0 +1,640 @@ +import numpy as np +from numpy.testing import assert_allclose +import pytest +import scipy as sp +from scipy import linalg, optimize, sparse + +from sklearn.datasets import make_classification, make_regression +from sklearn.linear_model.glm import ( + Link, + IdentityLink, + LogLink, + LogitLink, + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, + GeneralizedHyperbolicSecant, BinomialDistribution, + GeneralizedLinearRegressor) +from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge + +from sklearn.utils.testing import ( + assert_equal, assert_almost_equal, + assert_array_equal, assert_array_almost_equal) + + +@pytest.mark.parametrize('link', Link.__subclasses__()) +def test_link_properties(link): + """Test link inverse and derivative.""" + rng = np.random.RandomState(42) + x = rng.rand(100)*100 + link = link() # instatiate object + decimal = 10 + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + decimal = 8 + assert_almost_equal(link.link(link.inverse(x)), x, decimal=decimal) + # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + assert_almost_equal(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x), decimal=decimal) + # for LogitLink, in the following x should be between 0 and 1. + # assert_almost_equal(link.inverse_derivative(link.link(x)), + # 1./link.derivative(x), decimal=decimal) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" + result = family.in_y_range([-1, 0, 1]) + assert_array_equal(result, expected) + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5]), + (GeneralizedHyperbolicSecant(), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_almost_equal(family.deviance(x, x), 0, decimal=10) + + +@pytest.mark.parametrize( + 'family, link', + [(NormalDistribution(), IdentityLink()), + (PoissonDistribution(), LogLink()), + (GammaDistribution(), LogLink()), + (InverseGaussianDistribution(), LogLink()), + (TweedieDistribution(power=1.5), LogLink()), + (TweedieDistribution(power=4.5), LogLink())]) +def test_fisher_matrix(family, link): + """Test the Fisher matrix numerically. 
+ Trick: Use numerical differentiation with y = mu""" + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + rng = np.random.RandomState(42) + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) + assert_allclose(fisher, approx, rtol=1e-3) + + +def test_sample_weights_validation(): + """Test the raised errors in the validation of sample_weight.""" + # 1. scalar value but not positive + X = [[1]] + y = [1] + weights = 0 + glm = GeneralizedLinearRegressor(fit_intercept=False) + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 2. 2d array + weights = [[0]] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 3. 1d but wrong length + weights = [1, 0] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 4. 1d but only zeros (sum not greater than 0) + weights = [0, 0] + X = [[0], [1]] + y = [1, 2] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + # 5. 1d but weith a negative value + weights = [2, -1] + with pytest.raises(ValueError): + glm.fit(X, y, weights) + + +@pytest.mark.parametrize('f, fam', + [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution()), + ('binomial', BinomialDistribution())]) +def test_glm_family_argument(f, fam): + """Test GLM family argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + assert_equal(type(glm._family_instance), type(fam)) + + glm = GeneralizedLinearRegressor(family='not a family', + fit_intercept=False) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('l, link', + [('identity', IdentityLink()), + ('log', LogLink()), + ('logit', LogitLink())]) +def test_glm_link_argument(l, link): + """Test GLM link argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) + assert_equal(type(glm._link_instance), type(link)) + + glm = GeneralizedLinearRegressor(family='normal', link='not a link') + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('alpha', ['not a number', -4.2]) +def test_glm_alpha_argument(alpha): + """Test GLM for invalid alpha argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('l1_ratio', ['not a number', -4.2, 1.1, [1]]) +def test_glm_l1_ratio_argument(l1_ratio): + """Test GLM for invalid l1_ratio argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3], + [-1]]) +def test_glm_P1_argument(P1): + """Test GLM for invalid P1 argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = 
GeneralizedLinearRegressor(P1=P1, l1_ratio=0.5, check_input=True) + with pytest.raises((ValueError, TypeError)): + glm.fit(X, y) + + +@pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], + sparse.csr_matrix([1, 2, 3]), [-1]]) +def test_glm_P2_argument(P2): + """Test GLM for invalid P2 argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P2=P2, check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + +def test_glm_P2_positive_semidefinite(): + """Test GLM for a positive semi-definite P2 argument.""" + n_samples, n_features = 10, 5 + y = np.arange(n_samples) + X = np.zeros((n_samples, n_features)) + P2 = np.diag([100, 10, 5, 0, -1E-5]) + rng = np.random.RandomState(42) + # construct random orthogonal matrix Q + Q, R = linalg.qr(rng.randn(n_features, n_features)) + P2 = Q.T @ P2 @ Q + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + P2 = sparse.csr_matrix(P2) + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) +def test_glm_fit_intercept_argument(fit_intercept): + """Test GLM for invalid fit_intercept argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('solver, l1_ratio', + [('not a solver', 0), (1, 0), ([1], 0), + ('irls', 0.5), ('lbfgs', 0.5), ('newton-cg', 0.5)]) +def test_glm_solver_argument(solver, l1_ratio): + """Test GLM for invalid solver argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(solver=solver, l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]]) +def test_glm_max_iter_argument(max_iter): + """Test GLM for invalid max_iter argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(max_iter=max_iter) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]]) +def test_glm_tol_argument(tol): + """Test GLM for invalid tol argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(tol=tol) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]]) +def test_glm_warm_start_argument(warm_start): + """Test GLM for invalid warm_start argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(warm_start=warm_start) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('start_params', + ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]) +def test_glm_start_params_argument(start_params): + """Test GLM for invalid start_params argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(start_params=start_params) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('selection', ['not a selection', 1, 0, ['cyclic']]) +def test_glm_selection_argument(selection): + """Test GLM for invalid selection argument""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(selection=selection) + with pytest.raises(ValueError): + 
glm.fit(X, y) + + +@pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) +def test_glm_random_state_argument(random_state): + """Test GLM for invalid random_state argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(random_state=random_state) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('diag_fisher', ['not bool', 1, 0, [True]]) +def test_glm_diag_fisher_argument(diag_fisher): + """Test GLM for invalid diag_fisher arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) +def test_glm_copy_X_argument(copy_X): + """Test GLM for invalid copy_X arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(copy_X=copy_X) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('check_input', ['not bool', 1, 0, [True]]) +def test_glm_check_input_argument(check_input): + """Test GLM for invalid check_input argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(check_input=check_input) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +def test_glm_identiy_regression(solver): + """Test GLM regression with identity link on a simple dataset.""" + coef = [1, 2] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T + y = np.dot(X, coef) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=False, solver=solver, + start_params='zero', tol=1e-7) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), PoissonDistribution(), + GammaDistribution(), InverseGaussianDistribution(), + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), + GeneralizedHyperbolicSecant()]) +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 6), + ('lbfgs', 1e-6, 6), + ('newton-cg', 1e-7, 6), + ('cd', 1e-7, 6)]) +def test_glm_log_regression(family, solver, tol, dec): + """Test GLM regression with log link on a simple dataset.""" + coef = [0.2, -0.1] + X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T + y = np.exp(np.dot(X, coef)) + glm = GeneralizedLinearRegressor( + alpha=0, family=family, link='log', fit_intercept=False, + solver=solver, start_params='guess', tol=tol) + res = glm.fit(X, y) + assert_array_almost_equal(res.coef_, coef, decimal=dec) + + +@pytest.mark.filterwarnings('ignore::DeprecationWarning') +@pytest.mark.parametrize('solver, tol, dec', [('irls', 1e-6, 5), + ('lbfgs', 1e-6, 5), + ('newton-cg', 1e-5, 5), + ('cd', 1e-6, 6)]) +def test_normal_ridge(solver, tol, dec): + """Test ridge regression for Normal distributions. + + Compare to test_ridge in test_ridge.py. + """ + rng = np.random.RandomState(42) + alpha = 1.0 + + # 1. 
With more samples than features + n_samples, n_features, n_predict = 100, 7, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, + solver='svd', normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, + solver='svd', normalize=False) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol, max_iter=100, solver=solver, + check_input=False, random_state=rng, + fit_dispersion='chisqr') + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + mu = glm.predict(X) + assert_almost_equal(glm.dispersion_, + np.sum((y-mu)**2/(n_samples-n_features))) + + # 2. With more features than samples and sparse + n_samples, n_features, n_predict = 10, 100, 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=rng) + y = y[0:n_samples] + X, T = X[0:n_samples], X[n_samples:] + + # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, + solver='sag', normalize=False, max_iter=100000) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=True, + tol=tol, max_iter=300, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec) + + ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, + solver='sag', normalize=False, max_iter=1000) + ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + link='identity', fit_intercept=False, + tol=tol*2, max_iter=300, solver=solver, + check_input=False, random_state=rng) + glm.fit(X, y) + assert_equal(glm.coef_.shape, (X.shape[1], )) + assert_array_almost_equal(glm.coef_, ridge.coef_, decimal=dec-1) + assert_almost_equal(glm.intercept_, ridge.intercept_, decimal=dec-1) + assert_array_almost_equal(glm.predict(T), ridge.predict(T), decimal=dec-2) + + +@pytest.mark.parametrize('solver, tol, dec', + [('irls', 1e-7, 6), + ('lbfgs', 1e-7, 5), + ('newton-cg', 1e-7, 5), + ('cd', 1e-7, 7)]) +def test_poisson_ridge(solver, tol, dec): + """Test ridge regression with poisson family and LogLink. 
+ + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.12889386979 + # a 0.29019207995 + # b 0.03741173122 + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + rng = np.random.RandomState(42) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + fit_intercept=True, family='poisson', + link='log', tol=tol, + solver=solver, max_iter=300, + random_state=rng) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, -0.12889386979, + decimal=dec) + assert_array_almost_equal(glm.coef_, [0.29019207995, 0.03741173122], + decimal=dec) + + +@pytest.mark.parametrize('diag_fisher', [False, True]) +def test_normal_enet(diag_fisher): + """Test elastic net regression with normal/gaussian family.""" + alpha, l1_ratio = 0.3, 0.7 + n_samples, n_features = 20, 2 + rng = np.random.RandomState(42) + X = rng.randn(n_samples, n_features).copy(order='F') + beta = rng.randn(n_features) + y = 2 + np.dot(X, beta) + rng.randn(n_samples) + + # 1. test normal enet on dense data + glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, + family='normal', link='identity', + fit_intercept=True, tol=1e-8, + max_iter=100, selection='cyclic', + solver='cd', start_params='zero', + check_input=False, + diag_fisher=diag_fisher) + glm.fit(X, y) + + enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, + normalize=False, tol=1e-8, copy_X=True) + enet.fit(X, y) + + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + + # 2. test normal enet on sparse data + X = sparse.csc_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, enet.intercept_, decimal=7) + assert_array_almost_equal(glm.coef_, enet.coef_, decimal=7) + + +def test_poisson_enet(): + """Test elastic net regression with poisson family and LogLink. + + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0.5, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.03550978409 + # a 0.16936423283 + # b . + glmnet_intercept = -0.03550978409 + glmnet_coef = [0.16936423283, 0.] 
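The glmnet reference values above are checked further below against directly minimizing 1/(2*N) * deviance + 0.5 * alpha * (1 - l1_ratio) * ||w||_2^2 + alpha * l1_ratio * ||w||_1. As a standalone sketch of the Poisson deviance entering that objective (plain NumPy, using the usual convention that a y=0 observation contributes 2*mu):

    import numpy as np

    def poisson_deviance(y, mu):
        # 2 * sum(y * log(y / mu) - (y - mu)), with the y = 0 term taken as 0
        safe_y = np.where(y > 0, y, 1.0)
        term = np.where(y > 0, y * np.log(safe_y / mu), 0.0)
        return 2.0 * np.sum(term - (y - mu))

    y = np.array([0., 1., 1., 2.])
    mu = np.full_like(y, y.mean())
    print(poisson_deviance(y, mu))   # deviance of an intercept-only fit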
+ X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + rng = np.random.RandomState(42) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-8, + selection='random', random_state=rng, + start_params='guess') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=7) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=7) + + # test results with general optimization procedure + def obj(coef): + pd = PoissonDistribution() + link = LogLink() + N = y.shape[0] + mu = link.inverse(X @ coef[1:] + coef[0]) + alpha, l1_ratio = (1, 0.5) + return 1./(2.*N) * pd.deviance(y, mu) \ + + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ + + alpha * l1_ratio * np.sum(np.abs(coef[1:])) + res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, + options={'maxiter': 1000, 'disp': False}) + assert_almost_equal(glm.intercept_, res.x[0], decimal=5) + assert_almost_equal(glm.coef_, res.x[1:], decimal=5) + assert_almost_equal(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, decimal=8) + + # same for start_params='zero' and selection='cyclic' + # with reduced precision + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-5, + selection='cyclic', start_params='zero') + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + # check warm_start, therefore start with different alpha + glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, + family='poisson', max_iter=300, + link='log', solver='cd', tol=1e-5, + selection='cyclic', start_params='zero') + glm.fit(X, y) + # warm start with original alpha and use of sparse matrices + glm.warm_start = True + glm.alpha = 1 + X = sparse.csr_matrix(X) + glm.fit(X, y) + assert_almost_equal(glm.intercept_, glmnet_intercept, decimal=4) + assert_array_almost_equal(glm.coef_, glmnet_coef, decimal=4) + + +@pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) +def test_binomial_enet(alpha): + """Test elastic net regression with binomial family and LogitLink. + + Compare to LogisticRegression. 
+ """ + l1_ratio = 0.5 + n_samples = 500 + rng = np.random.RandomState(42) + X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, + n_informative=5, n_redundant=0, n_repeated=0, + random_state=rng) + log = LogisticRegression( + penalty='elasticnet', random_state=rng, fit_intercept=False, tol=1e-6, + max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), + solver='saga') + log.fit(X, y) + glm = GeneralizedLinearRegressor( + family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, + alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', + tol=1e-7) + glm.fit(X, y) + assert_almost_equal(log.intercept_[0], glm.intercept_, decimal=6) + assert_array_almost_equal(log.coef_[0, :], glm.coef_, decimal=6) From 9b574bdc0b3a4a1081f73563d0468c864fcbbb22 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 11 Jun 2019 07:30:52 -0500 Subject: [PATCH 058/269] Fix docstrings for the new print_changed_only=True by default --- doc/modules/linear_model.rst | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 52f28346cc047..b9f0e96734ae5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -942,17 +942,11 @@ follows: >>> from sklearn.linear_model import GeneralizedLinearRegressor >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') - >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) # doctest: +NORMALIZE_WHITESPACE - GeneralizedLinearRegressor(P1='identity', P2='identity', alpha=0.5, - check_input=True, copy_X=True, diag_fisher=False, - family='poisson', fit_dispersion=None, - fit_intercept=True, l1_ratio=0, link='log', - max_iter=100, random_state=None, selection='cyclic', - solver='auto', start_params='guess', tol=0.0001, - verbose=0, warm_start=False) - >>> reg.coef_ # doctest: +NORMALIZE_WHITESPACE + >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) + GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> reg.coef_ array([0.24630169, 0.43373464]) - >>> reg.intercept_ #doctest: +ELLIPSIS + >>> reg.intercept_ -0.76383633... From 90299fdd8a4745965eb8f9dbe26ddb685f1531f1 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 12 Jun 2019 04:31:50 -0500 Subject: [PATCH 059/269] Increase coverage --- sklearn/linear_model/_glm.py | 22 +++----- sklearn/linear_model/tests/test_glm.py | 78 ++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 8152e84ac7253..055aacf26d747 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -182,7 +182,7 @@ def link(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - raise NotImplementedError + pass @abstractmethod def derivative(self, mu): @@ -193,7 +193,7 @@ def derivative(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - raise NotImplementedError + pass @abstractmethod def inverse(self, lin_pred): @@ -207,7 +207,7 @@ def inverse(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - raise NotImplementedError + pass @abstractmethod def inverse_derivative(self, lin_pred): @@ -218,7 +218,7 @@ def inverse_derivative(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. 
""" - raise NotImplementedError + pass @abstractmethod def inverse_derivative2(self, lin_pred): @@ -229,7 +229,7 @@ def inverse_derivative2(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - raise NotImplementedError + pass class IdentityLink(Link): @@ -400,7 +400,7 @@ def unit_variance(self, mu): mu : array, shape (n_samples,) Predicted mean. """ - raise NotImplementedError() + pass @abstractmethod def unit_variance_derivative(self, mu): @@ -413,7 +413,7 @@ def unit_variance_derivative(self, mu): mu : array, shape (n_samples,) Target values. """ - raise NotImplementedError() + pass def variance(self, mu, phi=1, weights=1): r"""Compute the variance function. @@ -473,7 +473,7 @@ def unit_deviance(self, y, mu): mu : array, shape (n_samples,) Predicted mean. """ - raise NotImplementedError() + pass def unit_deviance_derivative(self, y, mu): r"""Compute the derivative of the unit deviance w.r.t. mu. @@ -1079,7 +1079,6 @@ def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, B[idx:, idx:] += P2.toarray() else: B[idx:, idx:] += P2 - # A = -score + coef_P2 A = -score A[idx:] += coef_P2 # A += d @ (H+P2) but so far d=0 @@ -1302,9 +1301,6 @@ def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], order='F', copy=copy_X) if sparse.issparse(X): - if not sparse.isspmatrix_csc(X): - raise ValueError("If X is sparse, it must be in csc format" - "; got (format={})".format(X.format)) if not sparse.isspmatrix_csc(P2): raise ValueError("If X is sparse, P2 must also be sparse csc" "format. Got P2 not sparse.") @@ -2021,7 +2017,7 @@ def fit(self, X, y, sample_weight=None): _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, max_inner_iter=1000, selection=self.selection, - random_state=self.random_state, + random_state=random_state, diag_fisher=self.diag_fisher) coef += d # for simplicity no line search here else: diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index d42a8739f6aa0..230bbdabae201 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -21,10 +21,20 @@ GeneralizedHyperbolicSecant, BinomialDistribution, ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge +from sklearn.metrics import mean_absolute_error from sklearn.utils.testing import assert_array_equal +@pytest.fixture(scope="module") +def regression_data(): + X, y = make_regression(n_samples=107, + n_features=10, + n_informative=80, noise=0.5, + random_state=2) + return X, y + + @pytest.mark.parametrize('link', Link.__subclasses__()) def test_link_properties(link): """Test link inverse and derivative.""" @@ -39,6 +49,10 @@ def test_link_properties(link): # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) assert_allclose(link.derivative(link.inverse(x)), 1./link.inverse_derivative(x)) + + assert ( + link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) + # for LogitLink, in the following x should be between 0 and 1. # assert_almost_equal(link.inverse_derivative(link.link(x)), # 1./link.derivative(x), decimal=decimal) @@ -108,7 +122,7 @@ def f(coef): def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" - # 1. scalar value but not positive + # scalar value but not positive X = [[1]] y = [1] weights = 0 @@ -116,17 +130,20 @@ def test_sample_weights_validation(): with pytest.raises(ValueError): glm.fit(X, y, weights) - # 2. 
2d array + # Positive weights are accepted + glm.fit(X, y, sample_weight=1) + + # 2d array weights = [[0]] with pytest.raises(ValueError): glm.fit(X, y, weights) - # 3. 1d but wrong length + # 1d but wrong length weights = [1, 0] with pytest.raises(ValueError): glm.fit(X, y, weights) - # 4. 1d but only zeros (sum not greater than 0) + # 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] @@ -643,3 +660,56 @@ def test_binomial_enet(alpha): glm.fit(X, y) assert_allclose(log.intercept_[0], glm.intercept_, rtol=1e-6) assert_allclose(log.coef_[0, :], glm.coef_, rtol=5e-6) + + +@pytest.mark.parametrize( + "params", + [ + {"solver": "irls", "start_params": "guess"}, + {"solver": "irls", "start_params": "zero"}, + {"solver": "lbfgs", "start_params": "guess"}, + {"solver": "lbfgs", "start_params": "zero"}, + {"solver": "newton-cg"}, + {"solver": "cd", "selection": "cyclic", "diag_fisher": False}, + {"solver": "cd", "selection": "cyclic", "diag_fisher": True}, + {"solver": "cd", "selection": "random", "diag_fisher": False}, + ], + ids=lambda params: ', '.join("%s=%s" % (key, val) + for key, val in params.items()) +) +def test_solver_equivalence(params, regression_data): + X, y = regression_data + est_ref = GeneralizedLinearRegressor(random_state=2) + est_ref.fit(X, y) + + estimator = GeneralizedLinearRegressor(**params) + estimator.set_params(random_state=2) + + estimator.fit(X, y) + + assert_allclose(estimator.intercept_, est_ref.intercept_, rtol=1e-4) + assert_allclose(estimator.coef_, est_ref.coef_, rtol=1e-4) + assert_allclose( + mean_absolute_error(estimator.predict(X), y), + mean_absolute_error(est_ref.predict(X), y), + rtol=1e-4 + ) + + +def test_fit_dispersion(regression_data): + X, y = regression_data + + est1 = GeneralizedLinearRegressor(random_state=2) + est1.fit(X, y) + assert not hasattr(est1, "dispersion_") + + est2 = GeneralizedLinearRegressor(random_state=2, fit_dispersion="chisqr") + est2.fit(X, y) + assert isinstance(est2.dispersion_, float) + + est3 = GeneralizedLinearRegressor( + random_state=2, fit_dispersion="deviance") + est3.fit(X, y) + assert isinstance(est3.dispersion_, float) + + assert_allclose(est2.dispersion_, est3.dispersion_) From e3a5a9aa6855928d89bdc5dad7f3a46ba934fb8e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 12 Jun 2019 12:00:28 -0500 Subject: [PATCH 060/269] More tests and addressing some review comments --- sklearn/linear_model/_glm.py | 51 +++++++++++++------------- sklearn/linear_model/tests/test_glm.py | 43 +++++++++++++++++++++- 2 files changed, 67 insertions(+), 27 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 055aacf26d747..acce438b2b5e2 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -6,8 +6,6 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Should the option `normalize` be included (like other linear models)? -# So far, it is not included. User must pass a normalized X. # TODO: Add cross validation support, e.g. GCV? # TODO: Should GeneralizedLinearRegressor inherit from LinearModel? # So far, it does not. @@ -287,7 +285,6 @@ def inverse_derivative(self, lin_pred): return ep * (1. - ep) def inverse_derivative2(self, lin_pred): - ep = special.expit(lin_pred) ep = special.expit(lin_pred) return ep * (1. - ep) * (1. 
- 2 * ep) @@ -738,7 +735,19 @@ class TweedieDistribution(ExponentialDispersionModel): For ``0 0) - 1, pgtol=self.tol, maxiter=self.max_iter, factr=1e3) - if self.verbose > 0: - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}" - .format(info["task"])) + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) self.n_iter_ = info['nit'] # 4.3 Newton-CG ####################################################### diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 230bbdabae201..7229f21840829 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -22,6 +22,7 @@ ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.metrics import mean_absolute_error +from sklearn.exceptions import ConvergenceWarning from sklearn.utils.testing import assert_array_equal @@ -72,6 +73,23 @@ def test_family_bounds(family, expected): assert_array_equal(result, expected) +def test_tweedie_distribution_power(): + with pytest.raises(ValueError, match="no distribution exists"): + TweedieDistribution(power=0.5) + + with pytest.raises(TypeError, match="must be a real number"): + TweedieDistribution(power=1j) + + with pytest.raises(TypeError, match="must be a real number"): + dist = TweedieDistribution() + dist.power = 1j + + dist = TweedieDistribution() + assert dist._include_lower_bound is False + dist.power = 1 + assert dist._include_lower_bound is True + + @pytest.mark.parametrize( 'family, chk_values', [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), @@ -97,7 +115,8 @@ def test_deviance_zero(family, chk_values): (GammaDistribution(), LogLink()), (InverseGaussianDistribution(), LogLink()), (TweedieDistribution(power=1.5), LogLink()), - (TweedieDistribution(power=4.5), LogLink())]) + (TweedieDistribution(power=4.5), LogLink())], + ids=lambda args: args.__class__.__name__) def test_fisher_matrix(family, link): """Test the Fisher matrix numerically. 
Trick: Use numerical differentiation with y = mu""" @@ -110,6 +129,11 @@ def test_fisher_matrix(family, link): weights = rng.randn(10)**2 + 1 fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, weights=weights, link=link) + # check that the Fisher matrix is square and positive definite + assert fisher.ndim == 2 + assert fisher.shape[0] == fisher.shape[1] + assert np.all(np.linalg.eigvals(fisher) >= 0) + approx = np.array([]).reshape(0, coef.shape[0]) for i in range(coef.shape[0]): def f(coef): @@ -119,6 +143,13 @@ def f(coef): [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) assert_allclose(fisher, approx, rtol=1e-3) + # check the observed information matrix + oim = family._observed_information(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + assert oim.ndim == 2 + assert oim.shape == fisher.shape + assert_allclose(oim, fisher) + def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" @@ -713,3 +744,13 @@ def test_fit_dispersion(regression_data): assert isinstance(est3.dispersion_, float) assert_allclose(est2.dispersion_, est3.dispersion_) + + +@pytest.mark.parametrize("solver", ["irls", "lbfgs", "newton-cg", "cd"]) +def test_convergence_warning(solver, regression_data): + X, y = regression_data + + est = GeneralizedLinearRegressor(solver=solver, random_state=2, + max_iter=1, tol=1e-20) + with pytest.warns(ConvergenceWarning): + est.fit(X, y) From 54b80b8db780fbd3b438c6d5dc0001cdafe7a47e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 13 Jun 2019 09:50:53 -0500 Subject: [PATCH 061/269] TST More specific checks of error messages in tests --- sklearn/linear_model/_glm.py | 6 ++-- sklearn/linear_model/tests/test_glm.py | 44 ++++++++++++++------------ 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index acce438b2b5e2..c4f8cf7a975d3 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -1941,8 +1941,10 @@ def fit(self, X, y, sample_weight=None): k = np.min([10, n_features // 10 + 1]) sigma = 0 # start searching near this value which = 'SA' # find smallest algebraic eigenvalues first - if not np.all(splinalg.eigsh(P2, k=k, sigma=sigma, - which=which) >= epsneg): + eigenvalues = splinalg.eigsh(P2, k=k, sigma=sigma, + which=which, + return_eigenvectors=False) + if not np.all(eigenvalues >= epsneg): raise ValueError("P2 must be positive semi-definite.") else: if not np.all(linalg.eigvalsh(P2) >= epsneg): diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 7229f21840829..de1a5262b36ce 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -158,7 +158,7 @@ def test_sample_weights_validation(): y = [1] weights = 0 glm = GeneralizedLinearRegressor(fit_intercept=False) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="weights must be non-negative"): glm.fit(X, y, weights) # Positive weights are accepted @@ -166,24 +166,26 @@ def test_sample_weights_validation(): # 2d array weights = [[0]] - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="must be 1D array or scalar"): glm.fit(X, y, weights) # 1d but wrong length weights = [1, 0] - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="weights must have the same length as y"): glm.fit(X, y, weights) # 1d but only zeros (sum not greater than 0) weights = [0, 0] X = [[0], [1]] y = [1, 2] - with 
pytest.raises(ValueError): + with pytest.raises(ValueError, + match="must have at least one positive element"): glm.fit(X, y, weights) # 5. 1d but with a negative value weights = [2, -1] - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="weights must be non-negative"): glm.fit(X, y, weights) @@ -202,7 +204,7 @@ def test_glm_family_argument(f, fam): glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="family must be"): glm.fit(X, y) @@ -218,7 +220,7 @@ def test_glm_link_argument(l, link): assert isinstance(glm._link_instance, link.__class__) glm = GeneralizedLinearRegressor(family='normal', link='not a link') - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="link must be"): glm.fit(X, y) @@ -228,7 +230,8 @@ def test_glm_alpha_argument(alpha): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="Penalty term must be a non-negative"): glm.fit(X, y) @@ -238,7 +241,8 @@ def test_glm_l1_ratio_argument(l1_ratio): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="l1_ratio must be a number in interval.*0, 1"): glm.fit(X, y) @@ -276,13 +280,13 @@ def test_glm_P2_positive_semidefinite(): P2 = Q.T @ P2 @ Q glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, check_input=True) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="P2 must be positive semi-definite"): glm.fit(X, y) P2 = sparse.csr_matrix(P2) glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, check_input=True) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="P2 must be positive semi-definite"): glm.fit(X, y) @@ -292,7 +296,7 @@ def test_glm_fit_intercept_argument(fit_intercept): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="fit_intercept must be bool"): glm.fit(X, y) @@ -314,7 +318,7 @@ def test_glm_max_iter_argument(max_iter): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(max_iter=max_iter) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="must be a positive integer"): glm.fit(X, y) @@ -324,7 +328,7 @@ def test_glm_tol_argument(tol): y = np.array([1, 2]) X = np.array([[1], [2]]) glm = GeneralizedLinearRegressor(tol=tol) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="stopping criteria must be positive"): glm.fit(X, y) @@ -334,7 +338,7 @@ def test_glm_warm_start_argument(warm_start): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(warm_start=warm_start) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="warm_start must be bool"): glm.fit(X, y) @@ -356,7 +360,7 @@ def test_glm_selection_argument(selection): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(selection=selection) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="argument selection must be"): glm.fit(X, y) @@ -366,7 +370,7 @@ def test_glm_random_state_argument(random_state): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(random_state=random_state) - with 
pytest.raises(ValueError): + with pytest.raises(ValueError, match="cannot be used to seed"): glm.fit(X, y) @@ -376,7 +380,7 @@ def test_glm_diag_fisher_argument(diag_fisher): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="diag_fisher must be bool"): glm.fit(X, y) @@ -386,7 +390,7 @@ def test_glm_copy_X_argument(copy_X): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(copy_X=copy_X) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="copy_X must be bool"): glm.fit(X, y) @@ -396,7 +400,7 @@ def test_glm_check_input_argument(check_input): y = np.array([1, 2]) X = np.array([[1], [1]]) glm = GeneralizedLinearRegressor(check_input=check_input) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="check_input must be bool"): glm.fit(X, y) From 7db0320f460676d8bed8e01bf64657a89532e2cb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 14 Jun 2019 10:44:16 -0500 Subject: [PATCH 062/269] Add PoissonRegressor alias --- sklearn/linear_model/__init__.py | 5 +- sklearn/linear_model/_glm.py | 163 +++++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index e5ede64413eb5..121418f901a1a 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -19,7 +19,7 @@ MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) from ._glm import (TweedieDistribution, - GeneralizedLinearRegressor) + GeneralizedLinearRegressor, PoissonRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -82,4 +82,5 @@ 'ridge_regression', 'RANSACRegressor', 'GeneralizedLinearRegressor', - 'TweedieDistribution'] + 'TweedieDistribution', + 'PoissonRegressor'] diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index c4f8cf7a975d3..b18731e73f328 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -2326,3 +2326,166 @@ def score(self, X, y, sample_weight=None): y_mean = np.average(y, weights=weights) dev_null = self._family_instance.deviance(y, y_mean, weights=weights) return 1. - dev / dev_null + + def _more_tags(self): + return {"requires_positive_y": True} + + +class PoissonRegressor(GeneralizedLinearRegressor): + """Regression with the response variable y following a Poisson distribution + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). + The fit minimizes the following objective function with L2 regularization:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms and thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. 
bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statistic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'irls', 'lbfgs', 'newton-cg'}, optional (default='irls') + Algorithm to use in the optimization problem: + + 'irls' + Iterated reweighted least squares. It is the standard algorithm + for GLMs. + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cg' + Newton conjugate gradient algorithm. + + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. + + max_iter : int, optional (default=100) + The maximal number of iterations for solver algorithms. + + tol : float, optional (default=1e-4) + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative) of + the objective function. + + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to ``fit``), option ``start_params`` sets the + start values for ``coef_`` and ``intercept_``. + + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'guess' + Start values of mu are calculated by family.starting_mu(..). Then, + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step. This gives usually good + starting values. + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the weighted average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. + + random_state : {int, RandomState instance, None}, optional (default=None) + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + + copy_X : boolean, optional, (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. + + n_iter_ : int + Actual number of iterations used in solver. 
+ + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. + + For alpha > 0, the feature matrix X should be standardized in order to + penalize features equally strong. + + If the target y is a ratio, appropriate sample weights s should be + provided. + As an example, consider Poisson distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``. + The weights are necessary for the right (finite sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distributions. + + References + ---------- + For the coordinate descent implementation: + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, + solver='irls', max_iter=100, + tol=1e-4, warm_start=False, start_params='guess', + random_state=None, copy_X=True, check_input=True, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family="poisson", link='log', + fit_dispersion=fit_dispersion, solver=solver, + max_iter=max_iter, tol=tol, warm_start=warm_start, + start_params=start_params, random_state=random_state, + copy_X=copy_X, verbose=verbose) From dcfe9edaf83509d2b6d98ee840c28f674ea4f496 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 27 Jun 2019 08:41:04 -0500 Subject: [PATCH 063/269] TST Simplify comparison with ridge --- sklearn/linear_model/tests/test_glm.py | 94 ++++++-------------------- 1 file changed, 22 insertions(+), 72 deletions(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index de1a5262b36ce..1416bdcfad680 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -26,6 +26,8 @@ from sklearn.utils.testing import assert_array_equal +GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg', 'cd'] + @pytest.fixture(scope="module") def regression_data(): @@ -404,7 +406,7 @@ def test_glm_check_input_argument(check_input): glm.fit(X, y) -@pytest.mark.parametrize('solver', ['irls', 'lbfgs', 'newton-cg', 'cd']) +@pytest.mark.parametrize('solver', GLM_SOLVERS) def test_glm_identity_regression(solver): """Test GLM regression with identity link on a simple dataset.""" coef = [1., 2.] 
@@ -442,97 +444,45 @@ def test_glm_log_regression(family, solver, tol): # newton-cg may issue a LineSearchWarning, which we filter out @pytest.mark.filterwarnings('ignore:The line search algorithm') @pytest.mark.filterwarnings('ignore:Line Search failed') -@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), - ('lbfgs', 1e-6), - ('newton-cg', 1e-6), - ('cd', 1e-6)]) -def test_normal_ridge(solver, tol): +@pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) +@pytest.mark.parametrize('fit_intercept', [True, False]) +@pytest.mark.parametrize('solver', GLM_SOLVERS) +def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): """Test ridge regression for Normal distributions. + Case n_samples >> n_features + Compare to test_ridge in test_ridge.py. """ - rng = np.random.RandomState(42) alpha = 1.0 - - # 1. With more samples than features - n_samples, n_features, n_predict = 100, 7, 10 + n_predict = 10 X, y, coef = make_regression(n_samples=n_samples+n_predict, n_features=n_features, n_informative=n_features-2, noise=0.5, - coef=True, random_state=rng) + coef=True, random_state=42) y = y[0:n_samples] X, T = X[0:n_samples], X[n_samples:] - # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-6, - solver='svd', normalize=False) - ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=True, - tol=tol, max_iter=100, solver=solver, - check_input=False, random_state=rng) - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-6) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) - - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-6, - solver='svd', normalize=False) - ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=False, - tol=tol, max_iter=100, solver=solver, - check_input=False, random_state=rng, - fit_dispersion='chisqr') - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-5) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-6) - mu = glm.predict(X) - assert_allclose(glm.dispersion_, - np.sum((y-mu)**2/(n_samples-n_features))) - - # 2. 
With more features than samples and sparse - n_samples, n_features, n_predict = 10, 100, 10 - X, y, coef = make_regression(n_samples=n_samples+n_predict, - n_features=n_features, - n_informative=n_features-2, noise=0.5, - coef=True, random_state=rng) - y = y[0:n_samples] - X, T = X[0:n_samples], X[n_samples:] + if n_samples > n_features: + ridge_params = {"solver": "svd"} + else: + ridge_params = {"solver": "sag", "max_iter": 10000, "tol": 1e-9} # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=True, tol=1e-9, - solver='sag', normalize=False, max_iter=100000, - random_state=42) + ridge = Ridge(alpha=alpha*n_samples, normalize=False, + random_state=42, **ridge_params) ridge.fit(X, y) + glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', link='identity', fit_intercept=True, - tol=tol, max_iter=300, solver=solver, - check_input=False, random_state=rng) + max_iter=300, solver=solver, tol=1e-6, + check_input=False, random_state=42) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6) assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) - ridge = Ridge(alpha=alpha*n_samples, fit_intercept=False, tol=1e-7, - solver='sag', normalize=False, max_iter=1000, - random_state=42) - ridge.fit(X, y) - - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', - link='identity', fit_intercept=False, - tol=tol*2, max_iter=300, solver=solver, - check_input=False, random_state=rng) - glm.fit(X, y) - assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=1e-4) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) - @pytest.mark.parametrize('solver, tol', [('irls', 1e-7), @@ -559,7 +509,7 @@ def test_poisson_ridge(solver, tol): rng = np.random.RandomState(42) glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, fit_intercept=True, family='poisson', - link='log', tol=tol, + link='log', tol=1e-7, solver=solver, max_iter=300, random_state=rng) glm.fit(X, y) @@ -750,7 +700,7 @@ def test_fit_dispersion(regression_data): assert_allclose(est2.dispersion_, est3.dispersion_) -@pytest.mark.parametrize("solver", ["irls", "lbfgs", "newton-cg", "cd"]) +@pytest.mark.parametrize("solver", GLM_SOLVERS) def test_convergence_warning(solver, regression_data): X, y = regression_data From 4879bb6e057a95c38ee4950d13ef2dbd98da0a19 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 28 Jun 2019 09:54:59 -0500 Subject: [PATCH 064/269] EXA Add plot_tweedie_regression_insurance_claims.py --- ...lot_tweedie_regression_insurance_claims.py | 500 ++++++++++++++++++ 1 file changed, 500 insertions(+) create mode 100644 examples/linear_model/plot_tweedie_regression_insurance_claims.py diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py new file mode 100644 index 0000000000000..cb2ff667e8379 --- /dev/null +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -0,0 +1,500 @@ +""" +====================================== +Tweedie regression on insurance claims +====================================== + +This example illustrate the use Poisson, Gamma and Tweedie regression +on the French Motor Third-Party Liability Claims dataset, and is inspired +by an R tutorial [1]. 
+ +Insurance claims data consist of the number of claims and the total claim +amount. Often, the final goal is to predict the expected value, i.e. the mean, +of the total claim amount. There are several possibilities to do that, two of +which are: + +1. Model the number of claims with a Poisson distribution, the average + claim amount as a Gamma distribution and multiply the predictions, to get + the total claim amount. +2. Model total claim amount directly, typically with a Tweedie distribution. + +In this example we will illustrate both approaches. We start by defining a few +helper functions for loading the data and visualizing results. + + +.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor + Third-Party Liability Claims (November 8, 2018). + `doi:10.2139/ssrn.3164764 `_ + +""" +print(__doc__) + +# Authors: Christian Lorentzen +# Roman Yurchak +# License: BSD 3 clause +from functools import partial + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import TweedieDistribution +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer + +from sklearn.metrics import mean_absolute_error + + +def load_mtpl2(n_samples=100000): + """Fetcher for French Motor Third-Party Liability Claims dataset + + Parameters + ---------- + n_samples: int, default=100000 + number of samples to select (for faster run time). + """ + + # Note: this should use the OpenML DataFrame fetcher in the future + df_freq = pd.read_csv( + "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", + dtype={"IDpol": np.int}, + index_col=0, + ) + + df_sev = pd.read_csv( + "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", + index_col=0, + ) + + # sum ClaimAmount over identical IDs + df_sev = df_sev.groupby(level=0).sum() + + df = df_freq.join(df_sev, how="left") + df["ClaimAmount"].fillna(0, inplace=True) + + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + return df.iloc[:n_samples] + + +def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, + title=None, kind_weight=None, ax=None): + """Plot observed and predicted - aggregated per feature level. 
+ + Parameters + ---------- + df : DataFrame with at least one column named feature + observed : str + a column name of the observed target + predicted : frame + a dataframe, with the same index as df, with the predicted target + weight : str + column name with the values of weights/exposure + """ + # aggregate observed and predicted variables by feature level + df_ = df.loc[:, [feature, weight]].copy() + df_["observed"] = df[observed] * df[weight] + df_["predicted"] = y_predicted * df[weight] + df_ = ( + df_.groupby([feature])[weight, "observed", "predicted"] + .sum() + .assign(observed=lambda x: x["observed"] / x[weight]) + .assign(predicted=lambda x: x["predicted"] / x[weight]) + ) + + ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax) + y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8 + ax.fill_between( + df_.index, + 0, + y_max * df_[weight] / df_[weight].values.max(), + color="g", + alpha=0.1, + ) + ax.set( + ylabel=y_label if y_label is not None else None, + title=title if title is not None else "Train: Observed vs Predicted", + ) + + +############################################################################## +# +# 1. Loading datasets and pre-processing +# -------------------------------------- +# +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``) with the freMTPL2sev table +# containing the claim amount (``ClaimAmount``) for the same user ids. + +df = load_mtpl2(n_samples=100000) + +# Note: filter out claims with zero amount, as the severity model +# requires a strictly positive target values. +df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 + +# correct for unreasonable observations (that might be data error) +df["ClaimNb"].clip(upper=4, inplace=True) +df["Exposure"].clip(upper=1, inplace=True) + +column_trans = ColumnTransformer( + [ + ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ( + "Veh_Brand_Gas_Region", + OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("BonusMalus", "passthrough", ["BonusMalus"]), + ( + "Density_log", + make_pipeline( + FunctionTransformer(np.log, validate=False), StandardScaler() + ), + ["Density"], + ), + ], + remainder="drop", +) +X = column_trans.fit_transform(df) + + +df["Frequency"] = df.ClaimNb / df.Exposure +df["AvgClaimAmount"] = df.ClaimAmount / np.fmax(df.ClaimNb, 1) + +print(df[df.ClaimAmount > 0].head()) + +############################################################################## +# +# 2. Frequency model -- Poisson distribution +# ------------------------------------------- +# +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occuring with a constant rate in a given time interval (``Exposure``). +# Here we model the frequency ``y = ClaimNb / Exposure``, +# which is still a (scaled) Poisson distribution. +# +# A very important property of the Poisson distribution is its mean-variance +# relation: The variance is proportional to the mean. 
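A quick numerical illustration of this mean-variance relation (a standalone sketch, not part of the example script): for simulated Poisson counts the empirical variance matches the empirical mean, and dividing by an exposure-like constant keeps the variance proportional to the mean.

    import numpy as np

    rng_check = np.random.RandomState(0)
    counts = rng_check.poisson(lam=3.0, size=100000)
    print(counts.mean(), counts.var())   # both close to 3.0
    freq = counts / 2.0                  # a "scaled" Poisson variable
    print(freq.mean(), freq.var())       # ~1.5 and ~0.75, variance still proportional to mean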
+ +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) + +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) + + +def mean_deviance(estimator, y, y_pred, weights): + if hasattr(estimator, "_family_instance"): + return estimator._family_instance.deviance(y, y_pred, weights) / len(y) + else: + return np.nan + + +def score_estimator( + estimator, X_train, X_test, df_train, df_test, target, weights +): + res = [] + + for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), + ]: + y, _weights = df[target], df[weights] + + for score_label, metric in [ + ("D² explaned", None), + ("mean deviance", partial(mean_deviance, estimator)), + ("mean abs. error", mean_absolute_error), + ]: + if estimator.__class__.__name__ == "ClaimProdEstimator": + # ClaimProdEstimator is the product of the frequency and + # severity models, together with a denormalized by the exposure + # values. It does not fully follow the scikit-learn API and we + # must handle it separately. + y_pred = estimator.predict(X, exposure=df.Exposure.values) + else: + y_pred = estimator.predict(X) + if metric is None: + if not hasattr(estimator, "score"): + continue + score = estimator.score(X, y, _weights) + else: + score = metric(y, y_pred, _weights) + + res.append( + {"subset": subset_label, "metric": score_label, "score": score} + ) + + res = ( + pd.DataFrame(res) + .set_index(["metric", "subset"]) + .score.unstack(-1) + .round(3) + ) + return res + + +scores = score_estimator( + glm_freq, + X_train, + X_test, + df_train, + df_test, + target="Frequency", + weights="Exposure", +) +print(scores) + +############################################################################## +# +# We can visually compare observed and predicted values, aggregated by +# the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance +# bonus/penalty (``BonusMalus``), + +fig, ax = plt.subplots(2, 2, figsize=(16, 8)) +fig.subplots_adjust(hspace=0.3, wspace=0.2) + +plot_obs_pred( + df_train, + "DrivAge", + "Frequency", + glm_freq.predict(X_train), + weight="Exposure", + y_label="Claim Frequency", + title="train data", + ax=ax[0, 0], +) + +plot_obs_pred( + df_test, + "DrivAge", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[0, 1], +) + +plot_obs_pred( + df_test, + "VehAge", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[1, 0], +) + +plot_obs_pred( + df_test, + "BonusMalus", + "Frequency", + glm_freq.predict(X_test), + weight="Exposure", + y_label="Claim Frequency", + title="test data", + ax=ax[1, 1], +) + + +############################################################################## +# +# 3. Severity model - Gamma Distribution +# --------------------------------------- +# The mean claim amount or severity (`AvgClaimAmount`) can be empirically +# shown to follow a Gamma distribution. We fit a GLM model for the severity +# with the same features as the frequency model. +# +# Note: +# - We filter out ``ClaimAmount == 0``` as the Gamma distribution as support +# on :math:`(0, \infty)` not :math:`[0, \infty)`. +# - We use ``ClaimNb`` as sample weights. 
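The second note can be made concrete with a small sketch (using the columns built above; not part of the example script): weighting the per-policy averages ``AvgClaimAmount`` by ``ClaimNb`` recovers the mean severity per individual claim.

    m = df["ClaimNb"] > 0
    # weighted mean of the per-policy average claim amounts ...
    w_mean = np.average(df.loc[m, "AvgClaimAmount"], weights=df.loc[m, "ClaimNb"])
    # ... equals the total claim amount divided by the total number of claims
    per_claim_mean = df.loc[m, "ClaimAmount"].sum() / df.loc[m, "ClaimNb"].sum()
    print(w_mean, per_claim_mean)  # identical up to floating point rounding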
+ +mask_train = df_train["ClaimAmount"] > 0 +mask_test = df_test["ClaimAmount"] > 0 + +glm_sev = GeneralizedLinearRegressor(family="gamma", alpha=1) + +glm_sev.fit( + X_train[mask_train.values], + df_train.loc[mask_train, "AvgClaimAmount"], + sample_weight=df_train.loc[mask_train, "ClaimNb"], +) + + +scores = score_estimator( + glm_sev, + X_train[mask_train.values], + X_test[mask_test.values], + df_train[mask_train], + df_test[mask_test], + target="AvgClaimAmount", + weights="ClaimNb", +) +print(scores) + +############################################################################## +# +# Note that the resulting model is conditional on having at least one claim, +# and cannot be used to predict the average claim amount in general, + +print( + "Mean AvgClaim Amount: %.2f " + % df_train.AvgClaimAmount.mean() +) +print( + "Mean AvgClaim Amount | NbClaim > 0: %.2f" + % df_train.AvgClaimAmount[df_train.AvgClaimAmount > 0].mean() +) +print( + "Predicted Mean AvgClaim Amount: %.2f" + % glm_sev.predict(X_train).mean() +) + + +############################################################################## +# +# We can visually compare observed and predicted values, aggregated for +# the drivers age (``Driv Age``), + +fig, ax = plt.subplots(1, 2, figsize=(16, 4)) + +# plot DivAge +plot_obs_pred( + df_train.loc[mask_train], + "DrivAge", + "AvgClaimAmount", + glm_sev.predict(X_train[mask_train.values]), + weight="Exposure", + y_label="Average Claim Severity", + title="train data", + ax=ax[0], +) + +plot_obs_pred( + df_test.loc[mask_test], + "DrivAge", + "AvgClaimAmount", + glm_sev.predict(X_test[mask_test.values]), + weight="Exposure", + y_label="Average Claim Severity", + title="test data", + ax=ax[1], +) + + +############################################################################## +# +# 3. Total Claims Amount -- Compound Poisson distribution +# ------------------------------------------------------- +# +# As mentionned in the introduction, the total claim amount can be modeled +# either as the product of the frequency model by the severity model. + + +class ClaimProdEstimator: + """Total claim amount estimator + + Computed as the product of the frequency model by the serverity model, + denormalized by exposure. + """ + + def __init__(self, est_freq, est_sev): + self.est_freq = est_freq + self.est_sev = est_sev + + def predict(self, X, exposure): + """Predict the total claim amount + + The predict method is not compatible with the scikit-learn API. + """ + return exposure * self.est_freq.predict(X) * self.est_sev.predict(X) + + +est_prod = ClaimProdEstimator(glm_freq, glm_sev) + +scores = score_estimator( + est_prod, + X_train, + X_test, + df_train, + df_test, + target="ClaimAmount", + weights="Exposure", +) +print(scores) + + +############################################################################## +# +# or as a unique Compound Poisson model, also corresponding to a Tweedie model +# with a power :math:`p \in (1, 2)`. 
We determine the optimal hyperparameter +# ``p`` with a grid search, + +from sklearn.model_selection import GridSearchCV + +# this takes a while +params = { + "family": [ + TweedieDistribution(power=power) for power in np.linspace(1, 2, 8) + ] +} + +glm_total = GridSearchCV( + GeneralizedLinearRegressor(), cv=3, param_grid=params, n_jobs=-1 +) +glm_total.fit( + X_train, df_train["ClaimAmount"], sample_weight=df_train["Exposure"] +) + + +print( + "Best hyperparameters: power=%.2f\n" + % glm_total.best_estimator_.family.power +) + +scores = score_estimator( + glm_total.best_estimator_, + X_train, + X_test, + df_train, + df_test, + target="ClaimAmount", + weights="Exposure", +) +print(scores) + +############################################################################## +# +# In this example, the mean absolute error is lower for the Compound Poisson +# model than when using separate models for frequency and severity. +# +# We can additionally validate these models by comparing observed and predicted +# total claim amount over the test and train subsets. + +res = [] +for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), +]: + res.append( + { + "subset": subset_label, + "observed": df.ClaimAmount.values.sum(), + "predicted, frequency*severity model": np.sum( + est_prod.predict(X, exposure=df.Exposure.values) + ), + "predicted, tweedie, p=%.2f" + % glm_total.best_estimator_.family.power: np.sum( + glm_total.best_estimator_.predict(X) + ), + } + ) + +print(pd.DataFrame(res).set_index("subset").T) From 56069e5b3f5b453d9e8a487c9d27e20900ce4d63 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 28 Jun 2019 10:11:37 -0500 Subject: [PATCH 065/269] EXA Fix issues with older pandas versions in example --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index cb2ff667e8379..063d12e6e291b 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -137,8 +137,8 @@ def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 # correct for unreasonable observations (that might be data error) -df["ClaimNb"].clip(upper=4, inplace=True) -df["Exposure"].clip(upper=1, inplace=True) +df["ClaimNb"] = df["ClaimNb"].clip(upper=4) +df["Exposure"] = df["Exposure"].clip(upper=1) column_trans = ColumnTransformer( [ From 53f3c5f6670bbdefcd5ddf6d63e405401909cf06 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 09:14:30 -0500 Subject: [PATCH 066/269] DOC Add second poisson regression example --- ...plot_poisson_regression_non_normal_loss.py | 257 ++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 examples/linear_model/plot_poisson_regression_non_normal_loss.py diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py new file mode 100644 index 0000000000000..b06adcb787560 --- /dev/null +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -0,0 +1,257 @@ +""" +====================================== +Poisson regression and non normal loss +====================================== + +This example illustrate the use linear Poisson regression +on the French 
Motor Third-Party Liability Claims dataset [1] and compare +it with learning models with least squared error. + + +We start by defining a few helper functions for loading the data and +visualizing results. + + +.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor + Third-Party Liability Claims (November 8, 2018). + `doi:10.2139/ssrn.3164764 `_ + +""" +print(__doc__) + +# Authors: Christian Lorentzen +# Roman Yurchak +# License: BSD 3 clause +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +from scipy.special import xlogy + +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import GeneralizedLinearRegressor, LinearRegression +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer +from sklearn.ensemble import GradientBoostingRegressor + +from sklearn.metrics import mean_squared_error, mean_absolute_error + + +def load_mtpl2(n_samples=100000): + """Fetcher for French Motor Third-Party Liability Claims dataset + + Parameters + ---------- + n_samples: int, default=100000 + number of samples to select (for faster run time). + """ + + # Note: this should use the OpenML DataFrame fetcher in the future + df_freq = pd.read_csv( + "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", + dtype={"IDpol": np.int}, + index_col=0, + ) + + df_sev = pd.read_csv( + "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", + index_col=0, + ) + + # sum ClaimAmount over identical IDs + df_sev = df_sev.groupby(level=0).sum() + + df = df_freq.join(df_sev, how="left") + df["ClaimAmount"].fillna(0, inplace=True) + + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + return df.iloc[:n_samples] + + +############################################################################## +# +# 1. Loading datasets and pre-processing +# -------------------------------------- +# +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``) with the freMTPL2sev table +# containing the claim amount (``ClaimAmount``) for the same user ids. + +df = load_mtpl2(n_samples=100000) + +# Note: filter out claims with zero amount, as the severity model +# requires a strictly positive target values. +df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 + +# correct for unreasonable observations (that might be data error) +df["ClaimNb"] = df["ClaimNb"].clip(upper=4) +df["Exposure"] = df["Exposure"].clip(upper=1) + +column_trans = ColumnTransformer( + [ + ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ( + "Veh_Brand_Gas_Region", + OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("BonusMalus", "passthrough", ["BonusMalus"]), + ( + "Density_log", + make_pipeline( + FunctionTransformer(np.log, validate=False), StandardScaler() + ), + ["Density"], + ), + ], + remainder="drop", +) +X = column_trans.fit_transform(df) + +############################################################################## +# +# The number of claims (``ClaimNb``) is a positive integer that can be modeled +# as a Poisson distribution. It is then assumed to be the number of discrete +# events occurring with a constant rate in a given time interval +# (``Exposure``). 
Here we model the frequency ``y = ClaimNb / Exposure``, +# which is still a (scaled) Poisson distribution. +# +# A very important property of the Poisson distribution is its mean-variance +# relation: The variance is proportional to the mean. + +df["Frequency"] = df.ClaimNb / df.Exposure + +print( + pd.cut(df.Frequency, [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() +) + +############################################################################## +# +# It worth noting that 96 % of users have 0 claims, and if we were to convert +# this problem into a binary classification task, it would be significantly +# imbalanced. +# +# To evaluate the pertinence of the used metrics, we will consider as a +# baseline an estimator that returns 0 for any input. + +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) + + +def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): + y_true = np.atleast_1d(y_true) + y_pred = np.atleast_1d(y_pred) + dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred) + return np.average(dev, weights=sample_weights) + + +eps = 1e-5 +print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, np.zeros(len(df_test)), + df_test.Exposure.values)) +print("MAE: %.3f" % mean_absolute_error( + df_test.Frequency.values, np.zeros(len(df_test)), + df_test.Exposure.values)) +print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( + df_test.Frequency.values, eps + np.zeros(len(df_test)), + df_test.Exposure.values)) + + +############################################################################## +# +# We start by modeling the target variable with the least squares linear +# regression model, + + +linregr = LinearRegression() +linregr.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) + +print("LinearRegression") +print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, linregr.predict(X_test), + df_test.Exposure.values)) +print("MSE: %.3f" % mean_absolute_error( + df_test.Frequency.values, linregr.predict(X_test), + df_test.Exposure.values)) +print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( + df_test.Frequency.values, np.fmax(linregr.predict(X_test), eps), + df_test.Exposure.values)) + +############################################################################## +# +# The Poisson deviance cannot be computed because negative values are +# predicted by the model, + +print('Number Negatives: %s / total: %s' % ( + (linregr.predict(X_test) < 0).sum(), X_test.shape[0])) + +############################################################################## +# +# Next we fit the Poisson regressor on the target variable, + +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) + +print("PoissonRegressor") +print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, glm_freq.predict(X_test), + df_test.Exposure.values)) +print("MAE: %.3f" % mean_absolute_error( + df_test.Frequency.values, glm_freq.predict(X_test), + df_test.Exposure.values)) +print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( + df_test.Frequency.values, glm_freq.predict(X_test), + df_test.Exposure.values)) + +############################################################################## +# +# Finally we will consider a non linear model with Gradient boosting that +# still minimizes the least square error. 
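Before turning to the gradient boosting model below, note that the single train/test comparison above could also be cross-validated. A rough sketch with a Poisson-deviance scorer (it reuses the ``mean_poisson_deviance_score`` helper defined above and, for brevity, ignores the exposure weights):

    from sklearn.metrics import make_scorer
    from sklearn.model_selection import cross_val_score

    # lower deviance is better, hence greater_is_better=False
    poisson_dev_scorer = make_scorer(mean_poisson_deviance_score,
                                     greater_is_better=False)
    cv_dev = -cross_val_score(glm_freq, X, df.Frequency,
                              scoring=poisson_dev_scorer, cv=3)
    print(cv_dev.mean(), cv_dev.std())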
+ + +gbr = GradientBoostingRegressor(max_depth=3) +gbr.fit(X_train, df_train.Frequency.values, + sample_weight=df_train.Exposure.values) + + +print("GradientBoostingRegressor") +print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) +print("MAE: %.3f" % mean_absolute_error( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) +print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( + df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) + +############################################################################## +# +# In this example, although Gradient boosting minimizes the least square error, +# because of a higher predictive power it also results in a smaller Poisson +# deviance than the Poisson regression model. +# +# Evaluating models with a single train / test split is prone to numerical +# errors, we can verify that we would also get equivalent resuts with the +# cross-validation score. +# +# The difference between these models can also be visualized by comparing the +# histogram of observed target values with that of predicted values, + + +fig, ax = plt.subplots(1, 4, figsize=(16, 3)) + +df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=ax[0]) + +ax[0].set_title('Experimental data') + +for idx, model in enumerate([linregr, glm_freq, gbr]): + y_pred = model.predict(X_train) + + pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=ax[idx+1]) + ax[idx+1].set_title(model.__class__.__name__) + +for axi in ax: + axi.set( + yscale='log', + xlabel="y (Frequency)" + ) From be5a3c485684ae45835258a71a1870a59549fbda Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 09:55:57 -0500 Subject: [PATCH 067/269] Add GeneralizedHyperbolicSecant and BinomialDistributions --- sklearn/linear_model/_glm.py | 109 ++++--------------------- sklearn/linear_model/tests/test_glm.py | 34 +------- 2 files changed, 18 insertions(+), 125 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index b18731e73f328..736e50960dcda 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -6,36 +6,6 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -# TODO: Add cross validation support, e.g. GCV? -# TODO: Should GeneralizedLinearRegressor inherit from LinearModel? -# So far, it does not. -# TODO: Include further classes in class.rst? ExponentialDispersionModel? -# TweedieDistribution? -# TODO: Negative values in P1 are not allowed so far. They could be used -# for group lasso. - -# Design Decisions: -# - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. -# Estimators in sklearn are either regressors or classifiers. A GLM can do -# both depending on the distr (Normal => regressor, Binomial => classifier). -# Solution: GeneralizedLinearRegressor since this is the focus. -# - Allow for finer control of penalty terms: -# L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude -# factors from the L1 penalty. -# L2: w*P2*w with P2 a positive (semi-) definite matrix, e.g. P2 could be -# a 1st or 2nd order difference matrix (compare B-spline penalties and -# Tikhonov regularization). -# - The link function (instance of class Link) is necessary for the evaluation -# of deviance, score, Fisher and Hessian matrix as functions of the -# coefficients, which is needed by optimizers. 
-# Solution: link as argument in those functions -# - Which name/symbol for sample_weight in docu? -# sklearn.linear_models uses w for coefficients, standard literature on -# GLMs use beta for coefficients and w for (sample) weights. -# So far, coefficients=w and sample weights=s. -# - The intercept term is the first index, i.e. coef[0] - - from __future__ import division from abc import ABCMeta, abstractmethod import numbers @@ -51,6 +21,7 @@ from ..utils.validation import check_is_fitted, check_random_state + def _check_weights(sample_weight, n_samples): """Check that sample weights are non-negative and have the right shape.""" if sample_weight is None: @@ -854,47 +825,12 @@ def __init__(self): super(InverseGaussianDistribution, self).__init__(power=3) -class GeneralizedHyperbolicSecant(ExponentialDispersionModel): - """A class for the Generalized Hyperbolic Secant (GHS) distribution. - - The GHS distribution is for targets y in (-inf, inf). - """ - def __init__(self): - self._lower_bound = -np.Inf - self._upper_bound = np.Inf - self._include_lower_bound = False - self._include_upper_bound = False - - def unit_variance(self, mu): - return 1 + mu**2 - - def unit_variance_derivative(self, mu): - return 2 * mu - - def unit_deviance(self, y, mu): - return (2 * y * (np.arctan(y) - np.arctan(mu)) + - np.log((1 + mu**2)/(1 + y**2))) - - -class BinomialDistribution(ExponentialDispersionModel): - """A class for the Binomial distribution. - - The Binomial distribution is for targets y in [0, 1]. - """ - def __init__(self): - self._lower_bound = 0 - self._upper_bound = 1 - self._include_lower_bound = True - self._include_upper_bound = True - - def unit_variance(self, mu): - return mu * (1 - mu) - - def unit_variance_derivative(self, mu): - return 1 - 2 * mu - - def unit_deviance(self, y, mu): - return 2 * (special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) +EDM_DISTRIBUTIONS = { + 'normal': NormalDistribution, + 'poisson': PoissonDistribution, + 'gamma': GammaDistribution, + 'inverse.gaussian': InverseGaussianDistribution, +} def _irls_step(X, W, P2, z, fit_intercept=True): @@ -1690,28 +1626,19 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # 1. 
input validation # ####################################################################### - # 1.1 validate arguments of __init__ ################################## + # 1.1 validate arguments of __init__ # Guarantee that self._family_instance is an instance of class # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): self._family_instance = self.family + elif self.family in EDM_DISTRIBUTIONS: + self._family_instance = EDM_DISTRIBUTIONS[self.family]() else: - if self.family == 'normal': - self._family_instance = NormalDistribution() - elif self.family == 'poisson': - self._family_instance = PoissonDistribution() - elif self.family == 'gamma': - self._family_instance = GammaDistribution() - elif self.family == 'inverse.gaussian': - self._family_instance = InverseGaussianDistribution() - elif self.family == 'binomial': - self._family_instance = BinomialDistribution() - else: - raise ValueError( - "The family must be an instance of class" - " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " - "'binomial']; got (family={0})".format(self.family)) + raise ValueError( + "The family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " + "'binomial']; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of # class Link @@ -1724,11 +1651,6 @@ def fit(self, X, y, sample_weight=None): self._link_instance = IdentityLink() if self._family_instance.power >= 1: self._link_instance = LogLink() - elif isinstance(self._family_instance, - GeneralizedHyperbolicSecant): - self._link_instance = IdentityLink() - elif isinstance(self._family_instance, BinomialDistribution): - self._link_instance = LogitLink() else: raise ValueError("No default link known for the " "specified distribution family. Please " @@ -2048,7 +1970,6 @@ def fit(self, X, y, sample_weight=None): # 4. fit # ####################################################################### # algorithms for optimization - # TODO: Parallelize it? 
# 4.1 IRLS ############################################################ # Note: we already set P2 = l2*P2, see above diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 1416bdcfad680..f51f630ebae7e 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -18,7 +18,6 @@ TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, - GeneralizedHyperbolicSecant, BinomialDistribution, ) from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge from sklearn.metrics import mean_absolute_error @@ -103,7 +102,7 @@ def test_tweedie_distribution_power(): (TweedieDistribution(power=1.5), [0.1, 1.5]), (TweedieDistribution(power=2.5), [0.1, 1.5]), (TweedieDistribution(power=-4), [0.1, 1.5]), - (GeneralizedHyperbolicSecant(), [0.1, 1.5])]) +]) def test_deviance_zero(family, chk_values): """Test deviance(y,y) = 0 for different families.""" for x in chk_values: @@ -196,7 +195,7 @@ def test_sample_weights_validation(): ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), ('inverse.gaussian', InverseGaussianDistribution()), - ('binomial', BinomialDistribution())]) +]) def test_glm_family_argument(f, fam): """Test GLM family argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions @@ -424,7 +423,7 @@ def test_glm_identity_regression(solver): [NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), - GeneralizedHyperbolicSecant()]) +]) @pytest.mark.parametrize('solver, tol', [('irls', 1e-6), ('lbfgs', 1e-6), ('newton-cg', 1e-7), @@ -620,33 +619,6 @@ def obj(coef): assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) -@pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) -def test_binomial_enet(alpha): - """Test elastic net regression with binomial family and LogitLink. - - Compare to LogisticRegression. 
- """ - l1_ratio = 0.5 - n_samples = 500 - rng = np.random.RandomState(42) - X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, - n_informative=5, n_redundant=0, n_repeated=0, - random_state=rng) - log = LogisticRegression( - penalty='elasticnet', random_state=rng, fit_intercept=False, tol=1e-6, - max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), - solver='saga') - log.fit(X, y) - - glm = GeneralizedLinearRegressor( - family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, - alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', - tol=1e-7) - glm.fit(X, y) - assert_allclose(log.intercept_[0], glm.intercept_, rtol=1e-6) - assert_allclose(log.coef_[0, :], glm.coef_, rtol=5e-6) - - @pytest.mark.parametrize( "params", [ From e67fecb9bc1ee056ad7934803818fd46a0a1f8b3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 10:00:24 -0500 Subject: [PATCH 068/269] Remove start params option --- sklearn/linear_model/_glm.py | 157 +++-------------------------------- 1 file changed, 13 insertions(+), 144 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 736e50960dcda..18eba80080670 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -1457,33 +1457,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` - as initialization for ``coef_`` and ``intercept_`` (supersedes option - ``start_params``). If set to ``True`` or if the attribute ``coef_`` - does not exit (first call to ``fit``), option ``start_params`` sets the - start values for ``coef_`` and ``intercept_``. - - start_params : {'guess', 'zero', array of shape (n_features*, )}, \ - optional (default='guess') - Relevant only if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not yet exist). - - 'guess' - Start values of mu are calculated by family.starting_mu(..). Then, - one Newton step obtains start values for ``coef_``. If - ``solver='irls'``, it uses one irls step, else the Newton step is - calculated by the cd solver. - This gives usually good starting values. - - 'zero' - All coefficients are set to zero. If ``fit_intercept=True``, the - start value for the intercept is obtained by the weighted average of y. - - array - The array of size n_features* is directly used as start values - for ``coef_``. If ``fit_intercept=True``, the first element - is assumed to be the start value for the ``intercept_``. - Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes - the intercept in counting. + as initialization for ``coef_`` and ``intercept_``. selection : str, optional (default='cyclic') For the solver 'cd' (coordinate descent), the coordinates (features) @@ -1503,7 +1477,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 'random'. diag_fisher : boolean, optional, (default=False) - Only relevant for solver 'cd' (see also ``start_params='guess'``). + Only relevant for solver 'cd'. If ``False``, the full Fisher matrix (expected Hessian) is computed in each outer iteration (Newton iteration). 
If ``True``, only a diagonal matrix (stored as 1d array) is computed, such that @@ -1576,7 +1550,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, start_params='guess', + tol=1e-4, warm_start=False, selection='cyclic', random_state=None, diag_fisher=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha @@ -1591,7 +1565,6 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', self.max_iter = max_iter self.tol = tol self.warm_start = warm_start - self.start_params = start_params self.selection = selection self.random_state = random_state self.diag_fisher = diag_fisher @@ -1742,7 +1715,7 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = X.shape # 1.3 arguments to take special care ################################## - # P1, P2, start_params + # P1, P2 if isinstance(self.P1, str) and self.P1 == 'identity': P1 = np.ones(n_features) else: @@ -1793,25 +1766,6 @@ def fit(self, X, y, sample_weight=None): "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) - start_params = self.start_params - if isinstance(start_params, str): - if start_params not in ['guess', 'zero']: - raise ValueError("The argument start_params must be 'guess', " - "'zero' or an array of correct length; " - "got(start_params={0})".format(start_params)) - else: - start_params = check_array(start_params, accept_sparse=False, - force_all_finite=True, ensure_2d=False, - dtype=_dtype, copy=True) - if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or - (start_params.ndim != 1)): - raise ValueError("Start values for parameters must have the" - "right length and dimension; required (length" - "={0}, ndim=1); got (length={1}, ndim={2})." 
- .format(X.shape[1] + self.fit_intercept, - start_params.shape[0], - start_params.ndim)) - l1 = self.alpha * self.l1_ratio l2 = self.alpha * (1 - self.l1_ratio) # P1 and P2 are now for sure copies @@ -1899,72 +1853,12 @@ def fit(self, X, y, sample_weight=None): self.coef_)) else: coef = self.coef_ - elif isinstance(start_params, str): - if start_params == 'guess': - # Set mu=starting_mu of the family and do one Newton step - # If solver=cd use cd, else irls - mu = family.starting_mu(y, weights=weights) - eta = link.link(mu) # linear predictor - if solver in ['cd', 'lbfgs', 'newton-cg']: - # see function _cd_solver - sigma_inv = 1/family.variance(mu, phi=1, weights=weights) - d1 = link.inverse_derivative(eta) - temp = sigma_inv * d1 * (y - mu) - if self.fit_intercept: - score = np.concatenate(([temp.sum()], temp @ X)) - else: - score = temp @ X # same as X.T @ temp - - d2_sigma_inv = d1 * d1 * sigma_inv - diag_fisher = self.diag_fisher - if diag_fisher: - fisher = d2_sigma_inv - else: - fisher = \ - _safe_sandwich_dot(X, d2_sigma_inv, - intercept=self.fit_intercept) - # set up space for search direction d for inner loop - if self.fit_intercept: - coef = np.zeros(n_features+1) - else: - coef = np.zeros(n_features) - d = np.zeros_like(coef) - # initial stopping tolerance of inner loop - # use L1-norm of minimum of norm of subgradient of F - # use less restrictive tolerance for initial guess - inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, - P1=P1) - inner_tol = 4 * linalg.norm(inner_tol, ord=1) - # just one outer loop = Newton step - n_cycles = 0 - d, coef_P2, n_cycles, inner_tol = \ - _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, - inner_tol, max_inner_iter=1000, - selection=self.selection, - random_state=random_state, - diag_fisher=self.diag_fisher) - coef += d # for simplicity no line search here - else: - # See _irls_solver - # h'(eta) - hp = link.inverse_derivative(eta) - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = (hp**2 / family.variance(mu, phi=1, weights=weights)) - # working observations - z = eta + (y-mu)/hp - # solve A*coef = b - # A = X' W X + l2 P2, b = X' W z - coef = _irls_step(X, W, P2, z, - fit_intercept=self.fit_intercept) - else: # start_params == 'zero' - if self.fit_intercept: - coef = np.zeros(n_features+1) - coef[0] = link.link(np.average(y, weights=weights)) - else: - coef = np.zeros(n_features) - else: # assign given array as start values - coef = start_params + else: + if self.fit_intercept: + coef = np.zeros(n_features+1) + coef[0] = link.link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) ####################################################################### # 4. fit # @@ -2312,32 +2206,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` - as initialization for ``coef_`` and ``intercept_`` (supersedes option - ``start_params``). If set to ``True`` or if the attribute ``coef_`` - does not exit (first call to ``fit``), option ``start_params`` sets the - start values for ``coef_`` and ``intercept_``. - - start_params : {'guess', 'zero', array of shape (n_features*, )}, \ - optional (default='guess') - Relevant only if ``warm_start=False`` or if fit is called - the first time (``self.coef_`` does not yet exist). - - 'guess' - Start values of mu are calculated by family.starting_mu(..). Then, - one Newton step obtains start values for ``coef_``. 
If - ``solver='irls'``, it uses one irls step. This gives usually good - starting values. - - 'zero' - All coefficients are set to zero. If ``fit_intercept=True``, the - start value for the intercept is obtained by the weighted average of y. - - array - The array of size n_features* is directly used as start values - for ``coef_``. If ``fit_intercept=True``, the first element - is assumed to be the start value for the ``intercept_``. - Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes - the intercept in counting. + as initialization for ``coef_`` and ``intercept_`` . random_state : {int, RandomState instance, None}, optional (default=None) If int, random_state is the seed used by the random @@ -2401,12 +2270,12 @@ class PoissonRegressor(GeneralizedLinearRegressor): """ def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, solver='irls', max_iter=100, - tol=1e-4, warm_start=False, start_params='guess', + tol=1e-4, warm_start=False, random_state=None, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link='log', fit_dispersion=fit_dispersion, solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, - start_params=start_params, random_state=random_state, + random_state=random_state, copy_X=copy_X, verbose=verbose) From 62f4448101c16a797d74119fe1df5b45b93136b0 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 10:31:42 -0500 Subject: [PATCH 069/269] Remove L1 penalty and CD solver --- sklearn/linear_model/_glm.py | 536 +------------------------ sklearn/linear_model/tests/test_glm.py | 185 +-------- 2 files changed, 36 insertions(+), 685 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 18eba80080670..15211c6038007 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -93,49 +93,6 @@ def _safe_sandwich_dot(X, d, intercept=False): return res -def _min_norm_sugrad(coef, grad, P2, P1): - """Compute the gradient of all subgradients with minimal L2-norm. - - subgrad = grad + P2 * coef + P1 * subgrad(|coef|_1) - - g_i = grad_i + (P2*coef)_i - - if coef_i > 0: g_i + P1_i - if coef_i < 0: g_i - P1_i - if coef_i = 0: sign(g_i) * max(|g_i|-P1_i, 0) - - Parameters - ---------- - coef : ndarray - coef[0] may be intercept. - - grad : ndarray, shape=coef.shape - - P2 : {1d or 2d array, None} - always without intercept, ``None`` means P2 = 0 - - P1 : ndarray - always without intercept - """ - intercept = (coef.size == P1.size + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - # compute grad + coef @ P2 without intercept - grad_wP2 = grad[idx:].copy() - if P2 is None: - pass - elif P2.ndim == 1: - grad_wP2 += coef[idx:] * P2 - else: - grad_wP2 += coef[idx:] @ P2 - res = np.where(coef[idx:] == 0, - np.sign(grad_wP2) * np.maximum(np.abs(grad_wP2) - P1, 0), - grad_wP2 + np.sign(coef[idx:]) * P1) - if intercept: - return np.concatenate(([grad[0]], res)) - else: - return res - - class Link(metaclass=ABCMeta): """Abstract base class for Link functions.""" @@ -915,7 +872,7 @@ def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) # D2 = link.inverse_derivative(eta)^2 = D^2 # W = D2/V(mu) - # l2 = alpha * (1 - l1_ratio) + # l2 = alpha # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w # = -X' D (y-mu)/V(mu) + l2 P2 w # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) 
X + l2 P2 @@ -981,345 +938,6 @@ def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, return coef, n_iter -def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, - max_inner_iter=1000, selection='cyclic', - random_state=None, diag_fisher=False): - """Compute inner loop of coordinate descent, i.e. cycles through features. - - Minimization of 1-d subproblems:: - - min_z q(d+z*e_j) - q(d) - = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 - - A = f'(w) + d*H(w) + (w+d)*P2 - B = H+P2 - Note: f'=-score and H=fisher are updated at the end of outer iteration. - """ - # TODO: use sparsity (coefficient already 0 due to L1 penalty) - # => active set of features for featurelist, see paper - # of Improved GLMNET or Gap Safe Screening Rules - # https://arxiv.org/abs/1611.05780 - n_samples, n_features = X.shape - intercept = (coef.size == X.shape[1] + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - B = fisher - if P2.ndim == 1: - coef_P2 = coef[idx:] * P2 - if not diag_fisher: - idiag = np.arange(start=idx, stop=B.shape[0]) - # B[np.diag_indices_from(B)] += P2 - B[(idiag, idiag)] += P2 - else: - coef_P2 = coef[idx:] @ P2 - if not diag_fisher: - if sparse.issparse(P2): - B[idx:, idx:] += P2.toarray() - else: - B[idx:, idx:] += P2 - A = -score - A[idx:] += coef_P2 - # A += d @ (H+P2) but so far d=0 - # inner loop - for inner_iter in range(1, max_inner_iter+1): - inner_iter += 1 - n_cycles += 1 - # cycle through features, update intercept separately at the end - if selection == 'random': - featurelist = random_state.permutation(n_features) - else: - featurelist = np.arange(n_features) - for j in featurelist: - # minimize_z: a z + 1/2 b z^2 + c |d+z| - # a = A_j - # b = B_jj > 0 - # c = |P1_j| = P1_j > 0, see 1.3 - # d = w_j + d_j - # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) - # with beta = z+d, beta_hat = d-a/b and gamma = c/b - # z = 1/b * S(bd-a,c) - d - # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding - jdx = j+idx # index for arrays containing entries for intercept - a = A[jdx] - if diag_fisher: - # Note: fisher is ndarray of shape (n_samples,) => no idx - # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway - Bj = np.zeros_like(A) - if intercept: - Bj[0] = fisher.sum() - if sparse.issparse(X): - Bj[idx:] = _safe_toarray(X[:, j].transpose() @ - X.multiply(fisher[:, np.newaxis]) - ).ravel() - else: - Bj[idx:] = (fisher * X[:, j]) @ X - - if P2.ndim == 1: - Bj[idx:] += P2[j] - else: - if sparse.issparse(P2): - # slice columns as P2 is csc - Bj[idx:] += P2[:, j].toarray().ravel() - else: - Bj[idx:] += P2[:, j] - b = Bj[jdx] - else: - b = B[jdx, jdx] - - # those ten lines are what it is all about - if b <= 0: - z = 0 - elif P1[j] == 0: - z = -a/b - elif a + P1[j] < b * (coef[jdx] + d[jdx]): - z = -(a + P1[j])/b - elif a - P1[j] > b * (coef[jdx] + d[jdx]): - z = -(a - P1[j])/b - else: - z = -(coef[jdx] + d[jdx]) - - # update direction d - d[jdx] += z - # update A because d_j is now d_j+z - # A = f'(w) + d*H(w) + (w+d)*P2 - # => A += (H+P2)*e_j z = B_j * z - # Note: B is symmetric B = B.transpose - if diag_fisher: - # Bj = B[:, j] calculated above, still valid - A += Bj * z - else: - # B is symmetric, C- or F-contiguous, but never sparse - if B.flags['F_CONTIGUOUS']: - # slice columns like for sparse csc - A += B[:, jdx] * z - else: # B.flags['C_CONTIGUOUS'] might be true - # slice rows - A += B[jdx, :] * z - # end of cycle over features - # update intercept - if intercept: - if diag_fisher: - Bj = np.zeros_like(A) - Bj[0] = fisher.sum() - Bj[1:] = fisher @ X - b = Bj[0] - else: - b = B[0, 0] - z = 0 if b <= 0 else -A[0]/b - d[0] += z - if diag_fisher: - A += Bj * z - else: - if B.flags['F_CONTIGUOUS']: - A += B[:, 0] * z - else: - A += B[0, :] * z - # end of complete cycle - # stopping criterion for inner loop - # sum_i(|minimum of norm of subgrad of q(d)_i|) - # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 - mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= inner_tol: - if inner_iter == 1: - inner_tol = inner_tol/4. - break - # end of inner loop - return d, coef_P2, n_cycles, inner_tol - - -def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, - max_iter=100, max_inner_iter=1000, tol=1e-4, - selection='cyclic ', random_state=None, - diag_fisher=False, copy_X=True): - """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. - - The objective being minimized in the coefficients w=coef is:: - - F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 - - An Improved GLMNET for L1-regularized Logistic Regression: - - 1. Find optimal descent direction d by minimizing - min_d F(w+d) = min_d F(w+d) - F(w) - 2. Quadratic approximation of F(w+d)-F(w) = q(d): - using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: - q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d - + ||P1*(w+d)||_1 - ||P1*w||_1 - Then minimize q(d): min_d q(d) - 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): - min_z q(d+z*e_j) - = min_z q(d+z*e_j) - q(d) - = min_z A_j z + 1/2 B_jj z^2 - + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 - A = f'(w) + d*H(w) + (w+d)*P2 - B = H + P2 - - Repeat steps 1-3 until convergence. - Note: Use Fisher matrix instead of Hessian for H. 
- Note: f' = -score, H = Fisher matrix - - Parameters - ---------- - coef : ndarray, shape (c,) - If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shape[1] + 1. - - X : {ndarray, csc sparse matrix}, shape (n_samples, n_features) - Training data (with intercept included if present). If not sparse, - pass directly as Fortran-contiguous data to avoid - unnecessary memory duplication. - - y : ndarray, shape (n_samples,) - Target values. - - weights: ndarray, shape (n_samples,) - Sample weights with which the deviance is weighted. The weights must - bee normalized and sum to 1. - - P1 : {ndarray}, shape (n_features,) - The L1-penalty vector (=diagonal matrix) - - P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features) - The L2-penalty matrix or vector (=diagonal matrix). If a matrix is - passed, it must be symmetric. If X is sparse, P2 must also be sparse. - - fit_intercept : boolean, optional (default=True) - Specifies if a constant (a.k.a. bias or intercept) should be - added to the linear predictor (X*coef+intercept). - - family : ExponentialDispersionModel - - link : Link - - max_iter : int, optional (default=100) - Maximum numer of outer (Newton) iterations. - - max_inner_iter : int, optional (default=1000) - Maximum number of iterations in each inner loop, i.e. max number of - cycles over all features per inner loop. - - tol : float, optional (default=1e-4) - Convergence criterion is - sum_i(|minimum of norm of subgrad of objective_i|)<=tol. - - selection : str, optional (default='cyclic') - If 'random', randomly chose features in inner loop. - - random_state : {int, RandomState instance, None}, optional (default=None) - - diag_fisher : boolean, optional (default=False) - ``False`` calculates full fisher matrix, ``True`` only diagonal matrix - s.t. fisher = X.T @ diag @ X. This saves storage but needs more - matrix-vector multiplications. - - copy_X : boolean, optional (default=True) - If ``True``, X will be copied; else, it may be overwritten. - - Returns - ------- - coef : ndarray, shape (c,) - If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shape[1] + 1. - - n_iter : number of outer iterations = newton iterations - - n_cycles : number of cycles over features - - References - ---------- - Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - An Improved GLMNET for L1-regularized Logistic Regression, - Journal of Machine Learning Research 13 (2012) 1999-2030 - https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf - """ - X = check_array(X, 'csc', dtype=[np.float64, np.float32], - order='F', copy=copy_X) - if P2.ndim == 2: - P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32], - order='F', copy=copy_X) - if sparse.issparse(X): - if not sparse.isspmatrix_csc(P2): - raise ValueError("If X is sparse, P2 must also be sparse csc" - "format. Got P2 not sparse.") - random_state = check_random_state(random_state) - # Note: we already set P2 = l2*P2, P1 = l1*P1 - # Note: we already symmetrized P2 = 1/2 (P2 + P2') - n_iter = 0 # number of outer iterations - n_cycles = 0 # number of (complete) cycles over features - converged = False - n_samples, n_features = X.shape - idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept - # line search parameters - (beta, sigma) = (0.5, 0.01) - # some precalculations - # Note: For diag_fisher=False, fisher = X.T @ fisher @ X and fisher is a - # 1d array representing a diagonal matrix. 
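
    # Illustration only (not part of the patch): with diag_fisher=True the
    # solver keeps just the 1d array d such that the full Fisher matrix equals
    # X.T @ np.diag(d) @ X, and rebuilds single columns on demand, exactly as
    # Bj[idx:] = (fisher * X[:, j]) @ X does in _cd_cycle above.
    import numpy as np
    rng = np.random.RandomState(0)
    X = rng.randn(8, 3)
    d = rng.rand(8)
    full_fisher = X.T @ np.diag(d) @ X
    j = 1
    assert np.allclose(full_fisher[:, j], (d * X[:, j]) @ X)
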
- eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=X, y=y, weights=weights, link=link, - diag_fisher=diag_fisher) - # set up space for search direction d for inner loop - d = np.zeros_like(coef) - # initial stopping tolerance of inner loop - # use L1-norm of minimum of norm of subgradient of F - inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) - inner_tol = linalg.norm(inner_tol, ord=1) - # outer loop - while n_iter < max_iter: - n_iter += 1 - # initialize search direction d (to be optimized) with zero - d.fill(0) - # inner loop = _cd_cycle - d, coef_P2, n_cycles, inner_tol = \ - _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, - max_inner_iter=max_inner_iter, selection=selection, - random_state=random_state, diag_fisher=diag_fisher) - # line search by sequence beta^k, k=0, 1, .. - # F(w + lambda d) - F(w) <= lambda * bound - # bound = sigma * (f'(w)*d + w*P2*d - # +||P1 (w+d)||_1 - ||P1 w||_1) - P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) - P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) - # Note: coef_P2 already calculated and still valid - bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) - Fw = (0.5 * family.deviance(y, mu, weights) + - 0.5 * (coef_P2 @ coef[idx:]) + P1w_1) - la = 1./beta - for k in range(20): - la *= beta # starts with la=1 - coef_wd = coef + la * d - mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) - Fwd = (0.5 * family.deviance(y, mu_wd, weights) + - linalg.norm(P1 * coef_wd[idx:], ord=1)) - if P2.ndim == 1: - Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) - else: - Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) - if Fwd - Fw <= sigma * la * bound: - break - # update coefficients - coef += la * d - # calculate eta, mu, score, Fisher matrix for next iteration - eta, mu, score, fisher = family._eta_mu_score_fisher( - coef=coef, phi=1, X=X, y=y, weights=weights, link=link, - diag_fisher=diag_fisher) - # stopping criterion for outer loop - # sum_i(|minimum-norm of subgrad of F(w)_i|) - # fp_wP2 = f'(w) + w*P2 - # Note: eta, mu and score are already updated - mn_subgrad = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) - mn_subgrad = linalg.norm(mn_subgrad, ord=1) - if mn_subgrad <= tol: - converged = True - break - # end of outer loop - if not converged: - warnings.warn("Coordinate descent failed to converge. Increase" - " the maximum number of iterations max_iter" - " (currently {0})".format(max_iter), ConvergenceWarning) - - return coef, n_iter, n_cycles - - class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. @@ -1329,28 +947,10 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) - + alpha * l1_ratio * ||P1*w||_1 - + 1/2 * alpha * (1 - l1_ratio) * w*P2*w - - with inverse link function h and s=sample_weight. Note that for - ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). - For ``P1=P2='identity'``, the penalty is the elastic net:: - - alpha * l1_ratio * ||w||_1 - + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 - - If you are interested in controlling the L1 and L2 penalties - separately, keep in mind that this is equivalent to:: + + 1/2 * alpha * w*P2*w - a * L1 + b * L2 - - where:: - - alpha = a + b and l1_ratio = a / (a + b) - - The parameter ``l1_ratio`` corresponds to alpha in the R package glmnet, - while ``alpha`` corresponds to the lambda parameter in glmnet. 
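
For illustration only (this helper is not part of the patch), the conversion from separate penalty strengths a (L1) and b (L2) to the parametrization described above is:

    def to_alpha_l1_ratio(a, b):
        # map the penalty a*L1 + b*L2 to (alpha, l1_ratio),
        # assuming a + b > 0
        return a + b, a / (a + b)

    # e.g. a=0.25, b=0.75 gives alpha=1.0 and l1_ratio=0.25;
    # the pure lasso (a=1, b=0) gives l1_ratio=1, pure ridge (a=0, b=1) gives 0.
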
- Specifically, l1_ratio = 1 is the lasso penalty. + with inverse link function h and s=sample_weight. + The parameter ``alpha`` corresponds to the lambda parameter in glmnet. Read more in the :ref:`User Guide `. @@ -1364,19 +964,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): case, the design matrix X must have full column rank (no collinearities). - l1_ratio : float, optional (default=0) - The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For - ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it - is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a - combination of L1 and L2. - - P1 : {'identity', array-like}, shape (n_features,), optional \ - (default='identity') - With this array, you can exclude coefficients from the L1 penalty. - Set the corresponding value to 1 (include) or 0 (exclude). The - default value ``'identity'`` is the same as a 1d array of ones. - Note that n_features = X.shape[1]. - P2 : {'identity', array-like, sparse matrix}, shape \ (n_features,) or (n_features, n_features), optional \ (default='identity') @@ -1416,18 +1003,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ + solver : {'auto', 'irls', 'lbfgs', 'newton-cg'}, \ optional (default='auto') Algorithm to use in the optimization problem: 'auto' - Sets 'irls' if l1_ratio equals 0, else 'cd'. - - 'cd' - Coordinate descent algorithm. It can deal with L1 as well as L2 - penalties. Note that in order to avoid unnecessary memory - duplication of X in the ``fit`` method, X should be directly passed - as a Fortran-contiguous numpy array or sparse csc matrix. + Sets 'irls' 'irls' Iterated reweighted least squares. @@ -1450,31 +1031,17 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Stopping criterion. For the irls, newton-cg and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of - the objective function. For the cd solver, convergence is reached - when ``sum_i(|minimum-norm of g_i|)``, where ``g_i`` is the - subgradient of the objective and minimum-norm of ``g_i`` is the element - of the subgradient ``g_i`` with the smallest L2-norm. + the objective function. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` as initialization for ``coef_`` and ``intercept_``. - selection : str, optional (default='cyclic') - For the solver 'cd' (coordinate descent), the coordinates (features) - can be updated in either cyclic or random order. - If set to 'random', a random coefficient is updated every iteration - rather than looping over features sequentially in the same order. This - (setting to 'random') often leads to significantly faster convergence - especially when tol is higher than 1e-4. - random_state : {int, RandomState instance, None}, optional (default=None) - The seed of the pseudo random number generator that selects a random - feature to be updated for solver 'cd' (coordinate descent). If int, random_state is the seed used by the random number generator; if RandomState instance, random_state is the random number generator; if None, the random number generator is the - RandomState instance used by `np.random`. Used when ``selection`` == - 'random'. + RandomState instance used by `np.random`. 
diag_fisher : boolean, optional, (default=False) Only relevant for solver 'cd'. @@ -1547,15 +1114,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Journal of Machine Learning Research 13 (2012) 1999-2030 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ - def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', + def __init__(self, alpha=1.0, P2='identity', fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, - selection='cyclic', random_state=None, diag_fisher=False, + random_state=None, diag_fisher=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha - self.l1_ratio = l1_ratio - self.P1 = P1 self.P2 = P2 self.fit_intercept = fit_intercept self.family = family @@ -1565,7 +1130,6 @@ def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', self.max_iter = max_iter self.tol = tol self.warm_start = warm_start - self.selection = selection self.random_state = random_state self.diag_fisher = diag_fisher self.copy_X = copy_X @@ -1645,28 +1209,16 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: raise ValueError("Penalty term must be a non-negative number;" " got (alpha={0})".format(self.alpha)) - if (not isinstance(self.l1_ratio, numbers.Number) or - self.l1_ratio < 0 or self.l1_ratio > 1): - raise ValueError("l1_ratio must be a number in interval [0, 1];" - " got (l1_ratio={0})".format(self.l1_ratio)) if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: + if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg']: raise ValueError("GeneralizedLinearRegressor supports only solvers" - " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" + " 'auto', 'irls', 'lbfgs', 'newton-cg';" " got {0}".format(self.solver)) solver = self.solver if self.solver == 'auto': - if self.l1_ratio == 0: - solver = 'irls' - else: - solver = 'cd' - if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): - raise ValueError("The chosen solver (solver={0}) can't deal " - "with L1 penalties, which are included with " - "(alpha={1}) and (l1_ratio={2})." 
- .format(solver, self.alpha, self.l1_ratio)) + solver = 'irls' if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " @@ -1678,10 +1230,6 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.warm_start, bool): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) - if self.selection not in ['cyclic', 'random']: - raise ValueError("The argument selection must be 'cyclic' or " - "'random'; got (selection={0})" - .format(self.selection)) random_state = check_random_state(self.random_state) if not isinstance(self.diag_fisher, bool): raise ValueError("The argument diag_fisher must be bool;" @@ -1698,16 +1246,10 @@ def fit(self, X, y, sample_weight=None): # 1.2 validate arguments of fit ####################################### _dtype = [np.float64, np.float32] - if solver == 'cd': - _stype = ['csc'] - else: - _stype = ['csc', 'csr'] + _stype = ['csc', 'csr'] X, y = check_X_y(X, y, accept_sparse=_stype, dtype=_dtype, y_numeric=True, multi_output=False, copy=self.copy_X) - # Without converting y to float, deviance might raise - # ValueError: Integers to negative integer powers are not allowed. - # Also, y must not be sparse. y = np.asarray(y, dtype=np.float64) weights = _check_weights(sample_weight, y.shape[0]) @@ -1715,23 +1257,8 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = X.shape # 1.3 arguments to take special care ################################## - # P1, P2 - if isinstance(self.P1, str) and self.P1 == 'identity': - P1 = np.ones(n_features) - else: - P1 = np.atleast_1d(self.P1) - try: - P1 = P1.astype(np.float64, casting='safe', copy=False) - except TypeError: - raise TypeError("The given P1 cannot be converted to a numeric" - "array; got (P1.dtype={0})." - .format(P1.dtype)) - if (P1.ndim != 1) or (P1.shape[0] != n_features): - raise ValueError("P1 must be either 'identity' or a 1d array " - "with the length of X.shape[1]; " - "got (P1.shape[0]={0}), " - "needed (X.shape[1]={1})." - .format(P1.shape[0], n_features)) + # P2 + # If X is sparse, make P2 sparse, too. if isinstance(self.P2, str) and self.P2 == 'identity': if sparse.issparse(X): @@ -1766,10 +1293,8 @@ def fit(self, X, y, sample_weight=None): "got (P2.shape=({0}, {1})), needed ({2}, {2})" .format(P2.shape[0], P2.shape[1], X.shape[1])) - l1 = self.alpha * self.l1_ratio - l2 = self.alpha * (1 - self.l1_ratio) - # P1 and P2 are now for sure copies - P1 = l1 * P1 + l2 = self.alpha + # P2 is now for sure a copy P2 = l2 * P2 # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric @@ -1792,11 +1317,6 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) - # check if P1 has only non-negative values, negative values might - # indicate group lasso in the future. - if not isinstance(self.P1, str): # if self.P1 != 'identity': - if not np.all(P1 >= 0): - raise ValueError("P1 must not have negative values.") # check if P2 is positive semidefinite # np.linalg.cholesky(P2) 'only' asserts positive definite if not isinstance(self.P2, str): # self.P2 != 'identity' @@ -1845,8 +1365,6 @@ def fit(self, X, y, sample_weight=None): # Note: Since phi=self.dispersion_ does not enter the estimation # of mu_i=E[y_i], set it to 1. 
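
    # Illustration only (not part of the patch): the symmetrization
    # P2 = 1/2 (P2 + P2') performed earlier in fit is harmless because the
    # quadratic penalty w' P2 w only sees the symmetric part of P2.
    import numpy as np
    rng = np.random.RandomState(0)
    P2 = rng.randn(4, 4)                # generally non-symmetric
    w = rng.randn(4)
    P2_sym = 0.5 * (P2 + P2.T)
    assert np.isclose(w @ P2 @ w, w @ P2_sym @ w)
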
- # set start values for coef - coef = None if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: coef = np.concatenate((np.array([self.intercept_]), @@ -1975,18 +1493,6 @@ def Hs(coef): args=args, maxiter=self.max_iter, tol=self.tol) - # 4.4 coordinate descent ############################################## - # Note: we already set P1 = l1*P1, see above - # Note: we already set P2 = l2*P2, see above - # Note: we already symmetrized P2 = 1/2 (P2 + P2') - elif solver == 'cd': - coef, self.n_iter_, self._n_cycles = \ - _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, - P2=P2, fit_intercept=self.fit_intercept, - family=family, link=link, - max_iter=self.max_iter, tol=self.tol, - selection=self.selection, random_state=random_state, - diag_fisher=self.diag_fisher, copy_X=self.copy_X) ####################################################################### # 5. postprocessing # @@ -2097,9 +1603,6 @@ def estimate_phi(self, X, y, sample_weight=None): dev = self._family_instance.deviance(y, mu, weights) return dev/(n_samples - n_features) - # Note: check_estimator(GeneralizedLinearRegressor) might raise - # "AssertionError: -0.28014056555724598 not greater than 0.5" - # unless GeneralizedLinearRegressor has a score which passes the test. def score(self, X, y, sample_weight=None): """Compute D^2, the percentage of deviance explained. @@ -2212,8 +1715,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): If int, random_state is the seed used by the random number generator; if RandomState instance, random_state is the random number generator; if None, the random number generator is the - RandomState instance used by `np.random`. Used when ``selection`` == - 'random'. + RandomState instance used by `np.random`. copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. 
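
Before the test changes below, a hedged usage sketch of the estimator as it stands at this point of the series; the data and settings mirror test_poisson_ridge further down, and the import path as exposed by this PR is assumed:

    import numpy as np
    from sklearn.linear_model import GeneralizedLinearRegressor

    X = np.array([[-2., -1., 1., 2.], [0., 0., 1., 1.]]).T
    y = np.array([0., 1., 1., 2.])
    glm = GeneralizedLinearRegressor(alpha=1.0, family='poisson', link='log',
                                     fit_intercept=True, solver='irls',
                                     tol=1e-7)
    glm.fit(X, y)
    print(glm.intercept_, glm.coef_)  # L2-penalized Poisson fit
    print(glm.score(X, y))            # D^2, the fraction of deviance explained
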
diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index f51f630ebae7e..8fc1241e1da7a 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -25,7 +25,7 @@ from sklearn.utils.testing import assert_array_equal -GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg', 'cd'] +GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg'] @pytest.fixture(scope="module") @@ -236,28 +236,6 @@ def test_glm_alpha_argument(alpha): glm.fit(X, y) -@pytest.mark.parametrize('l1_ratio', ['not a number', -4.2, 1.1, [1]]) -def test_glm_l1_ratio_argument(l1_ratio): - """Test GLM for invalid l1_ratio argument.""" - y = np.array([1, 2]) - X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) - with pytest.raises(ValueError, - match="l1_ratio must be a number in interval.*0, 1"): - glm.fit(X, y) - - -@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3], - [-1]]) -def test_glm_P1_argument(P1): - """Test GLM for invalid P1 argument.""" - y = np.array([1, 2]) - X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(P1=P1, l1_ratio=0.5, check_input=True) - with pytest.raises((ValueError, TypeError)): - glm.fit(X, y) - - @pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], sparse.csr_matrix([1, 2, 3]), [-1]]) def test_glm_P2_argument(P2): @@ -301,14 +279,13 @@ def test_glm_fit_intercept_argument(fit_intercept): glm.fit(X, y) -@pytest.mark.parametrize('solver, l1_ratio', - [('not a solver', 0), (1, 0), ([1], 0), - ('irls', 0.5), ('lbfgs', 0.5), ('newton-cg', 0.5)]) -def test_glm_solver_argument(solver, l1_ratio): +@pytest.mark.parametrize('solver', + ['not a solver', 1, [1]]) +def test_glm_solver_argument(solver): """Test GLM for invalid solver argument.""" y = np.array([1, 2]) X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(solver=solver, l1_ratio=l1_ratio) + glm = GeneralizedLinearRegressor(solver=solver) with pytest.raises(ValueError): glm.fit(X, y) @@ -343,28 +320,6 @@ def test_glm_warm_start_argument(warm_start): glm.fit(X, y) -@pytest.mark.parametrize('start_params', - ['not a start_params', ['zero'], [0, 0, 0], - [[0, 0]], ['a', 'b']]) -def test_glm_start_params_argument(start_params): - """Test GLM for invalid start_params argument.""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(start_params=start_params) - with pytest.raises(ValueError): - glm.fit(X, y) - - -@pytest.mark.parametrize('selection', ['not a selection', 1, 0, ['cyclic']]) -def test_glm_selection_argument(selection): - """Test GLM for invalid selection argument""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(selection=selection) - with pytest.raises(ValueError, match="argument selection must be"): - glm.fit(X, y) - - @pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) def test_glm_random_state_argument(random_state): """Test GLM for invalid random_state argument.""" @@ -413,7 +368,7 @@ def test_glm_identity_regression(solver): y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', fit_intercept=False, solver=solver, - start_params='zero', tol=1e-7) + tol=1e-7) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=1e-6) @@ -427,7 +382,7 @@ def test_glm_identity_regression(solver): @pytest.mark.parametrize('solver, tol', [('irls', 1e-6), ('lbfgs', 1e-6), ('newton-cg', 1e-7), - ('cd', 1e-7)]) +]) def test_glm_log_regression(family, solver, tol): """Test GLM 
regression with log link on a simple dataset.""" coef = [0.2, -0.1] @@ -435,7 +390,7 @@ def test_glm_log_regression(family, solver, tol): y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( alpha=0, family=family, link='log', fit_intercept=False, - solver=solver, start_params='guess', tol=tol) + solver=solver, tol=tol) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=5e-6) @@ -472,14 +427,14 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): random_state=42, **ridge_params) ridge.fit(X, y) - glm = GeneralizedLinearRegressor(alpha=1.0, l1_ratio=0, family='normal', + glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', link='identity', fit_intercept=True, max_iter=300, solver=solver, tol=1e-6, check_input=False, random_state=42) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6) - assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6) + assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) @@ -487,7 +442,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): [('irls', 1e-7), ('lbfgs', 1e-7), ('newton-cg', 1e-7), - ('cd', 1e-7)]) +]) def test_poisson_ridge(solver, tol): """Test ridge regression with poisson family and LogLink. @@ -506,130 +461,24 @@ def test_poisson_ridge(solver, tol): X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) rng = np.random.RandomState(42) - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0, + glm = GeneralizedLinearRegressor(alpha=1, fit_intercept=True, family='poisson', link='log', tol=1e-7, solver=solver, max_iter=300, random_state=rng) glm.fit(X, y) assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) - assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-6) - - -@pytest.mark.parametrize('diag_fisher', [False, True]) -def test_normal_enet(diag_fisher): - """Test elastic net regression with normal/gaussian family.""" - alpha, l1_ratio = 0.3, 0.7 - n_samples, n_features = 20, 2 - rng = np.random.RandomState(42) - X = rng.randn(n_samples, n_features).copy(order='F') - beta = rng.randn(n_features) - y = 2 + np.dot(X, beta) + rng.randn(n_samples) - - # 1. test normal enet on dense data - glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio, - family='normal', link='identity', - fit_intercept=True, tol=1e-8, - max_iter=100, selection='cyclic', - solver='cd', start_params='zero', - check_input=False, - diag_fisher=diag_fisher) - glm.fit(X, y) - - enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True, - normalize=False, tol=1e-8, copy_X=True) - enet.fit(X, y) - - assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) - assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) - - # 2. test normal enet on sparse data - X = sparse.csc_matrix(X) - glm.fit(X, y) - assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7) - assert_allclose(glm.coef_, enet.coef_, rtol=5e-5) - - -def test_poisson_enet(): - """Test elastic net regression with poisson family and LogLink. - - Compare to R's glmnet""" - # library("glmnet") - # options(digits=10) - # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) - # x <- data.matrix(df[,c("a", "b")]) - # y <- df$y - # fit <- glmnet(x=x, y=y, alpha=0.5, intercept=T, family="poisson", - # standardize=F, thresh=1e-10, nlambda=10000) - # coef(fit, s=1) - # (Intercept) -0.03550978409 - # a 0.16936423283 - # b . 
- glmnet_intercept = -0.03550978409 - glmnet_coef = [0.16936423283, 0.] - X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T - y = np.array([0, 1, 1, 2]) - rng = np.random.RandomState(42) - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-8, - selection='random', random_state=rng, - start_params='guess') - glm.fit(X, y) - assert_allclose(glm.intercept_, glmnet_intercept, rtol=2e-6) - assert_allclose(glm.coef_, glmnet_coef, rtol=2e-7) - - # test results with general optimization procedure - def obj(coef): - pd = PoissonDistribution() - link = LogLink() - N = y.shape[0] - mu = link.inverse(X @ coef[1:] + coef[0]) - alpha, l1_ratio = (1, 0.5) - return 1./(2.*N) * pd.deviance(y, mu) \ - + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ - + alpha * l1_ratio * np.sum(np.abs(coef[1:])) - res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, - options={'maxiter': 1000, 'disp': False}) - assert_allclose(glm.intercept_, res.x[0], rtol=5e-5) - assert_allclose(glm.coef_, res.x[1:], rtol=1e-5, atol=1e-9) - assert_allclose(obj(np.concatenate(([glm.intercept_], glm.coef_))), - res.fun, rtol=1e-8) - - # same for start_params='zero' and selection='cyclic' - # with reduced precision - glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', - link='log', solver='cd', tol=1e-5, - selection='cyclic', start_params='zero') - glm.fit(X, y) - assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) - assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) - - # check warm_start, therefore start with different alpha - glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, - family='poisson', max_iter=300, - link='log', solver='cd', tol=1e-5, - selection='cyclic', start_params='zero') - glm.fit(X, y) - # warm start with original alpha and use of sparse matrices - glm.warm_start = True - glm.alpha = 1 - X = sparse.csr_matrix(X) - glm.fit(X, y) - assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) - assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) + assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5) @pytest.mark.parametrize( "params", [ - {"solver": "irls", "start_params": "guess"}, - {"solver": "irls", "start_params": "zero"}, - {"solver": "lbfgs", "start_params": "guess"}, - {"solver": "lbfgs", "start_params": "zero"}, + {"solver": "irls" }, + {"solver": "irls" }, + {"solver": "lbfgs" }, + {"solver": "lbfgs"}, {"solver": "newton-cg"}, - {"solver": "cd", "selection": "cyclic", "diag_fisher": False}, - {"solver": "cd", "selection": "cyclic", "diag_fisher": True}, - {"solver": "cd", "selection": "random", "diag_fisher": False}, ], ids=lambda params: ', '.join("%s=%s" % (key, val) for key, val in params.items()) From d25042e23c595b34cd5382a60bfd39613ac3a2ae Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 10:50:27 -0500 Subject: [PATCH 070/269] Remove newton CG algorithm --- sklearn/linear_model/_glm.py | 100 +++---------------------- sklearn/linear_model/tests/test_glm.py | 8 +- 2 files changed, 13 insertions(+), 95 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 15211c6038007..e11d7c08064ce 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -17,7 +17,6 @@ from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y -from ..utils.optimize import newton_cg from ..utils.validation import check_is_fitted, check_random_state @@ 
-1003,7 +1002,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'lbfgs', 'newton-cg'}, \ + solver : {'auto', 'irls', 'lbfgs'}, \ optional (default='auto') Algorithm to use in the optimization problem: @@ -1016,10 +1015,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): L1 penalties. 'lbfgs' - Calls scipy's L-BFGS-B optimizer. It cannot deal with L1 penalties. + Calls scipy's L-BFGS-B optimizer. - 'newton-cg', 'lbfgs' - Newton conjugate gradient algorithm cannot deal with L1 penalties. Note that all solvers except lbfgs use the fisher matrix, i.e. the expected Hessian instead of the Hessian matrix. @@ -1028,7 +1025,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) - Stopping criterion. For the irls, newton-cg and lbfgs solvers, + Stopping criterion. For the irls, and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. @@ -1212,9 +1209,9 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg']: + if self.solver not in ['auto', 'irls', 'lbfgs']: raise ValueError("GeneralizedLinearRegressor supports only solvers" - " 'auto', 'irls', 'lbfgs', 'newton-cg';" + "'auto', 'irls', 'lbfgs';" " got {0}".format(self.solver)) solver = self.solver if self.solver == 'auto': @@ -1410,6 +1407,9 @@ def func(coef, X, y, weights, P2, family, link): return obj, objp args = (X, y, weights, P2, family, link) + # TODO: refactor this once + # https://github.com/scikit-learn/scikit-learn/pull/14250 + # is merged. coef, loss, info = fmin_l_bfgs_b( func, coef, fprime=None, args=args, iprint=(self.verbose > 0) - 1, pgtol=self.tol, @@ -1423,76 +1423,6 @@ def func(coef, X, y, weights, P2, family, link): .format(info["task"])) self.n_iter_ = info['nit'] - # 4.3 Newton-CG ####################################################### - # We use again the fisher matrix instead of the hessian. More - # precisely, expected hessian of deviance. 
- elif solver == 'newton-cg': - def func(coef, X, y, weights, P2, family, link): - intercept = (coef.size == X.shape[1] + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - L2 = coef[idx:] @ (P2 * coef[idx:]) - else: - L2 = coef[idx:] @ (P2 @ coef[idx:]) - mu = link.inverse(_safe_lin_pred(X, coef)) - return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 - - def grad(coef, X, y, weights, P2, family, link): - mu, devp = \ - family._mu_deviance_derivative(coef, X, y, weights, link) - intercept = (coef.size == X.shape[1] + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - L2 = P2 * coef[idx:] - else: - L2 = P2 @ coef[idx:] - objp = 0.5 * devp - objp[idx:] += L2 - return objp - - def grad_hess(coef, X, y, weights, P2, family, link): - intercept = (coef.size == X.shape[1] + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - L2 = P2 * coef[idx:] - else: - L2 = P2 @ coef[idx:] - eta = _safe_lin_pred(X, coef) - mu = link.inverse(eta) - d1 = link.inverse_derivative(eta) - temp = d1 * family.deviance_derivative(y, mu, weights) - if intercept: - grad = np.concatenate(([0.5 * temp.sum()], - 0.5 * temp @ X + L2)) - else: - grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2 - - # expected hessian = fisher = X.T @ diag_matrix @ X - # calculate only diag_matrix - diag = d1**2 / family.variance(mu, phi=1, weights=weights) - if intercept: - h0i = np.concatenate(([diag.sum()], diag @ X)) - - def Hs(coef): - # return (0.5 * fisher + P2) @ coef - # ret = 0.5 * (X.T @ (diag * (X @ coef))) - ret = 0.5 * ((diag * (X @ coef[idx:])) @ X) - if P2.ndim == 1: - ret += P2 * coef[idx:] - else: - ret += P2 @ coef[idx:] - if intercept: - ret = np.concatenate(([0.5 * (h0i @ coef)], - ret + 0.5 * coef[0] * h0i[1:])) - return ret - - return grad, Hs - - args = (X, y, weights, P2, family, link) - coef, self.n_iter_ = newton_cg(grad_hess, func, grad, coef, - args=args, maxiter=self.max_iter, - tol=self.tol) - ####################################################################### # 5. postprocessing # @@ -1511,7 +1441,7 @@ def Hs(coef): return self - def linear_predictor(self, X): + def _linear_predictor(self, X): """Compute the linear_predictor = X*coef_ + intercept_. Parameters @@ -1552,7 +1482,7 @@ def predict(self, X, sample_weight=None): X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', copy=True, ensure_2d=True, allow_nd=False) - eta = self.linear_predictor(X) + eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) weights = _check_weights(sample_weight, X.shape[0]) @@ -1682,7 +1612,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'lbfgs', 'newton-cg'}, optional (default='irls') + solver : {'irls', 'lbfgs'}, optional (default='irls') Algorithm to use in the optimization problem: 'irls' @@ -1692,17 +1622,11 @@ class PoissonRegressor(GeneralizedLinearRegressor): 'lbfgs' Calls scipy's L-BFGS-B optimizer. - 'newton-cg' - Newton conjugate gradient algorithm. - - Note that all solvers except lbfgs use the fisher matrix, i.e. the - expected Hessian instead of the Hessian matrix. - max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) - Stopping criterion. For the irls, newton-cg and lbfgs solvers, + Stopping criterion. 
For the irls, and lbfgs solvers, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 8fc1241e1da7a..0cee56afb3042 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -25,7 +25,7 @@ from sklearn.utils.testing import assert_array_equal -GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg'] +GLM_SOLVERS = ['irls', 'lbfgs'] @pytest.fixture(scope="module") @@ -381,7 +381,6 @@ def test_glm_identity_regression(solver): ]) @pytest.mark.parametrize('solver, tol', [('irls', 1e-6), ('lbfgs', 1e-6), - ('newton-cg', 1e-7), ]) def test_glm_log_regression(family, solver, tol): """Test GLM regression with log link on a simple dataset.""" @@ -395,9 +394,6 @@ def test_glm_log_regression(family, solver, tol): assert_allclose(res.coef_, coef, rtol=5e-6) -# newton-cg may issue a LineSearchWarning, which we filter out -@pytest.mark.filterwarnings('ignore:The line search algorithm') -@pytest.mark.filterwarnings('ignore:Line Search failed') @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) @pytest.mark.parametrize('fit_intercept', [True, False]) @pytest.mark.parametrize('solver', GLM_SOLVERS) @@ -441,7 +437,6 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): @pytest.mark.parametrize('solver, tol', [('irls', 1e-7), ('lbfgs', 1e-7), - ('newton-cg', 1e-7), ]) def test_poisson_ridge(solver, tol): """Test ridge regression with poisson family and LogLink. @@ -478,7 +473,6 @@ def test_poisson_ridge(solver, tol): {"solver": "irls" }, {"solver": "lbfgs" }, {"solver": "lbfgs"}, - {"solver": "newton-cg"}, ], ids=lambda params: ', '.join("%s=%s" % (key, val) for key, val in params.items()) From 07ee4954ef118227832d9ac2ad562a5aec7af38b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 10:54:20 -0500 Subject: [PATCH 071/269] Remove fisher_matrix, _observed_information and _eta_mu_score_fisher --- sklearn/linear_model/_glm.py | 120 ------------------------- sklearn/linear_model/tests/test_glm.py | 43 --------- 2 files changed, 163 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index e11d7c08064ce..fa434821bb80e 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -253,9 +253,6 @@ class ExponentialDispersionModel(metaclass=ABCMeta): _mu_deviance_derivative _score - _fisher_matrix - _observed_information - _eta_mu_score_fisher References ---------- @@ -518,123 +515,6 @@ def _score(self, coef, phi, X, y, weights, link): score = temp @ X # sampe as X.T @ temp return score - def _fisher_matrix(self, coef, phi, X, y, weights, link): - r"""Compute the Fisher information matrix. - - The Fisher information matrix, also known as expected information - matrix is given by - - .. math: - - \mathbf{F}(\boldsymbol{w}) = - \mathrm{E}\left[-\frac{\partial\mathbf{score}}{\partial - \boldsymbol{w}} \right] - = \mathrm{E}\left[ - -\frac{\partial^2 loglike}{\partial\boldsymbol{w} - \partial\boldsymbol{w}^T}\right] - = \mathbf{X}^T W \mathbf{X} \,, - - with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, - see func:`_score`. 
- """ - lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) - d = link.inverse_derivative(lin_pred) - d2_sigma_inv = sigma_inv * d * d - intercept = (coef.size == X.shape[1] + 1) - fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, - intercept=intercept) - return fisher_matrix - - def _observed_information(self, coef, phi, X, y, weights, link): - r"""Compute the observed information matrix. - - The observed information matrix, also known as the negative of - the Hessian matrix of the log-likelihood, is given by - - .. math: - - \mathbf{H}(\boldsymbol{w}) = - -\frac{\partial^2 loglike}{\partial\boldsymbol{w} - \partial\boldsymbol{w}^T} - = \mathbf{X}^T \left[ - - \mathbf{D}' \mathbf{R} - + \mathbf{D}^2 \mathbf{V} \mathbf{R} - + \mathbf{D}^2 - \right] \boldsymbol{\Sigma}^{-1} \mathbf{X} \,, - - with :math:`\mathbf{R} = \mathrm{diag}(y_i - \mu_i)`, - :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{ - v(\mu_i)} - \right)`, - see :func:`score_` function and :func:`_fisher_matrix`. - """ - lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) - dp = link.inverse_derivative2(lin_pred) - d2 = link.inverse_derivative(lin_pred)**2 - v = self.unit_variance_derivative(mu)/self.unit_variance(mu) - r = y - mu - temp = sigma_inv * (-dp * r + d2 * v * r + d2) - intercept = (coef.size == X.shape[1] + 1) - observed_information = _safe_sandwich_dot(X, temp, - intercept=intercept) - return observed_information - - def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link, - diag_fisher=False): - """Compute linear predictor, mean, score function and fisher matrix. - - It calculates the linear predictor, the mean, score function - (derivative of log-likelihood) and Fisher information matrix - all in one go as function of `coef` (:math:`w`) and the data. - - Parameters - ---------- - diag_fisher : boolean, optional (default=False) - If ``True``, returns only an array d such that - fisher = X.T @ np.diag(d) @ X. - - Returns - ------- - (eta, mu, score, fisher) : tuple with 4 elements - The 4 elements are: - - * eta: ndarray, shape (X.shape[0],) - * mu: ndarray, shape (X.shape[0],) - * score: ndarray, shape (X.shape[0],) - * fisher: - - * If diag_fisher is ``False``, the full fisher matrix, - an array of shape (X.shape[1], X.shape[1]) - * If diag_fisher is ``True`, an array of shape (X.shape[0]) - """ - intercept = (coef.size == X.shape[1] + 1) - # eta = linear predictor - eta = _safe_lin_pred(X, coef) - mu = link.inverse(eta) - sigma_inv = 1./self.variance(mu, phi=phi, weights=weights) - d1 = link.inverse_derivative(eta) # = h'(eta) - # Alternatively: - # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g - # d1 = 1./link.derivative(mu) - d1_sigma_inv = d1 * sigma_inv - temp = d1_sigma_inv * (y - mu) - if intercept: - score = np.concatenate(([temp.sum()], temp @ X)) - else: - score = temp @ X - - d2_sigma_inv = d1 * d1_sigma_inv - if diag_fisher: - fisher_matrix = d2_sigma_inv - else: - fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, - intercept=intercept) - return eta, mu, score, fisher_matrix - class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. 
diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 0cee56afb3042..b9716388fb36c 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -109,49 +109,6 @@ def test_deviance_zero(family, chk_values): assert_allclose(family.deviance(x, x), 0, atol=1e-9) -@pytest.mark.parametrize( - 'family, link', - [(NormalDistribution(), IdentityLink()), - (PoissonDistribution(), LogLink()), - (GammaDistribution(), LogLink()), - (InverseGaussianDistribution(), LogLink()), - (TweedieDistribution(power=1.5), LogLink()), - (TweedieDistribution(power=4.5), LogLink())], - ids=lambda args: args.__class__.__name__) -def test_fisher_matrix(family, link): - """Test the Fisher matrix numerically. - Trick: Use numerical differentiation with y = mu""" - coef = np.array([-2, 1, 0, 1, 2.5]) - phi = 0.5 - rng = np.random.RandomState(42) - X = rng.randn(10, 5) - lin_pred = np.dot(X, coef) - mu = link.inverse(lin_pred) - weights = rng.randn(10)**2 + 1 - fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link) - # check that the Fisher matrix is square and positive definite - assert fisher.ndim == 2 - assert fisher.shape[0] == fisher.shape[1] - assert np.all(np.linalg.eigvals(fisher) >= 0) - - approx = np.array([]).reshape(0, coef.shape[0]) - for i in range(coef.shape[0]): - def f(coef): - return -family._score(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link)[i] - approx = np.vstack( - [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) - assert_allclose(fisher, approx, rtol=1e-3) - - # check the observed information matrix - oim = family._observed_information(coef=coef, phi=phi, X=X, y=mu, - weights=weights, link=link) - assert oim.ndim == 2 - assert oim.shape == fisher.shape - assert_allclose(oim, fisher) - - def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" # scalar value but not positive From d0eb2850b91e74d1e8591f1bbd758cb06d85a3bc Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 11:21:18 -0500 Subject: [PATCH 072/269] Remove matrix L2 penalty and IRLS solver --- sklearn/linear_model/_glm.py | 327 ++----------------------- sklearn/linear_model/tests/test_glm.py | 57 +---- 2 files changed, 26 insertions(+), 358 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index fa434821bb80e..b469b3c4edd17 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -669,164 +669,16 @@ def __init__(self): } -def _irls_step(X, W, P2, z, fit_intercept=True): - """Compute one step in iteratively reweighted least squares. - - Solve A w = b for w with - A = (X' W X + P2) - b = X' W z - z = eta + D^-1 (y-mu) - - See also fit method of :class:`GeneralizedLinearRegressor`. - - Parameters - ---------- - X : {ndarray, sparse matrix}, shape (n_samples, n_features) - Training data (with intercept included if present) - - W : ndarray, shape (n_samples,) - - P2 : {ndarray, sparse matrix}, shape (n_features, n_features) - The L2-penalty matrix or vector (=diagonal matrix) - - z : ndarray, shape (n_samples,) - Working observations - - fit_intercept : boolean, optional (default=True) - - Returns - ------- - coef : ndarray, shape (c,) - If fit_intercept=False, shape c=X.shape[1]. - If fit_intercept=True, then c=X.shapee[1] + 1. - """ - # Note: solve vs least squares, what is more appropriate? - # scipy.linalg.solve seems faster, but scipy.linalg.lstsq - # is more robust. 
- # Note: X.T @ W @ X is not sparse, even when X is sparse. - # Sparse solver would splinalg.spsolve(A, b) or splinalg.lsmr(A, b) - if fit_intercept: - Wz = W * z - if sparse.issparse(X): - b = np.concatenate(([Wz.sum()], X.transpose() @ Wz)) - else: - b = np.concatenate(([Wz.sum()], X.T @ Wz)) - A = _safe_sandwich_dot(X, W, intercept=fit_intercept) - if P2.ndim == 1: - idx = np.arange(start=1, stop=A.shape[0]) - A[(idx, idx)] += P2 # add to diag elements without intercept - elif sparse.issparse(P2): - A[1:, 1:] += P2.toarray() - else: - A[1:, 1:] += P2 - else: - if sparse.issparse(X): - XtW = X.transpose().multiply(W) - # for older versions of numpy and scipy, A may be a np.matrix - A = _safe_toarray(XtW @ X) - else: - XtW = (X.T * W) - A = XtW @ X - b = XtW @ z - if P2.ndim == 1: - A[np.diag_indices_from(A)] += P2 - elif sparse.issparse(P2): - A += P2.toarray() - else: - A += P2 - - coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) - return coef - - -def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, - max_iter, tol): - """Solve GLM with L2 penalty by IRLS algorithm. - - Note: If X is sparse, P2 must also be sparse. - """ - # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' - # Obj = objective function = 1/2 Dev + l2/2 w P2 w - # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 - # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) - # D2 = link.inverse_derivative(eta)^2 = D^2 - # W = D2/V(mu) - # l2 = alpha - # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w - # = -X' D (y-mu)/V(mu) + l2 P2 w - # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 - # Use Fisher matrix instead of full info matrix -X'(...) X, - # i.e. E[Dev''] with E[y-mu]=0: - # Obj'' ~ X' W X + l2 P2 - # (1): w = (X' W X + l2 P2)^-1 X' W z, - # with z = eta + D^-1 (y-mu) - # Note: P2 must be symmetrized - # Note: ' denotes derivative, but also transpose for matrices - - eta = _safe_lin_pred(X, coef) - mu = link.inverse(eta) - # D = h'(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - - converged = False - n_iter = 0 - while n_iter < max_iter: - n_iter += 1 - # coef_old not used so far. - # coef_old = coef - # working weights W, in principle a diagonal matrix - # therefore here just as 1d array - W = hp**2 / V - # working observations - z = eta + (y - mu) / hp - # solve A*coef = b - # A = X' W X + P2, b = X' W z - coef = _irls_step(X, W, P2, z, fit_intercept=fit_intercept) - # updated linear predictor - # do it here for updated values for tolerance - eta = _safe_lin_pred(X, coef) - mu = link.inverse(eta) - hp = link.inverse_derivative(eta) - V = family.variance(mu, phi=1, weights=weights) - - # which tolerace? |coef - coef_old| or gradient? - # use gradient for compliance with newton-cg and lbfgs - # gradient = -X' D (y-mu)/V(mu) + l2 P2 w - temp = hp * (y - mu) / V - if sparse.issparse(X): - gradient = -(X.transpose() @ temp) - else: - gradient = -(X.T @ temp) - idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - gradient += P2 * coef[idx:] - else: - gradient += P2 @ coef[idx:] - if fit_intercept: - gradient = np.concatenate(([-temp.sum()], gradient)) - if (np.max(np.abs(gradient)) <= tol): - converged = True - break - - if not converged: - warnings.warn("irls failed to converge. 
Increase the number " - "of iterations (currently {0})" - .format(max_iter), ConvergenceWarning) - - return coef, n_iter - - class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and predicting the mean of the target y as mu=h(X*w). Therefore, - the fit minimizes the following objective function with combined L1 and L2 + the fit minimizes the following objective function with L2 priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) - + 1/2 * alpha * w*P2*w + + 1/2 * alpha * |w|_2 with inverse link function h and s=sample_weight. The parameter ``alpha`` corresponds to the lambda parameter in glmnet. @@ -843,18 +695,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): case, the design matrix X must have full column rank (no collinearities). - P2 : {'identity', array-like, sparse matrix}, shape \ - (n_features,) or (n_features, n_features), optional \ - (default='identity') - With this option, you can set the P2 matrix in the L2 penalty `w*P2*w`. - This gives a fine control over this penalty (Tikhonov regularization). - A 2d array is directly used as the square matrix P2. A 1d array is - interpreted as diagonal (square) matrix. The default 'identity' sets - the identity matrix, which gives the usual squared L2-norm. If you just - want to exclude certain coefficients, pass a 1d array filled with 1, - and 0 for the coefficients to be excluded. - Note that P2 must be positive semi-definite. - fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). @@ -882,17 +722,11 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'auto', 'irls', 'lbfgs'}, \ - optional (default='auto') + solver : {'auto', 'lbfgs'}, optional (default='auto') Algorithm to use in the optimization problem: 'auto' - Sets 'irls' - - 'irls' - Iterated reweighted least squares. - It is the standard algorithm for GLMs. It cannot deal with - L1 penalties. + Sets 'lbfgs' 'lbfgs' Calls scipy's L-BFGS-B optimizer. @@ -905,7 +739,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) - Stopping criterion. For the irls, and lbfgs solvers, + Stopping criterion. For the lbfgs solver, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. @@ -920,22 +754,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): number generator; if None, the random number generator is the RandomState instance used by `np.random`. - diag_fisher : boolean, optional, (default=False) - Only relevant for solver 'cd'. - If ``False``, the full Fisher matrix (expected Hessian) is computed in - each outer iteration (Newton iteration). If ``True``, only a diagonal - matrix (stored as 1d array) is computed, such that - fisher = X.T @ diag @ X. This saves memory and matrix-matrix - multiplications, but needs more matrix-vector multiplications. If you - use large sparse X or if you have many features, - i.e. n_features >> n_samples, you might set this option to ``True``. 
- copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. check_input : boolean, optional (default=True) Allow to bypass several checks on input: y values in range of family, - sample_weight non-negative, P2 positive semi-definite. + sample_weight non-negative. Don't use this parameter unless you know what you do. verbose : int, optional (default=0) @@ -991,14 +815,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Journal of Machine Learning Research 13 (2012) 1999-2030 https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ - def __init__(self, alpha=1.0, P2='identity', + def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, - random_state=None, diag_fisher=False, + random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha - self.P2 = P2 self.fit_intercept = fit_intercept self.family = family self.link = link @@ -1008,7 +831,6 @@ def __init__(self, alpha=1.0, P2='identity', self.tol = tol self.warm_start = warm_start self.random_state = random_state - self.diag_fisher = diag_fisher self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -1051,8 +873,8 @@ def fit(self, X, y, sample_weight=None): raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " - "'binomial']; got (family={0})".format(self.family)) + " ['normal', 'poisson', 'gamma', 'inverse.gaussian']" + "; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of # class Link @@ -1089,13 +911,13 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['auto', 'irls', 'lbfgs']: + if self.solver not in ['auto', 'lbfgs']: raise ValueError("GeneralizedLinearRegressor supports only solvers" - "'auto', 'irls', 'lbfgs';" + "'auto', 'lbfgs';" " got {0}".format(self.solver)) solver = self.solver if self.solver == 'auto': - solver = 'irls' + solver = 'lbfgs' if (not isinstance(self.max_iter, int) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " @@ -1108,9 +930,6 @@ def fit(self, X, y, sample_weight=None): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) random_state = check_random_state(self.random_state) - if not isinstance(self.diag_fisher, bool): - raise ValueError("The argument diag_fisher must be bool;" - " got {0}".format(self.diag_fisher)) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -1133,95 +952,12 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = X.shape - # 1.3 arguments to take special care ################################## - # P2 - - # If X is sparse, make P2 sparse, too. 
- if isinstance(self.P2, str) and self.P2 == 'identity': - if sparse.issparse(X): - P2 = (sparse.dia_matrix((np.ones(n_features), 0), - shape=(n_features, n_features))).tocsc() - else: - P2 = np.ones(n_features) - else: - P2 = check_array(self.P2, copy=True, - accept_sparse=_stype, - dtype=_dtype, ensure_2d=False) - if P2.ndim == 1: - P2 = np.asarray(P2) - if P2.shape[0] != n_features: - raise ValueError("P2 should be a 1d array of shape " - "(n_features,) with " - "n_features=X.shape[1]; " - "got (P2.shape=({0},)), needed ({1},)" - .format(P2.shape[0], X.shape[1])) - if sparse.issparse(X): - P2 = (sparse.dia_matrix((P2, 0), - shape=(n_features, n_features))).tocsc() - elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and - P2.shape[0] == X.shape[1]): - if sparse.issparse(X): - P2 = (sparse.dia_matrix((P2, 0), - shape=(n_features, n_features))).tocsc() - else: - raise ValueError("P2 must be either None or an array of shape " - "(n_features, n_features) with " - "n_features=X.shape[1]; " - "got (P2.shape=({0}, {1})), needed ({2}, {2})" - .format(P2.shape[0], P2.shape[1], X.shape[1])) - - l2 = self.alpha - # P2 is now for sure a copy - P2 = l2 * P2 - # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') - # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric - if P2.ndim == 2: - if sparse.issparse(P2): - if sparse.isspmatrix_csc(P2): - P2 = 0.5 * (P2 + P2.transpose()).tocsc() - else: - P2 = 0.5 * (P2 + P2.transpose()).tocsr() - else: - P2 = 0.5 * (P2 + P2.T) - - # For coordinate descent, if X is sparse, P2 must also be csc - if solver == 'cd' and sparse.issparse(X): - P2 = sparse.csc_matrix(P2) - # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " "range for family {0}" .format(family.__class__.__name__)) - # check if P2 is positive semidefinite - # np.linalg.cholesky(P2) 'only' asserts positive definite - if not isinstance(self.P2, str): # self.P2 != 'identity' - # due to numerical precision, we allow eigenvalues to be a - # tiny bit negative - epsneg = -10 * np.finfo(P2.dtype).epsneg - if P2.ndim == 1 or P2.shape[0] == 1: - p2 = P2 - if sparse.issparse(P2): - p2 = P2.toarray() - if not np.all(p2 >= 0): - raise ValueError("1d array P2 must not have negative " - "values.") - elif sparse.issparse(P2): - # for sparse matrices, not all eigenvals can be computed - # efficiently, use only half of n_features - # k = how many eigenvals to compute - k = np.min([10, n_features // 10 + 1]) - sigma = 0 # start searching near this value - which = 'SA' # find smallest algebraic eigenvalues first - eigenvalues = splinalg.eigsh(P2, k=k, sigma=sigma, - which=which, - return_eigenvectors=False) - if not np.all(eigenvalues >= epsneg): - raise ValueError("P2 must be positive semi-definite.") - else: - if not np.all(linalg.eigvalsh(P2) >= epsneg): - raise ValueError("P2 must be positive semi-definite.") # TODO: if alpha=0 check that X is not rank deficient # TODO: what else to check? @@ -1229,10 +965,10 @@ def fit(self, X, y, sample_weight=None): # 2. 
rescaling of weights (sample_weight) # ####################################################################### # IMPORTANT NOTE: Since we want to minimize - # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # 1/(2*sum(sample_weight)) * deviance + L2, # deviance = sum(sample_weight * unit_deviance), # we rescale weights such that sum(weights) = 1 and this becomes - # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + # 1/2*deviance + L2 with deviance=sum(weights * unit_deviance) weights_sum = np.sum(weights) weights = weights/weights_sum @@ -1260,33 +996,21 @@ def fit(self, X, y, sample_weight=None): ####################################################################### # algorithms for optimization - # 4.1 IRLS ############################################################ - # Note: we already set P2 = l2*P2, see above - # Note: we already symmetrized P2 = 1/2 (P2 + P2') - if solver == 'irls': - coef, self.n_iter_ = \ - _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, - fit_intercept=self.fit_intercept, family=family, - link=link, max_iter=self.max_iter, tol=self.tol) - - # 4.2 L-BFGS ########################################################## - elif solver == 'lbfgs': - def func(coef, X, y, weights, P2, family, link): + # 4.1 L-BFGS ########################################################## + if solver == 'lbfgs': + def func(coef, X, y, weights, alpha, family, link): mu, devp = \ family._mu_deviance_derivative(coef, X, y, weights, link) dev = family.deviance(y, mu, weights) intercept = (coef.size == X.shape[1] + 1) idx = 1 if intercept else 0 # offset if coef[0] is intercept - if P2.ndim == 1: - L2 = P2 * coef[idx:] - else: - L2 = P2 @ coef[idx:] + L2 = alpha * coef[idx:] obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) objp = 0.5 * devp objp[idx:] += L2 return obj, objp - args = (X, y, weights, P2, family, link) + args = (X, y, weights, self.alpha, family, link) # TODO: refactor this once # https://github.com/scikit-learn/scikit-learn/pull/14250 # is merged. @@ -1492,13 +1216,9 @@ class PoissonRegressor(GeneralizedLinearRegressor): the chi squared statistic or the deviance statistic. If None, the dispersion is not estimated. - solver : {'irls', 'lbfgs'}, optional (default='irls') + solver : {'lbfgs'}, optional (default='lbfgs') Algorithm to use in the optimization problem: - 'irls' - Iterated reweighted least squares. It is the standard algorithm - for GLMs. - 'lbfgs' Calls scipy's L-BFGS-B optimizer. @@ -1506,7 +1226,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): The maximal number of iterations for solver algorithms. tol : float, optional (default=1e-4) - Stopping criterion. For the irls, and lbfgs solvers, + Stopping criterion. For the lbfgs solver, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of the objective function. 
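As an aside, the L2-penalized objective handed to the lbfgs solver via ``func`` above can be sketched in isolation. The following is a minimal, hypothetical illustration specialized to the normal family with identity link, so the deviance reduces to a weighted squared error; the helper name ``objective_and_grad`` and the toy data are illustrative only and do not appear in the patch.

    import numpy as np
    import scipy.optimize

    def objective_and_grad(coef, X, y, weights, alpha):
        # coef[0] is the intercept, coef[1:] are the feature coefficients
        lin_pred = X @ coef[1:] + coef[0]
        resid = y - lin_pred
        dev = np.sum(weights * resid ** 2)            # normal-family deviance
        devp = np.concatenate(([-2 * np.sum(weights * resid)],
                               -2 * (X.T @ (weights * resid))))
        obj = 0.5 * dev + 0.5 * alpha * (coef[1:] @ coef[1:])
        grad = 0.5 * devp
        grad[1:] += alpha * coef[1:]                  # intercept is not penalized
        return obj, grad

    rng = np.random.RandomState(0)
    X = rng.rand(20, 3)
    y = X @ np.array([1.0, 2.0, 3.0]) + 0.5
    weights = np.full(20, 1 / 20)                     # rescaled so sum(weights) == 1
    res = scipy.optimize.minimize(objective_and_grad, np.zeros(4), jac=True,
                                  method="L-BFGS-B", args=(X, y, weights, 1e-3))

Leaving the intercept out of the penalty mirrors the ``idx`` offset handling in ``func`` above.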
@@ -1575,8 +1295,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf """ def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, - solver='irls', max_iter=100, - tol=1e-4, warm_start=False, + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, random_state=None, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index b9716388fb36c..877b37993f587 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -25,7 +25,7 @@ from sklearn.utils.testing import assert_array_equal -GLM_SOLVERS = ['irls', 'lbfgs'] +GLM_SOLVERS = ['lbfgs'] @pytest.fixture(scope="module") @@ -193,39 +193,6 @@ def test_glm_alpha_argument(alpha): glm.fit(X, y) -@pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], - sparse.csr_matrix([1, 2, 3]), [-1]]) -def test_glm_P2_argument(P2): - """Test GLM for invalid P2 argument.""" - y = np.array([1, 2]) - X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(P2=P2, check_input=True) - with pytest.raises(ValueError): - glm.fit(X, y) - - -def test_glm_P2_positive_semidefinite(): - """Test GLM for a positive semi-definite P2 argument.""" - n_samples, n_features = 10, 5 - y = np.arange(n_samples) - X = np.zeros((n_samples, n_features)) - P2 = np.diag([100, 10, 5, 0, -1E-5]) - rng = np.random.RandomState(42) - # construct random orthogonal matrix Q - Q, R = linalg.qr(rng.randn(n_features, n_features)) - P2 = Q.T @ P2 @ Q - glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, - check_input=True) - with pytest.raises(ValueError, match="P2 must be positive semi-definite"): - glm.fit(X, y) - - P2 = sparse.csr_matrix(P2) - glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, - check_input=True) - with pytest.raises(ValueError, match="P2 must be positive semi-definite"): - glm.fit(X, y) - - @pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) def test_glm_fit_intercept_argument(fit_intercept): """Test GLM for invalid fit_intercept argument.""" @@ -287,16 +254,6 @@ def test_glm_random_state_argument(random_state): glm.fit(X, y) -@pytest.mark.parametrize('diag_fisher', ['not bool', 1, 0, [True]]) -def test_glm_diag_fisher_argument(diag_fisher): - """Test GLM for invalid diag_fisher arguments.""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) - with pytest.raises(ValueError, match="diag_fisher must be bool"): - glm.fit(X, y) - - @pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) def test_glm_copy_X_argument(copy_X): """Test GLM for invalid copy_X arguments.""" @@ -336,9 +293,7 @@ def test_glm_identity_regression(solver): GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), ]) -@pytest.mark.parametrize('solver, tol', [('irls', 1e-6), - ('lbfgs', 1e-6), -]) +@pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-6)]) def test_glm_log_regression(family, solver, tol): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] @@ -391,10 +346,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) -@pytest.mark.parametrize('solver, tol', - [('irls', 1e-7), - ('lbfgs', 1e-7), -]) +@pytest.mark.parametrize('solver, tol', [('lbfgs', 
1e-7)]) def test_poisson_ridge(solver, tol): """Test ridge regression with poisson family and LogLink. @@ -426,9 +378,6 @@ def test_poisson_ridge(solver, tol): @pytest.mark.parametrize( "params", [ - {"solver": "irls" }, - {"solver": "irls" }, - {"solver": "lbfgs" }, {"solver": "lbfgs"}, ], ids=lambda params: ', '.join("%s=%s" % (key, val) From 1e4b5380b7ad699e5d059ef6d9836c37a7ba16e2 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 11:23:31 -0500 Subject: [PATCH 073/269] Remove plot_poisson_spline_regression.py example --- .../plot_poisson_spline_regression.py | 85 ------------------- 1 file changed, 85 deletions(-) delete mode 100644 examples/linear_model/plot_poisson_spline_regression.py diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py deleted file mode 100644 index 30b5881bba1f5..0000000000000 --- a/examples/linear_model/plot_poisson_spline_regression.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -================================= -Poisson Regression with B-Splines -================================= - -As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py` -example, a Poisson regression with penalized B-splines (P-splines) [1]_ is -fitted on slightly different sinusoidal, Poisson distributed data and -compared to an AdaBoost model with decision trees. -One can see, that this is a hard problem for both estimators. - -.. [1] Eilers, Paul H. C.; Marx, Brian D. "Flexible smoothing with B -splines - and penalties". Statist. Sci. 11 (1996), no. 2, 89--121. - `doi:10.1214/ss/1038425655 - `_ - -""" -print(__doc__) - -# Author: Christian Lorentzen -# based on the AdaBoost regression example from Noel Dawe -# License: BSD 3 clause - -# importing necessary libraries -import numpy as np -from scipy.linalg import toeplitz -# from scipy.interpolate import BSpline -from scipy.interpolate import splev -import matplotlib.pyplot as plt -from sklearn.tree import DecisionTreeRegressor -from sklearn.ensemble import AdaBoostRegressor -from sklearn.linear_model import GeneralizedLinearRegressor - - -# Create the dataset -xmin, xmax = 0, 6 -rng = np.random.RandomState(1) -X = np.linspace(xmin, xmax, 500)[:, np.newaxis] -y_true = 0.5 * (2.1 + np.sin(X).ravel() + np.sin(6 * X).ravel()) -y = rng.poisson(y_true, X.shape[0]) - -# b-spline basis -nknots, degree = 40, 3 -ns = nknots - degree - 1 # number of base spline functions -dx = (xmax - xmin) / (nknots - 1 - 2 * degree) -knots = np.linspace(xmin - degree * dx, 6 + degree * dx, nknots) -coef = np.zeros(ns) -splineBasis = np.empty((X.shape[0], ns), dtype=float) -for i in range(ns): - coef[i] = 1 -# splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \ -# .ravel() - splineBasis[:, i] = splev(X, (knots, coef, degree)).ravel() - coef[i] = 0 - -# second order difference matrix -P2 = toeplitz([2, -1] + [0] * (ns - 2)).astype(float) -P2[0, 0] = P2[-1, -1] = 1 - -# Fit regression model -regr_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), - n_estimators=10, random_state=rng) - -regr_2 = GeneralizedLinearRegressor(family='poisson', link='log', - fit_intercept=True, alpha=0.02, - l1_ratio=0.1, P2=P2) - -regr_1.fit(X, y) -regr_2.fit(splineBasis, y) - -# Predict -y_1 = regr_1.predict(X) -y_2 = regr_2.predict(splineBasis) - -# Plot the results -plt.figure() -plt.plot(X, y_true, c="b", label="true mean") -plt.scatter(X, y, c="k", marker='.', label="training samples") -plt.plot(X, y_1, c="g", label="AdaBoost n_estimator=10", linewidth=2) 
-plt.plot(X, y_2, c="r", label="Poisson GLM with B-splines", linewidth=2) -plt.xlabel("data") -plt.ylabel("target") -plt.title("Regression Comparison") -plt.legend() -plt.show() From 3265148aa4df59bd1a4c8ea5eb8e2102bb650006 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 11:38:48 -0500 Subject: [PATCH 074/269] Remove random_state parameter --- sklearn/linear_model/_glm.py | 30 ++++--------------- sklearn/linear_model/tests/test_glm.py | 41 ++++++++------------------ 2 files changed, 17 insertions(+), 54 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index b469b3c4edd17..4a1f7c260e649 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -10,15 +10,13 @@ from abc import ABCMeta, abstractmethod import numbers import numpy as np -from scipy import linalg, sparse, special -import scipy.sparse.linalg as splinalg +from scipy import sparse, special from scipy.optimize import fmin_l_bfgs_b import warnings from ..base import BaseEstimator, RegressorMixin from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y -from ..utils.validation import check_is_fitted, check_random_state - +from ..utils.validation import check_is_fitted def _check_weights(sample_weight, n_samples): @@ -680,7 +678,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * |w|_2 - with inverse link function h and s=sample_weight. + with inverse link function h and s=sample_weight. The parameter ``alpha`` corresponds to the lambda parameter in glmnet. Read more in the :ref:`User Guide `. @@ -742,18 +740,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Stopping criterion. For the lbfgs solver, the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` where ``g_i`` is the i-th component of the gradient (derivative) of - the objective function. + the objective function. warm_start : boolean, optional (default=False) If set to ``True``, reuse the solution of the previous call to ``fit`` as initialization for ``coef_`` and ``intercept_``. - random_state : {int, RandomState instance, None}, optional (default=None) - If int, random_state is the seed used by the random - number generator; if RandomState instance, random_state is the random - number generator; if None, the random number generator is the - RandomState instance used by `np.random`. - copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. 
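For orientation, the estimator documented above can be used as in the following sketch, which mirrors the small doctest in the narrative documentation of this series; it assumes a build of this branch is installed.

    from sklearn.linear_model import GeneralizedLinearRegressor

    # alpha plays the role of glmnet's lambda: larger values shrink the
    # coefficients more strongly towards zero.
    reg = GeneralizedLinearRegressor(family='poisson', link='log', alpha=0.5)
    reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2])
    print(reg.coef_, reg.intercept_)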
@@ -819,7 +811,6 @@ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', fit_dispersion=None, solver='auto', max_iter=100, tol=1e-4, warm_start=False, - random_state=None, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.fit_intercept = fit_intercept @@ -830,7 +821,6 @@ def __init__(self, alpha=1.0, self.max_iter = max_iter self.tol = tol self.warm_start = warm_start - self.random_state = random_state self.copy_X = copy_X self.check_input = check_input self.verbose = verbose @@ -929,7 +919,6 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.warm_start, bool): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) - random_state = check_random_state(self.random_state) if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) @@ -959,7 +948,6 @@ def fit(self, X, y, sample_weight=None): "range for family {0}" .format(family.__class__.__name__)) # TODO: if alpha=0 check that X is not rank deficient - # TODO: what else to check? ####################################################################### # 2. rescaling of weights (sample_weight) # @@ -1027,7 +1015,6 @@ def func(coef, X, y, weights, alpha, family, link): .format(info["task"])) self.n_iter_ = info['nit'] - ####################################################################### # 5. postprocessing # ####################################################################### @@ -1235,12 +1222,6 @@ class PoissonRegressor(GeneralizedLinearRegressor): If set to ``True``, reuse the solution of the previous call to ``fit`` as initialization for ``coef_`` and ``intercept_`` . - random_state : {int, RandomState instance, None}, optional (default=None) - If int, random_state is the seed used by the random - number generator; if RandomState instance, random_state is the random - number generator; if None, the random number generator is the - RandomState instance used by `np.random`. - copy_X : boolean, optional, (default=True) If ``True``, X will be copied; else, it may be overwritten. 
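The ``warm_start`` behaviour described above can be exercised as in this hypothetical sketch; the toy data and parameter values are made up, and a build of this branch is assumed.

    import numpy as np
    from sklearn.linear_model import PoissonRegressor

    rng = np.random.RandomState(0)
    X = rng.rand(50, 2)
    y = rng.poisson(np.exp(X @ np.array([0.3, -0.2])))

    reg = PoissonRegressor(alpha=1.0, warm_start=True)
    reg.fit(X, y)
    reg.set_params(alpha=0.1)
    reg.fit(X, y)   # second fit starts from the previous coef_ and intercept_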
@@ -1296,11 +1277,10 @@ class PoissonRegressor(GeneralizedLinearRegressor): """ def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, - random_state=None, copy_X=True, check_input=True, verbose=0): + copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link='log', fit_dispersion=fit_dispersion, solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, - random_state=random_state, copy_X=copy_X, verbose=verbose) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index 877b37993f587..a921c7e065878 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -5,10 +5,8 @@ import numpy as np from numpy.testing import assert_allclose import pytest -import scipy as sp -from scipy import linalg, optimize, sparse -from sklearn.datasets import make_classification, make_regression +from sklearn.datasets import make_regression from sklearn.linear_model import GeneralizedLinearRegressor from sklearn.linear_model._glm import ( Link, @@ -19,7 +17,7 @@ NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, ) -from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge +from sklearn.linear_model import Ridge from sklearn.metrics import mean_absolute_error from sklearn.exceptions import ConvergenceWarning @@ -101,8 +99,7 @@ def test_tweedie_distribution_power(): (TweedieDistribution(power=-1), [0.1, 1.5]), (TweedieDistribution(power=1.5), [0.1, 1.5]), (TweedieDistribution(power=2.5), [0.1, 1.5]), - (TweedieDistribution(power=-4), [0.1, 1.5]), -]) + (TweedieDistribution(power=-4), [0.1, 1.5])]) def test_deviance_zero(family, chk_values): """Test deviance(y,y) = 0 for different families.""" for x in chk_values: @@ -151,8 +148,7 @@ def test_sample_weights_validation(): [('normal', NormalDistribution()), ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), - ('inverse.gaussian', InverseGaussianDistribution()), -]) + ('inverse.gaussian', InverseGaussianDistribution())]) def test_glm_family_argument(f, fam): """Test GLM family argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions @@ -244,16 +240,6 @@ def test_glm_warm_start_argument(warm_start): glm.fit(X, y) -@pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) -def test_glm_random_state_argument(random_state): - """Test GLM for invalid random_state argument.""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(random_state=random_state) - with pytest.raises(ValueError, match="cannot be used to seed"): - glm.fit(X, y) - - @pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) def test_glm_copy_X_argument(copy_X): """Test GLM for invalid copy_X arguments.""" @@ -291,8 +277,7 @@ def test_glm_identity_regression(solver): 'family', [NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), - TweedieDistribution(power=1.5), TweedieDistribution(power=4.5), -]) + TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]) @pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-6)]) def test_glm_log_regression(family, solver, tol): """Test GLM regression with log link on a simple dataset.""" @@ -338,7 +323,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', 
link='identity', fit_intercept=True, max_iter=300, solver=solver, tol=1e-6, - check_input=False, random_state=42) + check_input=False) glm.fit(X, y) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6) @@ -369,7 +354,7 @@ def test_poisson_ridge(solver, tol): fit_intercept=True, family='poisson', link='log', tol=1e-7, solver=solver, max_iter=300, - random_state=rng) + ) glm.fit(X, y) assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5) @@ -385,11 +370,10 @@ def test_poisson_ridge(solver, tol): ) def test_solver_equivalence(params, regression_data): X, y = regression_data - est_ref = GeneralizedLinearRegressor(random_state=2) + est_ref = GeneralizedLinearRegressor() est_ref.fit(X, y) estimator = GeneralizedLinearRegressor(**params) - estimator.set_params(random_state=2) estimator.fit(X, y) @@ -405,16 +389,15 @@ def test_solver_equivalence(params, regression_data): def test_fit_dispersion(regression_data): X, y = regression_data - est1 = GeneralizedLinearRegressor(random_state=2) + est1 = GeneralizedLinearRegressor() est1.fit(X, y) assert not hasattr(est1, "dispersion_") - est2 = GeneralizedLinearRegressor(random_state=2, fit_dispersion="chisqr") + est2 = GeneralizedLinearRegressor(fit_dispersion="chisqr") est2.fit(X, y) assert isinstance(est2.dispersion_, float) - est3 = GeneralizedLinearRegressor( - random_state=2, fit_dispersion="deviance") + est3 = GeneralizedLinearRegressor(fit_dispersion="deviance") est3.fit(X, y) assert isinstance(est3.dispersion_, float) @@ -425,7 +408,7 @@ def test_fit_dispersion(regression_data): def test_convergence_warning(solver, regression_data): X, y = regression_data - est = GeneralizedLinearRegressor(solver=solver, random_state=2, + est = GeneralizedLinearRegressor(solver=solver, max_iter=1, tol=1e-20) with pytest.warns(ConvergenceWarning): est.fit(X, y) From 1862ab6811db3a6a4ad54719fcd048dc151d0a37 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 9 Jul 2019 11:55:13 -0500 Subject: [PATCH 075/269] Lint --- sklearn/linear_model/tests/test_glm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py index a921c7e065878..1712f7b5e1d3d 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/tests/test_glm.py @@ -349,7 +349,6 @@ def test_poisson_ridge(solver, tol): # b 0.03741173122 X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T y = np.array([0, 1, 1, 2]) - rng = np.random.RandomState(42) glm = GeneralizedLinearRegressor(alpha=1, fit_intercept=True, family='poisson', link='log', tol=1e-7, From 4154074a7367be310976a8a1bba00ed737cf9e3d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 10 Jul 2019 14:16:05 +0200 Subject: [PATCH 076/269] Fix docstring --- doc/modules/linear_model.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index eba51315d2ae8..681a13cdf9d42 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -940,9 +940,9 @@ follows: >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') >>> reg.coef_ - array([0.24630169, 0.43373464]) + array([0.2463..., 0.4337...]) >>> reg.intercept_ - -0.76383633... + -0.7638... .. 
topic:: Examples: From c5d77d78d78a57b242e401f0f2f3efa572e74105 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 10 Jul 2019 14:20:42 +0200 Subject: [PATCH 077/269] Remove unused core --- sklearn/linear_model/_glm.py | 68 +----------------------------------- 1 file changed, 1 insertion(+), 67 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 4a1f7c260e649..244c781ae8f48 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -10,7 +10,7 @@ from abc import ABCMeta, abstractmethod import numbers import numpy as np -from scipy import sparse, special +from scipy import special from scipy.optimize import fmin_l_bfgs_b import warnings from ..base import BaseEstimator, RegressorMixin @@ -54,42 +54,6 @@ def _safe_lin_pred(X, coef): return X @ coef -def _safe_toarray(X): - """Returns a numpy array.""" - if sparse.issparse(X): - return X.toarray() - else: - return np.asarray(X) - - -def _safe_sandwich_dot(X, d, intercept=False): - """Compute sandwich product X.T @ diag(d) @ X. - - With ``intercept=True``, X is treated as if a column of 1 were appended as - first column of X. - X can be sparse, d must be an ndarray. Always returns a ndarray.""" - if sparse.issparse(X): - temp = (X.transpose() @ X.multiply(d[:, np.newaxis])) - # for older versions of numpy and scipy, temp may be a np.matrix - temp = _safe_toarray(temp) - else: - temp = (X.T * d) @ X - if intercept: - dim = X.shape[1] + 1 - if sparse.issparse(X): - order = 'F' if sparse.isspmatrix_csc(X) else 'C' - else: - order = 'F' if X.flags['F_CONTIGUOUS'] else 'C' - res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) - res[0, 0] = d.sum() - res[1:, 0] = d @ X - res[0, 1:] = res[1:, 0] - res[1:, 1:] = temp - else: - res = temp - return res - - class Link(metaclass=ABCMeta): """Abstract base class for Link functions.""" @@ -250,7 +214,6 @@ class ExponentialDispersionModel(metaclass=ABCMeta): starting_mu _mu_deviance_derivative - _score References ---------- @@ -484,35 +447,6 @@ def _mu_deviance_derivative(self, coef, X, y, weights, link): devp = temp @ X # sampe as X.T @ temp return mu, devp - def _score(self, coef, phi, X, y, weights, link): - r"""Compute the score function. - - The score function is the derivative of the - log-likelihood w.r.t. `coef` (:math:`w`). - It is given by - - .. math: - - \mathbf{score}(\boldsymbol{w}) - = \frac{\partial loglike}{\partial\boldsymbol{w}} - = \mathbf{X}^T \mathbf{D} - \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, - - with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and - :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. - Note: The derivative of the deviance w.r.t. coef equals -2 * score. - """ - lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) - sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) - d = link.inverse_derivative(lin_pred) - temp = sigma_inv * d * (y - mu) - if coef.size == X.shape[1] + 1: - score = np.concatenate(([temp.sum()], temp @ X)) - else: - score = temp @ X # sampe as X.T @ temp - return score - class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. 
From 9ab5ac2506bd33d84c96f9f848b62cb4ee5b0853 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:13:14 +0200 Subject: [PATCH 078/269] Update examples/linear_model/plot_poisson_regression_non_normal_loss.py Co-Authored-By: Alexandre Gramfort --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index b06adcb787560..471c137840e82 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -1,6 +1,6 @@ """ ====================================== -Poisson regression and non normal loss +Poisson regression and non-normal loss ====================================== This example illustrate the use linear Poisson regression From e4d0be190239435e7dcd4d27cd616006909db98b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:13:26 +0200 Subject: [PATCH 079/269] Update examples/linear_model/plot_poisson_regression_non_normal_loss.py Co-Authored-By: Alexandre Gramfort --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 471c137840e82..0537704b2cf1f 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -3,7 +3,7 @@ Poisson regression and non-normal loss ====================================== -This example illustrate the use linear Poisson regression +This example illustrates the use of linear Poisson regression on the French Motor Third-Party Liability Claims dataset [1] and compare it with learning models with least squared error. From 6ff4d588f5adae2aed6810ddff64733c5bc595a3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:13:50 +0200 Subject: [PATCH 080/269] Update doc/modules/linear_model.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 681a13cdf9d42..2f750b33623aa 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -929,7 +929,7 @@ Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link :math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with -:math:`h(Xw)=\exp(Xw)`. +:math:`h(x^\top w)=\exp(x^\top w)`. Note that the feature matrix `X` should be standardized before fitting. This ensures that the penalty treats features equally. 
The estimator can be used as From 13102d5a2b4538734f162a70979df874a31c7798 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:13:59 +0200 Subject: [PATCH 081/269] Update doc/modules/linear_model.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 2f750b33623aa..49662064827f3 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -928,7 +928,7 @@ are the following: Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link -:math:`h(Xw)=Xw` that guarantees the non-negativeness, e.g. the log-link with +:math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the log-link with :math:`h(x^\top w)=\exp(x^\top w)`. Note that the feature matrix `X` should be standardized before fitting. This From af89e5281b880b19346db625838e1623275b438a Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:14:12 +0200 Subject: [PATCH 082/269] Update doc/modules/linear_model.rst Co-Authored-By: Alexandre Gramfort --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 49662064827f3..c8c103288c5ab 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -898,7 +898,7 @@ ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as -.. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p). +.. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an exponential dispersion model (EDM) [11]_. The objective function being minimized From ddc4b717bcece9c8d0347d46b9f4a89b681aec96 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 13 Jul 2019 15:27:22 +0200 Subject: [PATCH 083/269] Use scipy.optimize.minimize interface for LBFGS optimizer --- sklearn/linear_model/_glm.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py index 244c781ae8f48..939249e42e4f4 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm.py @@ -6,16 +6,16 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -from __future__ import division from abc import ABCMeta, abstractmethod import numbers + import numpy as np from scipy import special -from scipy.optimize import fmin_l_bfgs_b -import warnings +import scipy.optimize + from ..base import BaseEstimator, RegressorMixin -from ..exceptions import ConvergenceWarning from ..utils import check_array, check_X_y +from ..utils.optimize import _check_optimize_result from ..utils.validation import check_is_fitted @@ -933,21 +933,18 @@ def func(coef, X, y, weights, alpha, family, link): return obj, objp args = (X, y, weights, self.alpha, family, link) - # TODO: refactor this once - # https://github.com/scikit-learn/scikit-learn/pull/14250 - # is merged. 
- coef, loss, info = fmin_l_bfgs_b( - func, coef, fprime=None, args=args, - iprint=(self.verbose > 0) - 1, pgtol=self.tol, - maxiter=self.max_iter, factr=1e3) - if info["warnflag"] == 1: - warnings.warn("lbfgs failed to converge." - " Increase the number of iterations.", - ConvergenceWarning) - elif info["warnflag"] == 2: - warnings.warn("lbfgs failed for the reason: {0}" - .format(info["task"])) - self.n_iter_ = info['nit'] + + opt_res = scipy.optimize.minimize( + func, coef, method="L-BFGS-B", jac=True, + options={ + "maxiter": self.max_iter, + "iprint": (self.verbose > 0) - 1, + "gtol": self.tol, + "ftol": 1e3*np.finfo(float).eps, + }, + args=args) + self.n_iter_ = _check_optimize_result("lbfgs", opt_res) + coef = opt_res.x ####################################################################### # 5. postprocessing # From 426ae1d711c27d14a7dd7d22763fee5d9d20c1f5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Jul 2019 18:20:09 +0200 Subject: [PATCH 084/269] EXA wording and score in plot_tweedie_regression_insurance_claims.html --- ...lot_tweedie_regression_insurance_claims.py | 137 ++++++++++-------- 1 file changed, 78 insertions(+), 59 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 063d12e6e291b..00111b811f923 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -3,7 +3,7 @@ Tweedie regression on insurance claims ====================================== -This example illustrate the use Poisson, Gamma and Tweedie regression +This example illustrates the use of Poisson, Gamma and Tweedie regression on the French Motor Third-Party Liability Claims dataset, and is inspired by an R tutorial [1]. @@ -13,9 +13,10 @@ which are: 1. Model the number of claims with a Poisson distribution, the average - claim amount as a Gamma distribution and multiply the predictions, to get - the total claim amount. -2. Model total claim amount directly, typically with a Tweedie distribution. + claim amount as a Gamma distribution and multiply the predictions of both in + order to get the total claim amount. +2. Model total claim amount directly, typically with a Tweedie distribution of + Tweedie power :math:`p \\in (1, 2)`. In this example we will illustrate both approaches. We start by defining a few helper functions for loading the data and visualizing results. @@ -49,7 +50,7 @@ def load_mtpl2(n_samples=100000): - """Fetcher for French Motor Third-Party Liability Claims dataset + """Fetch the French Motor Third-Party Liability Claims dataset. Parameters ---------- @@ -81,24 +82,27 @@ def load_mtpl2(n_samples=100000): return df.iloc[:n_samples] -def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, +def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, title=None, kind_weight=None, ax=None): """Plot observed and predicted - aggregated per feature level. 
Parameters ---------- - df : DataFrame with at least one column named feature + df : DataFrame with at least three columns named feature, weight and + observed + feature: str + a column name of df for the feature to be plotted + weight : str + column name of df with the values of weights or exposure observed : str - a column name of the observed target + a column name of df with the observed target predicted : frame a dataframe, with the same index as df, with the predicted target - weight : str - column name with the values of weights/exposure """ # aggregate observed and predicted variables by feature level df_ = df.loc[:, [feature, weight]].copy() df_["observed"] = df[observed] * df[weight] - df_["predicted"] = y_predicted * df[weight] + df_["predicted"] = predicted * df[weight] df_ = ( df_.groupby([feature])[weight, "observed", "predicted"] .sum() @@ -126,9 +130,10 @@ def plot_obs_pred(df, feature, observed, y_predicted, weight, y_label=None, # 1. Loading datasets and pre-processing # -------------------------------------- # -# We construct the freMTPL2 dataset by joining the freMTPL2freq table, -# containing the number of claims (``ClaimNb``) with the freMTPL2sev table -# containing the claim amount (``ClaimAmount``) for the same user ids. +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# containing the number of claims (``ClaimNb``), with the freMTPL2sev table, +# containing the claim amount (``ClaimAmount``) for the same policy ids +# (``IDpol``). df = load_mtpl2(n_samples=100000) @@ -206,14 +211,14 @@ def score_estimator( y, _weights = df[target], df[weights] for score_label, metric in [ - ("D² explaned", None), + ("D² explained", None), ("mean deviance", partial(mean_deviance, estimator)), ("mean abs. error", mean_absolute_error), ]: if estimator.__class__.__name__ == "ClaimProdEstimator": - # ClaimProdEstimator is the product of the frequency and - # severity models, together with a denormalized by the exposure - # values. It does not fully follow the scikit-learn API and we + # ClaimProdEstimator is the product of frequency and severity + # models, denormalized by the exposure values. + # It does not fully follow the scikit-learn API and we # must handle it separately. y_pred = estimator.predict(X, exposure=df.Exposure.values) else: @@ -253,50 +258,50 @@ def score_estimator( # # We can visually compare observed and predicted values, aggregated by # the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance -# bonus/penalty (``BonusMalus``), +# bonus/malus (``BonusMalus``). 
fig, ax = plt.subplots(2, 2, figsize=(16, 8)) fig.subplots_adjust(hspace=0.3, wspace=0.2) plot_obs_pred( - df_train, - "DrivAge", - "Frequency", - glm_freq.predict(X_train), + df=df_train, + feature="DrivAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_train), y_label="Claim Frequency", title="train data", ax=ax[0, 0], ) plot_obs_pred( - df_test, - "DrivAge", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="DrivAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[0, 1], ) plot_obs_pred( - df_test, - "VehAge", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="VehAge", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[1, 0], ) plot_obs_pred( - df_test, - "BonusMalus", - "Frequency", - glm_freq.predict(X_test), + df=df_test, + feature="BonusMalus", weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), y_label="Claim Frequency", title="test data", ax=ax[1, 1], @@ -308,12 +313,13 @@ def score_estimator( # 3. Severity model - Gamma Distribution # --------------------------------------- # The mean claim amount or severity (`AvgClaimAmount`) can be empirically -# shown to follow a Gamma distribution. We fit a GLM model for the severity -# with the same features as the frequency model. +# shown to follow approximately a Gamma distribution. We fit a GLM model for +# the severity with the same features as the frequency model. # # Note: -# - We filter out ``ClaimAmount == 0``` as the Gamma distribution as support -# on :math:`(0, \infty)` not :math:`[0, \infty)`. +# +# - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support +# on :math:`(0, \infty)`, not :math:`[0, \infty)`. # - We use ``ClaimNb`` as sample weights. mask_train = df_train["ClaimAmount"] > 0 @@ -341,19 +347,20 @@ def score_estimator( ############################################################################## # -# Note that the resulting model is conditional on having at least one claim, -# and cannot be used to predict the average claim amount in general, +# Note that the resulting model is the average claim amount per claim. As such, +# it is conditional on having at least one claim, and cannot be used to predict +# the average claim amount per policy in general. print( - "Mean AvgClaim Amount: %.2f " + "Mean AvgClaim Amount per policy: %.2f " % df_train.AvgClaimAmount.mean() ) print( - "Mean AvgClaim Amount | NbClaim > 0: %.2f" + "Mean AvgClaim Amount | NbClaim > 0: %.2f" % df_train.AvgClaimAmount[df_train.AvgClaimAmount > 0].mean() ) print( - "Predicted Mean AvgClaim Amount: %.2f" + "Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" % glm_sev.predict(X_train).mean() ) @@ -361,28 +368,28 @@ def score_estimator( ############################################################################## # # We can visually compare observed and predicted values, aggregated for -# the drivers age (``Driv Age``), +# the drivers age (``DrivAge``). 
fig, ax = plt.subplots(1, 2, figsize=(16, 4)) # plot DivAge plot_obs_pred( - df_train.loc[mask_train], - "DrivAge", - "AvgClaimAmount", - glm_sev.predict(X_train[mask_train.values]), + df=df_train.loc[mask_train], + feature="DrivAge", weight="Exposure", + observed="AvgClaimAmount", + predicted=glm_sev.predict(X_train[mask_train.values]), y_label="Average Claim Severity", title="train data", ax=ax[0], ) plot_obs_pred( - df_test.loc[mask_test], - "DrivAge", - "AvgClaimAmount", - glm_sev.predict(X_test[mask_test.values]), + df=df_test.loc[mask_test], + feature="DrivAge", weight="Exposure", + observed="AvgClaimAmount", + predicted=glm_sev.predict(X_test[mask_test.values]), y_label="Average Claim Severity", title="test data", ax=ax[1], @@ -391,31 +398,41 @@ def score_estimator( ############################################################################## # -# 3. Total Claims Amount -- Compound Poisson distribution +# 4. Total Claims Amount -- Compound Poisson distribution # ------------------------------------------------------- # # As mentionned in the introduction, the total claim amount can be modeled -# either as the product of the frequency model by the severity model. +# either as the product of the frequency model by the severity model, class ClaimProdEstimator: - """Total claim amount estimator + """Total claim amount estimator. Computed as the product of the frequency model by the serverity model, - denormalized by exposure. + denormalized by exposure. Use Tweedie deviance with `p=1.5`. """ def __init__(self, est_freq, est_sev): self.est_freq = est_freq self.est_sev = est_sev + self._family_instance = TweedieDistribution(power=1.5) def predict(self, X, exposure): - """Predict the total claim amount + """Predict the total claim amount. The predict method is not compatible with the scikit-learn API. """ return exposure * self.est_freq.predict(X) * self.est_sev.predict(X) + def score(self, X, y, sample_weight=None): + """Compute D², the percentage of deviance explained.""" + mu = self.predict(X, exposure=sample_weight) + dev = self._family_instance.deviance(y, mu, weights=sample_weight) + y_mean = np.average(y, weights=sample_weight) + dev_null = self._family_instance.deviance(y, y_mean, + weights=sample_weight) + return 1. - dev / dev_null + est_prod = ClaimProdEstimator(glm_freq, glm_sev) @@ -476,7 +493,9 @@ def predict(self, X, exposure): # model than when using separate models for frequency and severity. # # We can additionally validate these models by comparing observed and predicted -# total claim amount over the test and train subsets. +# total claim amount over the test and train subsets. We see that in our case +# the frequency-severity model underestimates the total claim amount, whereas +# the Tweedie model overestimates. 
res = [] for subset_label, X, df in [ From a4043847d0dfde68bb09dc0d9de3ab10fc07d41b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 15 Jul 2019 14:15:26 +0200 Subject: [PATCH 085/269] Address review comments --- .../plot_tweedie_regression_insurance_claims.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 00111b811f923..1c8dd42df336d 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -46,7 +46,7 @@ from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.metrics import mean_absolute_error +from sklearn.metrics import mean_absolute_error, mean_squared_error def load_mtpl2(n_samples=100000): @@ -83,7 +83,7 @@ def load_mtpl2(n_samples=100000): def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, - title=None, kind_weight=None, ax=None): + title=None, ax=None): """Plot observed and predicted - aggregated per feature level. Parameters @@ -141,9 +141,11 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # requires a strictly positive target values. df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 -# correct for unreasonable observations (that might be data error) +# Correct for unreasonable observations (that might be data error) +# and a few exceptionally large claim amounts df["ClaimNb"] = df["ClaimNb"].clip(upper=4) df["Exposure"] = df["Exposure"].clip(upper=1) +df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000) column_trans = ColumnTransformer( [ @@ -188,7 +190,9 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) -glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +# Some of the features are colinear, we use a weak penalization to avoid +# numerical issues. +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=1e-2) glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) @@ -214,6 +218,7 @@ def score_estimator( ("D² explained", None), ("mean deviance", partial(mean_deviance, estimator)), ("mean abs. error", mean_absolute_error), + ("mean squared error", mean_squared_error), ]: if estimator.__class__.__name__ == "ClaimProdEstimator": # ClaimProdEstimator is the product of frequency and severity @@ -325,7 +330,7 @@ def score_estimator( mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 -glm_sev = GeneralizedLinearRegressor(family="gamma", alpha=1) +glm_sev = GeneralizedLinearRegressor(family="gamma") glm_sev.fit( X_train[mask_train.values], From 65796a3d3e74aff111c187b6be20e0e9287e0797 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 16:41:02 +0200 Subject: [PATCH 086/269] Review comments on the documentation --- doc/modules/linear_model.rst | 50 +++++++++++++++++------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index c8c103288c5ab..b6f7c2b82c1f5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -893,27 +893,23 @@ to warm-starting (see :term:`Glossary `). 
Generalized Linear Regression ============================= -:class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two -ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear +:class:`GeneralizedLinearRegressor` generalizes linear models in two ways +[10]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as .. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [11]_. The objective function being minimized -becomes +exponential dispersion model (EDM) [11]_. The objective function being +minimized becomes -.. math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1 - +\frac{\alpha(1-\rho)}{2} w^T P_2 w +.. math:: \frac{1}{2 \sum s_i}D(y, \hat{y}; s) +\frac{\alpha}{2} ||w||_2 -with sample weights :math:`s`. -:math:`P_1` (diagonal matrix) can be used to exclude some of the coefficients in -the L1 penalty, the matrix :math:`P_2` (must be positive semi-definite) allows -for a more versatile L2 penalty. +with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. -Use cases, where a loss different from the squared loss might be appropriate, -are the following: +In the following use cases, a loss different from the squared loss might be +appropriate, * If the target values :math:`y` are counts (non-negative integer valued) or frequencies (non-negative), you might use a Poisson deviance with log-link. @@ -928,10 +924,10 @@ are the following: Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link -:math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the log-link with -:math:`h(x^\top w)=\exp(x^\top w)`. +:math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the +log-link with :math:`h(x^\top w)=\exp(x^\top w)`. -Note that the feature matrix `X` should be standardized before fitting. This +Note that the feature matrix ``X`` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as follows: @@ -947,7 +943,8 @@ follows: .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_spline_regression.py` + * :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py` + * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_regression_non_normal_loss.py` Mathematical formulation ------------------------ @@ -967,12 +964,9 @@ Note that the first assumption implies function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the same as specifying a unit variance function (they are one-to-one). -Including penalties helps to avoid overfitting or, in case of L1 penalty, to -obtain sparse solutions. But there are also other motivations to include them, -e.g. accounting for the dependence structure of :math:`y`. - -The objective function, which is independent of :math:`\phi`, is minimized with -respect to the coefficients :math:`w`. +The objective function (the penalized negative log likelihood) is +independent of :math:`\phi` and is minimized with respect to the +coefficients :math:`w`. 
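For concreteness, consider the normal distribution, whose unit deviance is :math:`d(y, \mu)=(y-\mu)^2`: together with the identity link, the objective above becomes the familiar ridge regression objective (an illustrative special case, not spelled out in the patch)

.. math:: \frac{1}{2\sum_i s_i}\sum_i s_i (y_i - x_i^\top w)^2
          + \frac{\alpha}{2}\|w\|_2^2.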
The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as @@ -1005,12 +999,16 @@ Two remarks: .. topic:: References: - .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51. - See also `Exponential dispersion model. `_ + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. + `_ - .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. `_ + .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. + `_ Stochastic Gradient Descent - SGD ================================= From e44afe7b6c9d2c7169ea2d61ecc58c389e67c6de Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 18:08:30 +0200 Subject: [PATCH 087/269] Split the implementation into several files --- sklearn/linear_model/__init__.py | 3 +- sklearn/linear_model/_glm/__init__.py | 5 + sklearn/linear_model/_glm/distribution.py | 443 ++++++++++++++ sklearn/linear_model/{_glm.py => _glm/glm.py} | 576 +----------------- sklearn/linear_model/_glm/link.py | 135 ++++ sklearn/linear_model/_glm/tests/__init__.py | 1 + .../_glm/tests/test_distribution.py | 61 ++ .../linear_model/{ => _glm}/tests/test_glm.py | 78 +-- sklearn/linear_model/_glm/tests/test_link.py | 38 ++ 9 files changed, 702 insertions(+), 638 deletions(-) create mode 100644 sklearn/linear_model/_glm/__init__.py create mode 100644 sklearn/linear_model/_glm/distribution.py rename sklearn/linear_model/{_glm.py => _glm/glm.py} (62%) create mode 100644 sklearn/linear_model/_glm/link.py create mode 100644 sklearn/linear_model/_glm/tests/__init__.py create mode 100644 sklearn/linear_model/_glm/tests/test_distribution.py rename sklearn/linear_model/{ => _glm}/tests/test_glm.py (81%) create mode 100644 sklearn/linear_model/_glm/tests/test_link.py diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 121418f901a1a..1c0df55d27c90 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,8 +18,7 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm import (TweedieDistribution, - GeneralizedLinearRegressor, PoissonRegressor) +from ._glm.glm import (GeneralizedLinearRegressor, PoissonRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py new file mode 100644 index 0000000000000..9a88e5604de8a --- /dev/null +++ b/sklearn/linear_model/_glm/__init__.py @@ -0,0 +1,5 @@ +# License: BSD 3 clause + +from . 
import distribution + +__all__ = ['distribution'] diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py new file mode 100644 index 0000000000000..2dc720124b48b --- /dev/null +++ b/sklearn/linear_model/_glm/distribution.py @@ -0,0 +1,443 @@ +""" +Distribution functions used in GLM +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod +import numbers + +import numpy as np +from scipy.special import xlogy + + +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +class ExponentialDispersionModel(metaclass=ABCMeta): + r"""Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. + + Attributes + ---------- + lower_bound + upper_bound + include_lower_bound + include_upper_bound + + Methods + ------- + in_y_range + unit_variance + unit_variance_derivative + variance + variance_derivative + unit_deviance + unit_deviance_derivative + deviance + deviance_derivative + starting_mu + + _mu_deviance_derivative + + References + ---------- + + https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + @property + def lower_bound(self): + """Get the lower bound of values for Y~EDM.""" + return self._lower_bound + + @property + def upper_bound(self): + """Get the upper bound of values for Y~EDM.""" + return self._upper_bound + + @property + def include_lower_bound(self): + """Get True if lower bound for y is included: y >= lower_bound.""" + return self._include_lower_bound + + @property + def include_upper_bound(self): + """Get True if upper bound for y is included: y <= upper_bound.""" + return self._include_upper_bound + + def in_y_range(self, x): + """Returns ``True`` if x is in the valid range of Y~EDM. + + Parameters + ---------- + x : array, shape (n_samples,) + Target values. + """ + if self.include_lower_bound: + if self.include_upper_bound: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less(x, self.upper_bound)) + else: + if self.include_upper_bound: + return np.logical_and(np.greater(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater(x, self.lower_bound), + np.less(x, self.upper_bound)) + + @abstractmethod + def unit_variance(self, mu): + r"""Compute the unit variance function. + + The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as + + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + pass + + @abstractmethod + def unit_variance_derivative(self, mu): + r"""Compute the derivative of the unit variance w.r.t. mu. + + Return :math:`v'(\mu)`. 
+ + Parameters + ---------- + mu : array, shape (n_samples,) + Target values. + """ + pass + + def variance(self, mu, phi=1, weights=1): + r"""Compute the variance function. + + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, + with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance(mu) + + def variance_derivative(self, mu, phi=1, weights=1): + r"""Compute the derivative of the variance w.r.t. mu. + + Returns + :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] + =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` + and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance_derivative(mu) + + @abstractmethod + def unit_deviance(self, y, mu): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + pass + + def unit_deviance_derivative(self, y, mu): + r"""Compute the derivative of the unit deviance w.r.t. mu. + + The derivative of the unit deviance is given by + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + return -2 * (y - mu) / self.unit_variance(mu) + + def deviance(self, y, mu, weights=1): + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return np.sum(weights * self.unit_deviance(y, mu)) + + def deviance_derivative(self, y, mu, weights=1): + """Compute the derivative of the deviance w.r.t. mu. + + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return weights * self.unit_deviance_derivative(y, mu) + + def starting_mu(self, y, weights=1, ind_weight=0.5): + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. 
+ + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. + """ + return (ind_weight * y + + (1. - ind_weight) * np.average(y, weights=weights)) + + def _mu_deviance_derivative(self, coef, X, y, weights, link): + """Compute mu and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + d1 = link.inverse_derivative(lin_pred) + temp = d1 * self.deviance_derivative(y, mu, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # same as X.T @ temp + return mu, devp + + +class TweedieDistribution(ExponentialDispersionModel): + r"""A class for the Tweedie distribution. + + A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely + defined by its mean-variance relationship + :math:`\mathrm{Var}[Y] \propto \mu^{power}`. + + Special cases are: + + ===== ================ + Power Distribution + ===== ================ + 0 Normal + 1 Poisson + (1,2) Compound Poisson + 2 Gamma + 3 Inverse Gaussian + ===== ================ + + Parameters + ---------- + power : float (default=0) + The variance power of the `unit_variance` + :math:`v(\mu) = \mu^{power}`. + For ``0 < power < 1``, no distribution exists. + """ + def __init__(self, power=0): + self.power = power + + @property + def power(self): + return self._power + + @power.setter + def power(self, power): + if not isinstance(power, numbers.Real): + raise TypeError('power must be a real number, input was {0}' + .format(power)) + + self._upper_bound = np.Inf + self._include_upper_bound = False + if power < 0: + # Extreme Stable + self._lower_bound = -np.Inf + self._include_lower_bound = False + elif power == 0: + # NormalDistribution + self._lower_bound = -np.Inf + self._include_lower_bound = False + elif (power > 0) and (power < 1): + raise ValueError('For 0 < power < 1, no distribution exists.') + elif power == 1: + # PoissonDistribution + self._lower_bound = 0 + self._include_lower_bound = True + elif (power > 1) and (power < 2): + # Compound Poisson + self._lower_bound = 0 + self._include_lower_bound = True + elif power == 2: + # GammaDistribution + self._lower_bound = 0 + self._include_lower_bound = False + elif (power > 2) and (power < 3): + # Positive Stable + self._lower_bound = 0 + self._include_lower_bound = False + elif power == 3: + # InverseGaussianDistribution + self._lower_bound = 0 + self._include_lower_bound = False + elif power > 3: + # Positive Stable + self._lower_bound = 0 + self._include_lower_bound = False + else: # pragma: no cover + # this branch should be unreachable. + raise ValueError + + self._power = power + + def unit_variance(self, mu): + """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + return np.power(mu, self.power) + + def unit_variance_derivative(self, mu): + """Compute the derivative of the unit variance of a Tweedie + distribution v(mu)=power*mu**(power-1). + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean.
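A hedged usage sketch (assumes this branch is installed; it mirrors the imports used by the new test files further below) showing how the power setter above fixes the admissible target range that in_y_range checks.

from sklearn.linear_model._glm.distribution import TweedieDistribution

poisson = TweedieDistribution(power=1)    # y >= 0 allowed (lower bound included)
gamma = TweedieDistribution(power=2)      # y > 0 required (lower bound excluded)
print(poisson.in_y_range([-1., 0., 1.]))  # [False  True  True]
print(gamma.in_y_range([-1., 0., 1.]))    # [False False  True]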
+ """ + return self.power * np.power(mu, self.power - 1) + + def unit_deviance(self, y, mu): + p = self.power + if p == 0: + # NormalDistribution + return (y - mu)**2 + if p == 1: + # PoissonDistribution + # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 + return 2 * (xlogy(y, y/mu) - y + mu) + elif p == 2: + # GammaDistribution + return 2 * (np.log(mu/y) + y/mu - 1) + else: + # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) + # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) + return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - + y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + + +class NormalDistribution(TweedieDistribution): + """Class for the Normal (aka Gaussian) distribution""" + def __init__(self): + super(NormalDistribution, self).__init__(power=0) + + +class PoissonDistribution(TweedieDistribution): + """Class for the scaled Poisson distribution""" + def __init__(self): + super(PoissonDistribution, self).__init__(power=1) + + +class GammaDistribution(TweedieDistribution): + """Class for the Gamma distribution""" + def __init__(self): + super(GammaDistribution, self).__init__(power=2) + + +class InverseGaussianDistribution(TweedieDistribution): + """Class for the scaled InverseGaussianDistribution distribution""" + def __init__(self): + super(InverseGaussianDistribution, self).__init__(power=3) + + +EDM_DISTRIBUTIONS = { + 'normal': NormalDistribution, + 'poisson': PoissonDistribution, + 'gamma': GammaDistribution, + 'inverse.gaussian': InverseGaussianDistribution, +} diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm/glm.py similarity index 62% rename from sklearn/linear_model/_glm.py rename to sklearn/linear_model/_glm/glm.py index 939249e42e4f4..bff8ea43fd550 100644 --- a/sklearn/linear_model/_glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -6,17 +6,26 @@ # some parts and tricks stolen from other sklearn files. # License: BSD 3 clause -from abc import ABCMeta, abstractmethod import numbers import numpy as np -from scipy import special import scipy.optimize -from ..base import BaseEstimator, RegressorMixin -from ..utils import check_array, check_X_y -from ..utils.optimize import _check_optimize_result -from ..utils.validation import check_is_fitted +from ...base import BaseEstimator, RegressorMixin +from ...utils import check_array, check_X_y +from ...utils.optimize import _check_optimize_result +from ...utils.validation import check_is_fitted +from .distribution import ( + ExponentialDispersionModel, + TweedieDistribution, + EDM_DISTRIBUTIONS +) +from .link import ( + Link, + IdentityLink, + LogLink, + LogitLink, +) def _check_weights(sample_weight, n_samples): @@ -46,561 +55,6 @@ def _check_weights(sample_weight, n_samples): return weights -def _safe_lin_pred(X, coef): - """Compute the linear predictor taking care if intercept is present.""" - if coef.size == X.shape[1] + 1: - return X @ coef[1:] + coef[0] - else: - return X @ coef - - -class Link(metaclass=ABCMeta): - """Abstract base class for Link functions.""" - - @abstractmethod - def link(self, mu): - """Compute the link function g(mu). - - The link function links the mean mu=E[Y] to the so called linear - predictor (X*w), i.e. g(mu) = linear predictor. - - Parameters - ---------- - mu : array, shape (n_samples,) - Usually the (predicted) mean. - """ - pass - - @abstractmethod - def derivative(self, mu): - """Compute the derivative of the link g'(mu). - - Parameters - ---------- - mu : array, shape (n_samples,) - Usually the (predicted) mean. 
- """ - pass - - @abstractmethod - def inverse(self, lin_pred): - """Compute the inverse link function h(lin_pred). - - Gives the inverse relationship between linear predictor and the mean - mu=E[Y], i.e. h(linear predictor) = mu. - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - pass - - @abstractmethod - def inverse_derivative(self, lin_pred): - """Compute the derivative of the inverse link function h'(lin_pred). - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - pass - - @abstractmethod - def inverse_derivative2(self, lin_pred): - """Compute 2nd derivative of the inverse link function h''(lin_pred). - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - pass - - -class IdentityLink(Link): - """The identity link function g(x)=x.""" - - def link(self, mu): - return mu - - def derivative(self, mu): - return np.ones_like(mu) - - def inverse(self, lin_pred): - return lin_pred - - def inverse_derivative(self, lin_pred): - return np.ones_like(lin_pred) - - def inverse_derivative2(self, lin_pred): - return np.zeros_like(lin_pred) - - -class LogLink(Link): - """The log link function g(x)=log(x).""" - - def link(self, mu): - return np.log(mu) - - def derivative(self, mu): - return 1./mu - - def inverse(self, lin_pred): - return np.exp(lin_pred) - - def inverse_derivative(self, lin_pred): - return np.exp(lin_pred) - - def inverse_derivative2(self, lin_pred): - return np.exp(lin_pred) - - -class LogitLink(Link): - """The logit link function g(x)=logit(x).""" - - def link(self, mu): - return special.logit(mu) - - def derivative(self, mu): - return 1. / (mu * (1 - mu)) - - def inverse(self, lin_pred): - return special.expit(lin_pred) - - def inverse_derivative(self, lin_pred): - ep = special.expit(lin_pred) - return ep * (1. - ep) - - def inverse_derivative2(self, lin_pred): - ep = special.expit(lin_pred) - return ep * (1. - ep) * (1. - 2 * ep) - - -class ExponentialDispersionModel(metaclass=ABCMeta): - r"""Base class for reproductive Exponential Dispersion Models (EDM). - - The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by - - .. math:: p(y| \theta, \phi) = c(y, \phi) - \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) - = \tilde{c}(y, \phi) - \exp\left(-\frac{d(y, \mu)}{2\phi}\right) - - with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, - variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, - unit variance :math:`v(\mu)` and - unit deviance :math:`d(y,\mu)`. - - Attributes - ---------- - lower_bound - upper_bound - include_lower_bound - include_upper_bound - - Methods - ------- - in_y_range - unit_variance - unit_variance_derivative - variance - variance_derivative - unit_deviance - unit_deviance_derivative - deviance - deviance_derivative - starting_mu - - _mu_deviance_derivative - - References - ---------- - - https://en.wikipedia.org/wiki/Exponential_dispersion_model. 
- """ - @property - def lower_bound(self): - """Get the lower bound of values for Y~EDM.""" - return self._lower_bound - - @property - def upper_bound(self): - """Get the upper bound of values for Y~EDM.""" - return self._upper_bound - - @property - def include_lower_bound(self): - """Get True if lower bound for y is included: y >= lower_bound.""" - return self._include_lower_bound - - @property - def include_upper_bound(self): - """Get True if upper bound for y is included: y <= upper_bound.""" - return self._include_upper_bound - - def in_y_range(self, x): - """Returns ``True`` if x is in the valid range of Y~EDM. - - Parameters - ---------- - x : array, shape (n_samples,) - Target values. - """ - if self.include_lower_bound: - if self.include_upper_bound: - return np.logical_and(np.greater_equal(x, self.lower_bound), - np.less_equal(x, self.upper_bound)) - else: - return np.logical_and(np.greater_equal(x, self.lower_bound), - np.less(x, self.upper_bound)) - else: - if self.include_upper_bound: - return np.logical_and(np.greater(x, self.lower_bound), - np.less_equal(x, self.upper_bound)) - else: - return np.logical_and(np.greater(x, self.lower_bound), - np.less(x, self.upper_bound)) - - @abstractmethod - def unit_variance(self, mu): - r"""Compute the unit variance function. - - The unit variance :math:`v(\mu)` determines the variance as - a function of the mean :math:`\mu` by - :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. - It can also be derived from the unit deviance :math:`d(y,\mu)` as - - .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ - \partial\mu^2}}\big|_{y=\mu} - - See also :func:`variance`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - """ - pass - - @abstractmethod - def unit_variance_derivative(self, mu): - r"""Compute the derivative of the unit variance w.r.t. mu. - - Return :math:`v'(\mu)`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Target values. - """ - pass - - def variance(self, mu, phi=1, weights=1): - r"""Compute the variance function. - - The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is - :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, - with unit variance :math:`v(\mu)` and weights :math:`s_i`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - - phi : float (default=1) - Dispersion parameter. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return phi/weights * self.unit_variance(mu) - - def variance_derivative(self, mu, phi=1, weights=1): - r"""Compute the derivative of the variance w.r.t. mu. - - Returns - :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] - =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` - and weights :math:`s_i`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - - phi : float (default=1) - Dispersion parameter. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return phi/weights * self.unit_variance_derivative(mu) - - @abstractmethod - def unit_deviance(self, y, mu): - r"""Compute the unit deviance. - - The unit_deviance :math:`d(y,\mu)` can be defined by the - log-likelihood as - :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - mu : array, shape (n_samples,) - Predicted mean. 
- """ - pass - - def unit_deviance_derivative(self, y, mu): - r"""Compute the derivative of the unit deviance w.r.t. mu. - - The derivative of the unit deviance is given by - :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` - with unit variance :math:`v(\mu)`. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - mu : array, shape (n_samples,) - Predicted mean. - """ - return -2 * (y - mu) / self.unit_variance(mu) - - def deviance(self, y, mu, weights=1): - r"""Compute the deviance. - - The deviance is a weighted sum of the per sample unit deviances, - :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` - with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. - In terms of the log-likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,\mu,\frac{phi}{s}) - - loglike(y,y,\frac{phi}{s})\right)`. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - mu : array, shape (n_samples,) - Predicted mean. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return np.sum(weights * self.unit_deviance(y, mu)) - - def deviance_derivative(self, y, mu, weights=1): - """Compute the derivative of the deviance w.r.t. mu. - - It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - mu : array, shape (n_samples,) - Predicted mean. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return weights * self.unit_deviance_derivative(y, mu) - - def starting_mu(self, y, weights=1, ind_weight=0.5): - """Set starting values for the mean mu. - - These may be good starting points for the (unpenalized) IRLS solver. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - - ind_weight : float (default=0.5) - Must be between 0 and 1. Specifies how much weight is given to the - individual observations instead of the mean of y. - """ - return (ind_weight * y + - (1. - ind_weight) * np.average(y, weights=weights)) - - def _mu_deviance_derivative(self, coef, X, y, weights, link): - """Compute mu and the derivative of the deviance w.r.t coef.""" - lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) - d1 = link.inverse_derivative(lin_pred) - temp = d1 * self.deviance_derivative(y, mu, weights) - if coef.size == X.shape[1] + 1: - devp = np.concatenate(([temp.sum()], temp @ X)) - else: - devp = temp @ X # sampe as X.T @ temp - return mu, devp - - -class TweedieDistribution(ExponentialDispersionModel): - r"""A class for the Tweedie distribution. - - A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely - defined by it's mean-variance relationship - :math:`\mathrm{Var}[Y] \propto \mu^power`. - - Special cases are: - - ===== ================ - Power Distribution - ===== ================ - 0 Normal - 1 Poisson - (0,1) Compound Poisson - 2 Gamma - 3 Inverse Gaussian - - Parameters - ---------- - power : float (default=0) - The variance power of the `unit_variance` - :math:`v(\mu) = \mu^{power}`. 
- For ``0 0) and (power < 1): - raise ValueError('For 0 1) and (power < 2): - # Compound Poisson - self._lower_bound = 0 - self._include_lower_bound = True - elif power == 2: - # GammaDistribution - self._lower_bound = 0 - self._include_lower_bound = False - elif (power > 2) and (power < 3): - # Positive Stable - self._lower_bound = 0 - self._include_lower_bound = False - elif power == 3: - # InverseGaussianDistribution - self._lower_bound = 0 - self._include_lower_bound = False - elif power > 3: - # Positive Stable - self._lower_bound = 0 - self._include_lower_bound = False - else: # pragma: no cover - # this branch should be unreachable. - raise ValueError - - self._power = power - - def unit_variance(self, mu): - """Compute the unit variance of a Tweedie distribution v(mu)=mu**power. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - """ - return np.power(mu, self.power) - - def unit_variance_derivative(self, mu): - """Compute the derivative of the unit variance of a Tweedie - distribution v(mu)=power*mu**(power-1). - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - """ - return self.power * np.power(mu, self.power - 1) - - def unit_deviance(self, y, mu): - p = self.power - if p == 0: - # NormalDistribution - return (y - mu)**2 - if p == 1: - # PoissonDistribution - # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (special.xlogy(y, y/mu) - y + mu) - elif p == 2: - # GammaDistribution - return 2 * (np.log(mu/y) + y/mu - 1) - else: - # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) - # - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) - return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) - - -class NormalDistribution(TweedieDistribution): - """Class for the Normal (aka Gaussian) distribution""" - def __init__(self): - super(NormalDistribution, self).__init__(power=0) - - -class PoissonDistribution(TweedieDistribution): - """Class for the scaled Poisson distribution""" - def __init__(self): - super(PoissonDistribution, self).__init__(power=1) - - -class GammaDistribution(TweedieDistribution): - """Class for the Gamma distribution""" - def __init__(self): - super(GammaDistribution, self).__init__(power=2) - - -class InverseGaussianDistribution(TweedieDistribution): - """Class for the scaled InverseGaussianDistribution distribution""" - def __init__(self): - super(InverseGaussianDistribution, self).__init__(power=3) - - -EDM_DISTRIBUTIONS = { - 'normal': NormalDistribution, - 'poisson': PoissonDistribution, - 'gamma': GammaDistribution, - 'inverse.gaussian': InverseGaussianDistribution, -} - - class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py new file mode 100644 index 0000000000000..f79f6163ada48 --- /dev/null +++ b/sklearn/linear_model/_glm/link.py @@ -0,0 +1,135 @@ +""" +Link functions used in GLM +""" + +# Author: Christian Lorentzen +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod + +import numpy as np +from scipy.special import expit, logit + + +class Link(metaclass=ABCMeta): + """Abstract base class for Link functions.""" + + @abstractmethod + def link(self, mu): + """Compute the link function g(mu). + + The link function links the mean mu=E[Y] to the so called linear + predictor (X*w), i.e. g(mu) = linear predictor. 
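A hedged numeric check (numpy only) of the link/inverse-link relationship documented just above, shown for the log link because it is the default for the Poisson, Gamma and Inverse Gaussian families; the same identities are asserted by the new test_link_properties further below.

import numpy as np

eta = np.linspace(-2., 2., 5)              # linear predictor values
mu = np.exp(eta)                           # h(eta) for the log link
g_prime = 1. / mu                          # g'(mu), as in LogLink.derivative
h_prime = np.exp(eta)                      # h'(eta), as in LogLink.inverse_derivative
assert np.allclose(np.log(mu), eta)        # g(h(eta)) == eta
assert np.allclose(g_prime * h_prime, 1.)  # chain rule: h'(eta) = 1 / g'(h(eta))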
+ + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + pass + + @abstractmethod + def derivative(self, mu): + """Compute the derivative of the link g'(mu). + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + pass + + @abstractmethod + def inverse(self, lin_pred): + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linear predictor and the mean + mu=E[Y], i.e. h(linear predictor) = mu. + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + pass + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Compute the derivative of the inverse link function h'(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + pass + + @abstractmethod + def inverse_derivative2(self, lin_pred): + """Compute 2nd derivative of the inverse link function h''(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + pass + + +class IdentityLink(Link): + """The identity link function g(x)=x.""" + + def link(self, mu): + return mu + + def derivative(self, mu): + return np.ones_like(mu) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.zeros_like(lin_pred) + + +class LogLink(Link): + """The log link function g(x)=log(x).""" + + def link(self, mu): + return np.log(mu) + + def derivative(self, mu): + return 1./mu + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.exp(lin_pred) + + +class LogitLink(Link): + """The logit link function g(x)=logit(x).""" + + def link(self, mu): + return logit(mu) + + def derivative(self, mu): + return 1. / (mu * (1 - mu)) + + def inverse(self, lin_pred): + return expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = expit(lin_pred) + return ep * (1. - ep) + + def inverse_derivative2(self, lin_pred): + ep = expit(lin_pred) + return ep * (1. - ep) * (1. 
- 2 * ep) diff --git a/sklearn/linear_model/_glm/tests/__init__.py b/sklearn/linear_model/_glm/tests/__init__.py new file mode 100644 index 0000000000000..588cf7e93eef0 --- /dev/null +++ b/sklearn/linear_model/_glm/tests/__init__.py @@ -0,0 +1 @@ +# License: BSD 3 clause diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py new file mode 100644 index 0000000000000..1a2efdba0e563 --- /dev/null +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -0,0 +1,61 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause + +from numpy.testing import assert_allclose +from numpy.testing import assert_array_equal +import pytest + +from sklearn.linear_model._glm.distribution import ( + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, +) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" + result = family.in_y_range([-1, 0, 1]) + assert_array_equal(result, expected) + + +def test_tweedie_distribution_power(): + with pytest.raises(ValueError, match="no distribution exists"): + TweedieDistribution(power=0.5) + + with pytest.raises(TypeError, match="must be a real number"): + TweedieDistribution(power=1j) + + with pytest.raises(TypeError, match="must be a real number"): + dist = TweedieDistribution() + dist.power = 1j + + dist = TweedieDistribution() + assert dist._include_lower_bound is False + dist.power = 1 + assert dist._include_lower_bound is True + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_allclose(family.deviance(x, x), 0, atol=1e-9) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py similarity index 81% rename from sklearn/linear_model/tests/test_glm.py rename to sklearn/linear_model/_glm/tests/test_glm.py index 1712f7b5e1d3d..ffac6dd11c243 100644 --- a/sklearn/linear_model/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -8,11 +8,12 @@ from sklearn.datasets import make_regression from sklearn.linear_model import GeneralizedLinearRegressor -from sklearn.linear_model._glm import ( - Link, +from sklearn.linear_model._glm.link import ( IdentityLink, LogLink, LogitLink, +) +from sklearn.linear_model._glm.distribution import ( TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, @@ -21,8 +22,6 @@ from sklearn.metrics import mean_absolute_error from sklearn.exceptions import ConvergenceWarning -from sklearn.utils.testing import assert_array_equal - GLM_SOLVERS = ['lbfgs'] @@ 
-35,77 +34,6 @@ def regression_data(): return X, y -@pytest.mark.parametrize('link', Link.__subclasses__()) -def test_link_properties(link): - """Test link inverse and derivative.""" - rng = np.random.RandomState(42) - x = rng.rand(100)*100 - link = link() # instantiate object - if isinstance(link, LogitLink): - # careful for large x, note expit(36) = 1 - # limit max eta to 15 - x = x / 100 * 15 - assert_allclose(link.link(link.inverse(x)), x) - # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) - assert_allclose(link.derivative(link.inverse(x)), - 1./link.inverse_derivative(x)) - - assert ( - link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) - - # for LogitLink, in the following x should be between 0 and 1. - # assert_almost_equal(link.inverse_derivative(link.link(x)), - # 1./link.derivative(x), decimal=decimal) - - -@pytest.mark.parametrize( - 'family, expected', - [(NormalDistribution(), [True, True, True]), - (PoissonDistribution(), [False, True, True]), - (TweedieDistribution(power=1.5), [False, True, True]), - (GammaDistribution(), [False, False, True]), - (InverseGaussianDistribution(), [False, False, True]), - (TweedieDistribution(power=4.5), [False, False, True])]) -def test_family_bounds(family, expected): - """Test the valid range of distributions at -1, 0, 1.""" - result = family.in_y_range([-1, 0, 1]) - assert_array_equal(result, expected) - - -def test_tweedie_distribution_power(): - with pytest.raises(ValueError, match="no distribution exists"): - TweedieDistribution(power=0.5) - - with pytest.raises(TypeError, match="must be a real number"): - TweedieDistribution(power=1j) - - with pytest.raises(TypeError, match="must be a real number"): - dist = TweedieDistribution() - dist.power = 1j - - dist = TweedieDistribution() - assert dist._include_lower_bound is False - dist.power = 1 - assert dist._include_lower_bound is True - - -@pytest.mark.parametrize( - 'family, chk_values', - [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), - (PoissonDistribution(), [0.1, 1.5]), - (GammaDistribution(), [0.1, 1.5]), - (InverseGaussianDistribution(), [0.1, 1.5]), - (TweedieDistribution(power=-2.5), [0.1, 1.5]), - (TweedieDistribution(power=-1), [0.1, 1.5]), - (TweedieDistribution(power=1.5), [0.1, 1.5]), - (TweedieDistribution(power=2.5), [0.1, 1.5]), - (TweedieDistribution(power=-4), [0.1, 1.5])]) -def test_deviance_zero(family, chk_values): - """Test deviance(y,y) = 0 for different families.""" - for x in chk_values: - assert_allclose(family.deviance(x, x), 0, atol=1e-9) - - def test_sample_weights_validation(): """Test the raised errors in the validation of sample_weight.""" # scalar value but not positive diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py new file mode 100644 index 0000000000000..3a2a21c4c04e0 --- /dev/null +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -0,0 +1,38 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause +import numpy as np +from numpy.testing import assert_allclose +import pytest + +from sklearn.linear_model._glm.link import ( + IdentityLink, + LogLink, + LogitLink, +) + + +LINK_FUNCTIONS = [IdentityLink, LogLink, LogitLink] + + +@pytest.mark.parametrize('link', LINK_FUNCTIONS) +def test_link_properties(link): + """Test link inverse and derivative.""" + rng = np.random.RandomState(42) + x = rng.rand(100)*100 + link = link() # instantiate object + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + 
assert_allclose(link.link(link.inverse(x)), x) + # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + assert_allclose(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x)) + + assert ( + link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) + + # for LogitLink, in the following x should be between 0 and 1. + # assert_almost_equal(link.inverse_derivative(link.link(x)), + # 1./link.derivative(x), decimal=decimal) From 5927379debd8e591d72c5915ad434fc0592f5665 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 18:27:36 +0200 Subject: [PATCH 088/269] Fix CI --- sklearn/linear_model/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 1c0df55d27c90..8fc662a48a6ae 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -81,5 +81,4 @@ 'ridge_regression', 'RANSACRegressor', 'GeneralizedLinearRegressor', - 'TweedieDistribution', 'PoissonRegressor'] From a6df2a788b5a4907b6c3a9b51a758f5b05e95257 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 18:57:53 +0200 Subject: [PATCH 089/269] Add test_deviance_derivative --- sklearn/linear_model/__init__.py | 2 +- sklearn/linear_model/_glm/__init__.py | 4 +- .../_glm/tests/test_distribution.py | 44 +++++++++++++++++-- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 8fc662a48a6ae..e8f63e95cdfa8 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,7 +18,7 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm.glm import (GeneralizedLinearRegressor, PoissonRegressor) +from ._glm import (GeneralizedLinearRegressor, PoissonRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py index 9a88e5604de8a..ac0f2c206ffe0 100644 --- a/sklearn/linear_model/_glm/__init__.py +++ b/sklearn/linear_model/_glm/__init__.py @@ -1,5 +1,5 @@ # License: BSD 3 clause -from . 
import distribution +from .glm import (GeneralizedLinearRegressor, PoissonRegressor) -__all__ = ['distribution'] +__all__ = ['GeneralizedLinearRegressor', 'PoissonRegressor'] diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index 1a2efdba0e563..f457fc882a809 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -1,9 +1,12 @@ # Authors: Christian Lorentzen # # License: BSD 3 clause - -from numpy.testing import assert_allclose -from numpy.testing import assert_array_equal +import numpy as np +from numpy.testing import ( + assert_allclose, + assert_array_equal, +) +from scipy.optimize import check_grad import pytest from sklearn.linear_model._glm.distribution import ( @@ -59,3 +62,38 @@ def test_deviance_zero(family, chk_values): """Test deviance(y,y) = 0 for different families.""" for x in chk_values: assert_allclose(family.deviance(x, x), 0, atol=1e-9) + + +@pytest.mark.parametrize( + 'family', + [NormalDistribution(), + PoissonDistribution(), + GammaDistribution(), + InverseGaussianDistribution(), + TweedieDistribution(power=-2.5), + TweedieDistribution(power=-1), + TweedieDistribution(power=1.5), + TweedieDistribution(power=2.5), + TweedieDistribution(power=-4)], + ids=lambda x: x.__class__.__name__ +) +def test_deviance_derivative(family): + """Test deviance derivative for different families.""" + rng = np.random.RandomState(0) + y_true = rng.rand(10) + # make data positive + y_true += np.abs(y_true.min()) + 1e-2 + + y_pred = y_true + np.fmax(rng.rand(10), 0.) + + dev = family.deviance(y_true, y_pred) + assert isinstance(dev, float) + dev_derivative = family.deviance_derivative(y_true, y_pred) + assert dev_derivative.shape == y_pred.shape + + err = check_grad( + lambda mu: family.deviance(y_true, mu), + lambda mu: family.deviance_derivative(y_true, mu), + y_pred, + ) / np.linalg.norm(dev_derivative) + assert err < 1e-6 From 5af89a70fd4a168f3ab1d0d1966d6357e6bc4e68 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Tue, 16 Jul 2019 19:58:54 +0200 Subject: [PATCH 090/269] Fix sklearn/linear_model/setup.py --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 2 +- sklearn/linear_model/_glm/tests/test_distribution.py | 2 +- sklearn/linear_model/setup.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 1c8dd42df336d..fba1cc42e20a7 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -40,7 +40,7 @@ from sklearn.compose import ColumnTransformer from sklearn.linear_model import GeneralizedLinearRegressor -from sklearn.linear_model._glm import TweedieDistribution +from sklearn.linear_model._glm.distribution import TweedieDistribution from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index f457fc882a809..f984077bed5cb 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -96,4 +96,4 @@ def test_deviance_derivative(family): lambda mu: family.deviance_derivative(y_true, mu), y_pred, ) 
/ np.linalg.norm(dev_derivative) - assert err < 1e-6 + assert abs(err) < 1e-6 diff --git a/sklearn/linear_model/setup.py b/sklearn/linear_model/setup.py index 8226412fdecbd..5cf7040d4c9d4 100644 --- a/sklearn/linear_model/setup.py +++ b/sklearn/linear_model/setup.py @@ -42,6 +42,7 @@ def configuration(parent_package='', top_path=None): # add other directories config.add_subpackage('tests') + config.add_subpackage('_glm') return config From cd347d4610d00dca9e7a01de2067595a4e4b8a59 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 17 Jul 2019 14:01:44 +0200 Subject: [PATCH 091/269] Remove variance and variance_derivative methods from distributions --- sklearn/linear_model/_glm/distribution.py | 72 ++--------------------- 1 file changed, 4 insertions(+), 68 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 2dc720124b48b..18e7cf0c0a227 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -44,16 +44,13 @@ class ExponentialDispersionModel(metaclass=ABCMeta): Methods ------- + deviance + deviance_derivative in_y_range - unit_variance - unit_variance_derivative - variance - variance_derivative unit_deviance unit_deviance_derivative - deviance - deviance_derivative - starting_mu + unit_variance + unit_variance_derivative _mu_deviance_derivative @@ -139,47 +136,6 @@ def unit_variance_derivative(self, mu): """ pass - def variance(self, mu, phi=1, weights=1): - r"""Compute the variance function. - - The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is - :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, - with unit variance :math:`v(\mu)` and weights :math:`s_i`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - - phi : float (default=1) - Dispersion parameter. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return phi/weights * self.unit_variance(mu) - - def variance_derivative(self, mu, phi=1, weights=1): - r"""Compute the derivative of the variance w.r.t. mu. - - Returns - :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] - =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` - and weights :math:`s_i`. - - Parameters - ---------- - mu : array, shape (n_samples,) - Predicted mean. - - phi : float (default=1) - Dispersion parameter. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - """ - return phi/weights * self.unit_variance_derivative(mu) - @abstractmethod def unit_deviance(self, y, mu): r"""Compute the unit deviance. @@ -257,26 +213,6 @@ def deviance_derivative(self, y, mu, weights=1): """ return weights * self.unit_deviance_derivative(y, mu) - def starting_mu(self, y, weights=1, ind_weight=0.5): - """Set starting values for the mean mu. - - These may be good starting points for the (unpenalized) IRLS solver. - - Parameters - ---------- - y : array, shape (n_samples,) - Target values. - - weights : array, shape (n_samples,) (default=1) - Weights or exposure to which variance is inverse proportional. - - ind_weight : float (default=0.5) - Must be between 0 and 1. Specifies how much weight is given to the - individual observations instead of the mean of y. - """ - return (ind_weight * y + - (1. 
- ind_weight) * np.average(y, weights=weights)) - def _mu_deviance_derivative(self, coef, X, y, weights, link): """Compute mu and the derivative of the deviance w.r.t coef.""" lin_pred = _safe_lin_pred(X, coef) From 0d7f9cd9babe928a01266f43f1f5e4e41829075b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 17 Jul 2019 15:05:49 +0200 Subject: [PATCH 092/269] Improve coverage --- sklearn/linear_model/_glm/distribution.py | 6 ++-- sklearn/linear_model/_glm/link.py | 10 +++---- sklearn/linear_model/_glm/tests/test_glm.py | 31 +++++++++++++++++++++ 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 18e7cf0c0a227..6fffd816ff8f9 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -121,7 +121,7 @@ def unit_variance(self, mu): mu : array, shape (n_samples,) Predicted mean. """ - pass + pass # pragma: no cover @abstractmethod def unit_variance_derivative(self, mu): @@ -134,7 +134,7 @@ def unit_variance_derivative(self, mu): mu : array, shape (n_samples,) Target values. """ - pass + pass # pragma: no cover @abstractmethod def unit_deviance(self, y, mu): @@ -153,7 +153,7 @@ def unit_deviance(self, y, mu): mu : array, shape (n_samples,) Predicted mean. """ - pass + pass # pragma: no cover def unit_deviance_derivative(self, y, mu): r"""Compute the derivative of the unit deviance w.r.t. mu. diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index f79f6163ada48..ec9a7b7736eb1 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -26,7 +26,7 @@ def link(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - pass + pass # pragma: no cover @abstractmethod def derivative(self, mu): @@ -37,7 +37,7 @@ def derivative(self, mu): mu : array, shape (n_samples,) Usually the (predicted) mean. """ - pass + pass # pragma: no cover @abstractmethod def inverse(self, lin_pred): @@ -51,7 +51,7 @@ def inverse(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - pass + pass # pragma: no cover @abstractmethod def inverse_derivative(self, lin_pred): @@ -62,7 +62,7 @@ def inverse_derivative(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. """ - pass + pass # pragma: no cover @abstractmethod def inverse_derivative2(self, lin_pred): @@ -73,7 +73,7 @@ def inverse_derivative2(self, lin_pred): lin_pred : array, shape (n_samples,) Usually the (fitted) linear predictor. 
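A note on the removal above: variance and variance_derivative were thin wrappers around the retained unit_variance API, so callers can still compute them directly. A hedged equivalent (assumes this branch is installed; the numbers are illustrative):

import numpy as np
from sklearn.linear_model._glm.distribution import GammaDistribution

dist = GammaDistribution()
mu = np.array([1.0, 2.0])
phi, weights = 1.5, np.array([1.0, 2.0])
variance = phi / weights * dist.unit_variance(mu)                        # Var[Y_i] = phi/s_i * v(mu_i)
variance_derivative = phi / weights * dist.unit_variance_derivative(mu)  # phi/s_i * v'(mu_i)
print(variance, variance_derivative)   # [1.5 3. ] [3. 3.]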
""" - pass + pass # pragma: no cover class IdentityLink(Link): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index ffac6dd11c243..a0ac77b65aa9f 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -219,6 +219,37 @@ def test_glm_log_regression(family, solver, tol): assert_allclose(res.coef_, coef, rtol=5e-6) +@pytest.mark.parametrize('fit_intercept', [True, False]) +def test_warm_start(fit_intercept): + n_samples, n_features = 100, 10 + n_predict = 10 + X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=42) + + glm1 = GeneralizedLinearRegressor( + warm_start=False, + fit_intercept=fit_intercept, + max_iter=1000 + ) + glm1.fit(X, y) + + glm2 = GeneralizedLinearRegressor( + warm_start=True, + fit_intercept=fit_intercept, + max_iter=1 + ) + glm2.fit(X, y) + assert glm1.score(X, y) > glm2.score(X, y) + glm2.set_params(max_iter=1000) + glm2.fit(X, y) + assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-4, atol=1e-5) + assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) + # TODO: investigate why this doesn't match + # assert glm1.n_iter_ == glm2.n_iter_ + 2 + + @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) @pytest.mark.parametrize('fit_intercept', [True, False]) @pytest.mark.parametrize('solver', GLM_SOLVERS) From dbffad84fcab9521265e85d5e7cbf9da0673b380 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 17 Jul 2019 15:44:44 +0200 Subject: [PATCH 093/269] Remove mentions of the binomial distribution --- sklearn/linear_model/_glm/glm.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index bff8ea43fd550..0115f9d160337 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -85,7 +85,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - family : {'normal', 'poisson', 'gamma', 'inverse.gaussian', 'binomial'} \ + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} \ or an instance of class ExponentialDispersionModel, \ optional(default='normal') The distributional assumption of the GLM, i.e. which distribution from @@ -101,8 +101,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' - - 'logit' for family 'binomial' - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) Method for estimation of the dispersion parameter phi. Whether to use the chi squared statistic or the deviance statistic. 
If None, the From 3187204dc4c1bb41a6131c49262c8380a34b9ec1 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 19 Jul 2019 17:37:11 +0200 Subject: [PATCH 094/269] Use common simple weight validation --- sklearn/linear_model/_glm/glm.py | 37 +++------------------ sklearn/linear_model/_glm/tests/test_glm.py | 19 ++--------- 2 files changed, 7 insertions(+), 49 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 0115f9d160337..546e71c583fa8 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -14,7 +14,7 @@ from ...base import BaseEstimator, RegressorMixin from ...utils import check_array, check_X_y from ...utils.optimize import _check_optimize_result -from ...utils.validation import check_is_fitted +from ...utils.validation import check_is_fitted, _check_sample_weight from .distribution import ( ExponentialDispersionModel, TweedieDistribution, @@ -28,33 +28,6 @@ ) -def _check_weights(sample_weight, n_samples): - """Check that sample weights are non-negative and have the right shape.""" - if sample_weight is None: - weights = np.ones(n_samples) - elif np.isscalar(sample_weight): - if sample_weight <= 0: - raise ValueError("Sample weights must be non-negative.") - weights = sample_weight * np.ones(n_samples) - else: - _dtype = [np.float64, np.float32] - weights = check_array(sample_weight, accept_sparse=False, - force_all_finite=True, ensure_2d=False, - dtype=_dtype) - if weights.ndim > 1: - raise ValueError("Sample weight must be 1D array or scalar") - elif weights.shape[0] != n_samples: - raise ValueError("Sample weights must have the same length as " - "y") - if not np.all(weights >= 0): - raise ValueError("Sample weights must be non-negative.") - elif not np.sum(weights) > 0: - raise ValueError("Sample weights must have at least one positive " - "element.") - - return weights - - class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. @@ -323,7 +296,7 @@ def fit(self, X, y, sample_weight=None): copy=self.copy_X) y = np.asarray(y, dtype=np.float64) - weights = _check_weights(sample_weight, y.shape[0]) + weights = _check_sample_weight(sample_weight, X) n_samples, n_features = X.shape @@ -458,7 +431,7 @@ def predict(self, X, sample_weight=None): allow_nd=False) eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) - weights = _check_weights(sample_weight, X.shape[0]) + weights = _check_sample_weight(sample_weight, X) return mu*weights @@ -487,7 +460,7 @@ def estimate_phi(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], dtype=_dtype, y_numeric=True, multi_output=False) n_samples, n_features = X.shape - weights = _check_weights(sample_weight, n_samples) + weights = _check_sample_weight(sample_weight, X) eta = X @ self.coef_ if self.fit_intercept is True: eta += self.intercept_ @@ -542,7 +515,7 @@ def score(self, X, y, sample_weight=None): # Note, default score defined in RegressorMixin is R^2 score. 
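The D^2 score referred to in this comment generalizes R^2 by replacing squared error with the family deviance, D^2 = 1 - dev(y, mu) / dev(y, y_mean). A hedged by-hand sketch for the Poisson case (numpy/scipy only, not part of the diff):

import numpy as np
from scipy.special import xlogy

def poisson_dev(y, mu, w):
    # weighted Poisson deviance, as in ExponentialDispersionModel.deviance
    return np.sum(w * 2 * (xlogy(y, y / mu) - y + mu))

y = np.array([0., 2., 4.])
mu = np.array([0.5, 1.5, 3.5])          # model predictions
w = np.ones_like(y)
mu_null = np.average(y, weights=w)      # intercept-only ("null") model
d2 = 1 - poisson_dev(y, mu, w) / poisson_dev(y, np.full_like(y, mu_null), w)
print(d2)                               # ~0.78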
# TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) - weights = _check_weights(sample_weight, y.shape[0]) + weights = _check_sample_weight(sample_weight, X) mu = self.predict(X) dev = self._family_instance.deviance(y, mu, weights=weights) y_mean = np.average(y, weights=weights) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index a0ac77b65aa9f..2909c85ba8c40 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -41,8 +41,6 @@ def test_sample_weights_validation(): y = [1] weights = 0 glm = GeneralizedLinearRegressor(fit_intercept=False) - with pytest.raises(ValueError, match="weights must be non-negative"): - glm.fit(X, y, weights) # Positive weights are accepted glm.fit(X, y, sample_weight=1) @@ -54,21 +52,8 @@ def test_sample_weights_validation(): # 1d but wrong length weights = [1, 0] - with pytest.raises(ValueError, - match="weights must have the same length as y"): - glm.fit(X, y, weights) - - # 1d but only zeros (sum not greater than 0) - weights = [0, 0] - X = [[0], [1]] - y = [1, 2] - with pytest.raises(ValueError, - match="must have at least one positive element"): - glm.fit(X, y, weights) - - # 5. 1d but with a negative value - weights = [2, -1] - with pytest.raises(ValueError, match="weights must be non-negative"): + msg = r"sample_weight.shape == \(2,\), expected \(1,\)!" + with pytest.raises(ValueError, match=msg): glm.fit(X, y, weights) From cc03c1ad58db4108ea1bfc3d2d47225be8822bd8 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 19 Jul 2019 17:41:01 +0200 Subject: [PATCH 095/269] Simplify comments formatting --- sklearn/linear_model/_glm/glm.py | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 546e71c583fa8..ea3e4964529bf 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -208,12 +208,6 @@ def fit(self, X, y, sample_weight=None): ------- self : returns an instance of self. """ - ####################################################################### - # 1. input validation # - ####################################################################### - # 1.1 validate arguments of __init__ - # Guarantee that self._family_instance is an instance of class - # ExponentialDispersionModel if isinstance(self.family, ExponentialDispersionModel): self._family_instance = self.family elif self.family in EDM_DISTRIBUTIONS: @@ -288,7 +282,6 @@ def fit(self, X, y, sample_weight=None): family = self._family_instance link = self._link_instance - # 1.2 validate arguments of fit ####################################### _dtype = [np.float64, np.float32] _stype = ['csc', 'csr'] X, y = check_X_y(X, y, accept_sparse=_stype, @@ -300,7 +293,6 @@ def fit(self, X, y, sample_weight=None): n_samples, n_features = X.shape - # 1.4 additional validations ########################################## if self.check_input: if not np.all(family.in_y_range(y)): raise ValueError("Some value(s) of y are out of the valid " @@ -308,9 +300,8 @@ def fit(self, X, y, sample_weight=None): .format(family.__class__.__name__)) # TODO: if alpha=0 check that X is not rank deficient - ####################################################################### - # 2. 
rescaling of weights (sample_weight) # - ####################################################################### + # rescaling of sample_weight + # # IMPORTANT NOTE: Since we want to minimize # 1/(2*sum(sample_weight)) * deviance + L2, # deviance = sum(sample_weight * unit_deviance), @@ -319,9 +310,7 @@ def fit(self, X, y, sample_weight=None): weights_sum = np.sum(weights) weights = weights/weights_sum - ####################################################################### - # 3. initialization of coef = (intercept_, coef_) # - ####################################################################### + # initialization of coef = (intercept_, coef) # Note: Since phi=self.dispersion_ does not enter the estimation # of mu_i=E[y_i], set it to 1. @@ -338,12 +327,8 @@ def fit(self, X, y, sample_weight=None): else: coef = np.zeros(n_features) - ####################################################################### - # 4. fit # - ####################################################################### # algorithms for optimization - # 4.1 L-BFGS ########################################################## if solver == 'lbfgs': def func(coef, X, y, weights, alpha, family, link): mu, devp = \ @@ -371,9 +356,6 @@ def func(coef, X, y, weights, alpha, family, link): self.n_iter_ = _check_optimize_result("lbfgs", opt_res) coef = opt_res.x - ####################################################################### - # 5. postprocessing # - ####################################################################### if self.fit_intercept: self.intercept_ = coef[0] self.coef_ = coef[1:] @@ -425,9 +407,8 @@ def predict(self, X, sample_weight=None): C : array, shape (n_samples,) Returns predicted values times sample_weight. """ - # TODO: Is copy=True necessary? X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype='numeric', copy=True, ensure_2d=True, + dtype='numeric', ensure_2d=True, allow_nd=False) eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) From aa52b4a32e216bfdd516776ac6796f8c0b4e5137 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 22 Jul 2019 15:50:47 +0200 Subject: [PATCH 096/269] Refactor to use TweedieDistribition in metrics --- sklearn/linear_model/_glm/distribution.py | 33 ++++++++++++++--------- sklearn/metrics/regression.py | 14 +++------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 6fffd816ff8f9..b99e1b40b2871 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -330,21 +330,28 @@ def unit_variance_derivative(self, mu): def unit_deviance(self, y, mu): p = self.power - if p == 0: - # NormalDistribution - return (y - mu)**2 - if p == 1: - # PoissonDistribution - # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0 - return 2 * (xlogy(y, y/mu) - y + mu) + if p < 0: + # 'Extreme stable', y_true any realy number, y_pred > 0 + dev = 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p) * (2-p)) - + y * np.power(mu, 1-p)/(1-p) + + np.power(mu, 2-p)/(2-p)) + elif p == 0: + # Normal distribution, y_true and y_pred any real number + dev = (y - mu)**2 + elif p < 1: + raise ValueError("Tweedie deviance is only defined for p<=0 and " + "p>=1.") + elif p == 1: + # Poisson distribution + dev = 2 * (xlogy(y, y/mu) - y + mu) elif p == 2: - # GammaDistribution - return 2 * (np.log(mu/y) + y/mu - 1) + # Gamma distribution + dev = 2 * (np.log(mu/y) + y/mu - 1) else: - # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p)) - # - 
y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p)) - return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) - - y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + dev = 2 * (np.power(y, 2-p)/((1-p) * (2-p)) - + y * np.power(mu, 1-p)/(1-p) + + np.power(mu, 2-p)/(2-p)) + return dev class NormalDistribution(TweedieDistribution): diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 2cba3d31ec84a..2f6c442c1f824 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -665,6 +665,7 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): >>> mean_tweedie_deviance(y_true, y_pred, p=1) 1.4260... """ + from ..linear_model._glm.distribution import TweedieDistribution y_type, y_true, y_pred, _ = _check_reg_targets( y_true, y_pred, None, dtype=[np.float64, np.float32]) if y_type == 'continuous-multioutput': @@ -681,12 +682,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): # 'Extreme stable', y_true any realy number, y_pred > 0 if (y_pred <= 0).any(): raise ValueError(message + "strictly positive y_pred.") - dev = 2 * (np.power(np.maximum(y_true, 0), 2-p)/((1-p) * (2-p)) - - y_true * np.power(y_pred, 1-p)/(1-p) + - np.power(y_pred, 2-p)/(2-p)) elif p == 0: - # Normal distribution, y_true and y_pred any real number - dev = (y_true - y_pred)**2 + pass elif p < 1: raise ValueError("Tweedie deviance is only defined for p<=0 and " "p>=1.") @@ -695,12 +692,10 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): if (y_true < 0).any() or (y_pred <= 0).any(): raise ValueError(message + "non-negative y_true and strictly " "positive y_pred.") - dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred) elif p == 2: # Gamma distribution, y_true and y_pred > 0 if (y_true <= 0).any() or (y_pred <= 0).any(): raise ValueError(message + "strictly positive y_true and y_pred.") - dev = 2 * (np.log(y_pred/y_true) + y_true/y_pred - 1) else: if p < 2: # 1 < p < 2 is Compound Poisson, y_true >= 0, y_pred > 0 @@ -712,9 +707,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): raise ValueError(message + "strictly positive y_true and " "y_pred.") - dev = 2 * (np.power(y_true, 2-p)/((1-p) * (2-p)) - - y_true * np.power(y_pred, 1-p)/(1-p) + - np.power(y_pred, 2-p)/(2-p)) + dist = TweedieDistribution(power=p) + dev = dist.unit_deviance(y_true, y_pred) return np.average(dev, weights=sample_weight) From 816aa8f36f3f96a6616a7aac0b4d2583b71f0fc7 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jul 2019 07:54:28 +0200 Subject: [PATCH 097/269] WIP --- .../plot_poisson_regression_non_normal_loss.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 0537704b2cf1f..6b12370a9433c 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -25,7 +25,6 @@ import numpy as np import matplotlib.pyplot as plt import pandas as pd -from scipy.special import xlogy from sklearn.compose import ColumnTransformer from sklearn.linear_model import GeneralizedLinearRegressor, LinearRegression @@ -36,6 +35,7 @@ from sklearn.ensemble import GradientBoostingRegressor from sklearn.metrics import mean_squared_error, mean_absolute_error +from sklearn.metrics import mean_poisson_deviance def load_mtpl2(n_samples=100000): @@ -140,13 +140,6 @@ def load_mtpl2(n_samples=100000): 
df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) -def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): - y_true = np.atleast_1d(y_true) - y_pred = np.atleast_1d(y_pred) - dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred) - return np.average(dev, weights=sample_weights) - - eps = 1e-5 print("MSE: %.3f" % mean_squared_error( df_test.Frequency.values, np.zeros(len(df_test)), @@ -154,7 +147,7 @@ def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): print("MAE: %.3f" % mean_absolute_error( df_test.Frequency.values, np.zeros(len(df_test)), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( +print("mean Poisson deviance: %.3f" % mean_poisson_deviance( df_test.Frequency.values, eps + np.zeros(len(df_test)), df_test.Exposure.values)) @@ -175,7 +168,7 @@ def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): print("MSE: %.3f" % mean_absolute_error( df_test.Frequency.values, linregr.predict(X_test), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( +print("mean Poisson deviance: %.3f" % mean_poisson_deviance( df_test.Frequency.values, np.fmax(linregr.predict(X_test), eps), df_test.Exposure.values)) @@ -201,7 +194,7 @@ def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): print("MAE: %.3f" % mean_absolute_error( df_test.Frequency.values, glm_freq.predict(X_test), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( +print("mean Poisson deviance: %.3f" % mean_poisson_deviance( df_test.Frequency.values, glm_freq.predict(X_test), df_test.Exposure.values)) @@ -221,7 +214,7 @@ def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None): df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) print("MAE: %.3f" % mean_absolute_error( df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score( +print("mean Poisson deviance: %.3f" % mean_poisson_deviance( df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) ############################################################################## From 6500c81a354c13de43ce0956240893b21c3979fb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jul 2019 16:47:13 +0200 Subject: [PATCH 098/269] Use Poisson deviance in examples --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 6b12370a9433c..ba9c0c8898fcd 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -184,7 +184,7 @@ def load_mtpl2(n_samples=100000): # # Next we fit the Poisson regressor on the target variable, -glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0) +glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0, max_iter=1000) glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) print("PoissonRegressor") @@ -215,7 +215,8 @@ def load_mtpl2(n_samples=100000): print("MAE: %.3f" % mean_absolute_error( df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, gbr.predict(X_test), 
df_test.Exposure.values)) + df_test.Frequency.values, np.fmax(gbr.predict(X_test), eps), + df_test.Exposure.values)) ############################################################################## # From 59a6d9dd2aeae4901f138f6d1c892c48c427f9a8 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 25 Jul 2019 18:22:26 +0200 Subject: [PATCH 099/269] Use PoissonRegressor and GammaRegressor in examples --- doc/modules/classes.rst | 2 + ...plot_poisson_regression_non_normal_loss.py | 4 +- ...lot_tweedie_regression_insurance_claims.py | 15 +- sklearn/linear_model/__init__.py | 2 +- sklearn/linear_model/_glm/__init__.py | 4 +- sklearn/linear_model/_glm/glm.py | 146 +++++++++++++++--- sklearn/linear_model/_glm/tests/test_glm.py | 3 +- 7 files changed, 145 insertions(+), 31 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 6346c0c65fadc..177cd0780f9be 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -750,6 +750,7 @@ Kernels: linear_model.BayesianRidge linear_model.ElasticNet linear_model.ElasticNetCV + linear_model.GammaRegressor linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars @@ -771,6 +772,7 @@ Kernels: linear_model.PassiveAggressiveClassifier linear_model.PassiveAggressiveRegressor linear_model.Perceptron + linear_model.PoissonRegressor linear_model.RANSACRegressor linear_model.Ridge linear_model.RidgeClassifier diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index ba9c0c8898fcd..713866a712aea 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -27,7 +27,7 @@ import pandas as pd from sklearn.compose import ColumnTransformer -from sklearn.linear_model import GeneralizedLinearRegressor, LinearRegression +from sklearn.linear_model import PoissonRegressor, LinearRegression from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder @@ -184,7 +184,7 @@ def load_mtpl2(n_samples=100000): # # Next we fit the Poisson regressor on the target variable, -glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0, max_iter=1000) +glm_freq = PoissonRegressor(alpha=0, max_iter=1000) glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) print("PoissonRegressor") diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index fba1cc42e20a7..55d0ca24ce477 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -39,7 +39,8 @@ import pandas as pd from sklearn.compose import ColumnTransformer -from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model import PoissonRegressor, GammaRegressor +from sklearn.linear_model._glm import GeneralizedLinearRegressor from sklearn.linear_model._glm.distribution import TweedieDistribution from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline @@ -192,7 +193,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # Some of the features are colinear, we use a weak penalization to avoid # numerical issues. 
-glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=1e-2) +glm_freq = PoissonRegressor(alpha=1e-2) glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) @@ -330,7 +331,7 @@ def score_estimator( mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 -glm_sev = GeneralizedLinearRegressor(family="gamma") +glm_sev = GammaRegressor() glm_sev.fit( X_train[mask_train.values], @@ -464,12 +465,16 @@ def score(self, X, y, sample_weight=None): # this takes a while params = { "family": [ - TweedieDistribution(power=power) for power in np.linspace(1, 2, 8) + TweedieDistribution(power=power) + # exclude upper bound as power=2 does not support null y samples. + for power in np.linspace(1 + 1e-4, 2 - 1e-4, 8) ] } + glm_total = GridSearchCV( - GeneralizedLinearRegressor(), cv=3, param_grid=params, n_jobs=-1 + GeneralizedLinearRegressor(tol=1e-3, max_iter=500), cv=3, + param_grid=params, n_jobs=-1 ) glm_total.fit( X_train, df_train["ClaimAmount"], sample_weight=df_train["Exposure"] diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index e8f63e95cdfa8..d4227a126e5ec 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,7 +18,7 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm import (GeneralizedLinearRegressor, PoissonRegressor) +from ._glm import (GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py index ac0f2c206ffe0..a7d4b67817730 100644 --- a/sklearn/linear_model/_glm/__init__.py +++ b/sklearn/linear_model/_glm/__init__.py @@ -1,5 +1,5 @@ # License: BSD 3 clause -from .glm import (GeneralizedLinearRegressor, PoissonRegressor) +from .glm import GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor -__all__ = ['GeneralizedLinearRegressor', 'PoissonRegressor'] +__all__ = ["GeneralizedLinearRegressor", "PoissonRegressor", "GammaRegressor"] diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index ea3e4964529bf..12486e188951a 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -24,7 +24,6 @@ Link, IdentityLink, LogLink, - LogitLink, ) @@ -64,7 +63,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The distributional assumption of the GLM, i.e. which distribution from the EDM, specifies the loss function to be minimized. - link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, \ + link : {'auto', 'identity', 'log'} or an instance of class Link, \ optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (mu). Option 'auto' sets the link depending on @@ -160,11 +159,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): References ---------- - For the coordinate descent implementation: - * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - An Improved GLMNET for L1-regularized Logistic Regression, - Journal of Machine Learning Research 13 (2012) 1999-2030 - https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. Jørgensen, B. (1992). 
The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. + `_ """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', @@ -240,12 +241,10 @@ def fit(self, X, y, sample_weight=None): self._link_instance = IdentityLink() elif self.link == 'log': self._link_instance = LogLink() - elif self.link == 'logit': - self._link_instance = LogitLink() else: raise ValueError( "The link must be an instance of class Link or " - "an element of ['auto', 'identity', 'log', 'logit']; " + "an element of ['auto', 'identity', 'log']; " "got (link={0})".format(self.link)) if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: @@ -606,18 +605,127 @@ class PoissonRegressor(GeneralizedLinearRegressor): References ---------- - For the coordinate descent implementation: - * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin - An Improved GLMNET for L1-regularized Logistic Regression, - Journal of Machine Learning Research 13 (2012) 1999-2030 - https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. + `_ """ - def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, - solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + def __init__(self, alpha=1.0, fit_intercept=True, link='log', + fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, check_input=True, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family="poisson", link=link, + fit_dispersion=fit_dispersion, solver=solver, + max_iter=max_iter, tol=tol, warm_start=warm_start, + copy_X=copy_X, verbose=verbose) + + +class GammaRegressor(GeneralizedLinearRegressor): + """Regression with the response variable y following a Gamma distribution + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). + The fit minimizes the following objective function with L2 regularization:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms and thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statistic or the deviance statistic. If None, the + dispersion is not estimated. 
+ + solver : {'lbfgs'}, optional (default='lbfgs') + Algorithm to use in the optimization problem: + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + max_iter : int, optional (default=100) + The maximal number of iterations for solver algorithms. + + tol : float, optional (default=1e-4) + Stopping criterion. For the lbfgs solver, + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative) of + the objective function. + + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` . + + copy_X : boolean, optional, (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. + + n_iter_ : int + Actual number of iterations used in solver. + + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. + + + References + ---------- + .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. 
+ `_ + """ + def __init__(self, alpha=1.0, fit_intercept=True, link='log', + fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="poisson", link='log', + family="gamma", link=link, fit_dispersion=fit_dispersion, solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 2909c85ba8c40..089f251d77049 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -77,8 +77,7 @@ def test_glm_family_argument(f, fam): @pytest.mark.parametrize('l, link', [('identity', IdentityLink()), - ('log', LogLink()), - ('logit', LogitLink())]) + ('log', LogLink())]) def test_glm_link_argument(l, link): """Test GLM link argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions From 03a8a2d06acaf1b2d57bdef51991fecb7482cf75 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 11:12:48 +0200 Subject: [PATCH 100/269] Improve documentation wording --- doc/modules/linear_model.rst | 41 ++++++++++--------- ...plot_poisson_regression_non_normal_loss.py | 5 +-- sklearn/linear_model/__init__.py | 6 ++- sklearn/linear_model/_glm/tests/test_glm.py | 1 - 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index b6f7c2b82c1f5..0eaaab2ecded5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -893,7 +893,7 @@ to warm-starting (see :term:`Glossary `). Generalized Linear Regression ============================= -:class:`GeneralizedLinearRegressor` generalizes linear models in two ways +Generalized Linear Models (GLM) extend linear models in two ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear combination of the input variables :math:`X` via an inverse link function :math:`h` as @@ -901,13 +901,31 @@ combination of the input variables :math:`X` via an inverse link function .. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [11]_. The objective function being -minimized becomes +exponential dispersion model (EDM) [11]_. The minimized objective function is +the penalized negative log likelihood, .. math:: \frac{1}{2 \sum s_i}D(y, \hat{y}; s) +\frac{\alpha}{2} ||w||_2 with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. +The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +likelihood as + +.. 
math:: d(y, \mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) + - loglike(y,y,\phi)\right) \\ + D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) + +===================================== =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` +===================================== =============================== ================================= ============================================ +Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` +Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +===================================== =============================== ================================= ============================================ + + In the following use cases, a loss different from the squared loss might be appropriate, @@ -968,23 +986,6 @@ The objective function (the penalized negative log likelihood) is independent of :math:`\phi` and is minimized with respect to the coefficients :math:`w`. -The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` -likelihood as - -.. math:: d(y, \mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - - loglike(y,y,\phi)\right) \\ - D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) - -===================================== =============================== ================================= ============================================ -Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` -===================================== =============================== ================================= ============================================ -Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` -Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` -Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` -===================================== =============================== ================================= ============================================ - Two remarks: * The deviances for at least Normal, Poisson and Gamma distributions are diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 713866a712aea..695a7c7594472 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -4,9 +4,8 @@ ====================================== This example illustrates the use of linear Poisson regression -on the French Motor Third-Party Liability Claims dataset [1] and compare -it with learning models with least squared error. - +on the French Motor Third-Party Liability Claims dataset [1] and compares +it with models learned with least squared error. We start by defining a few helper functions for loading the data and visualizing results. 
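For a quick numerical check of the unit deviance formulas documented above, the Poisson case can be evaluated directly with NumPy (a minimal sketch using made-up values, not part of the patch; scipy's xlogy handles y == 0 the same way the distribution code does):

import numpy as np
from scipy.special import xlogy

y = np.array([0.0, 1.0, 3.0])    # toy observed counts (assumed values)
mu = np.array([0.5, 1.0, 2.0])   # toy predicted means, must be strictly positive

# Poisson unit deviance from the table: d(y, mu) = 2*(y*log(y/mu) - y + mu);
# xlogy(0, .) returns 0, which matches the y -> 0 limit of y*log(y/mu).
unit_dev = 2 * (xlogy(y, y / mu) - y + mu)
print(unit_dev)          # per-sample deviances
print(unit_dev.mean())   # unweighted mean Poisson deviance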
diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index d4227a126e5ec..63a52a9c9898e 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,7 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm import (GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor) +from ._glm import (GeneralizedLinearRegressor, PoissonRegressor, + GammaRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -81,4 +82,5 @@ 'ridge_regression', 'RANSACRegressor', 'GeneralizedLinearRegressor', - 'PoissonRegressor'] + 'PoissonRegressor', + 'GammaRegressor'] diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 089f251d77049..542c18b65cad2 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -11,7 +11,6 @@ from sklearn.linear_model._glm.link import ( IdentityLink, LogLink, - LogitLink, ) from sklearn.linear_model._glm.distribution import ( TweedieDistribution, From bbf7f38fa69a9335397fa1cdfd73d153cbc3670d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 11:29:54 +0200 Subject: [PATCH 101/269] Use dataframe OpenML fetcher --- ...plot_poisson_regression_non_normal_loss.py | 19 ++++++++----------- ...lot_tweedie_regression_insurance_claims.py | 19 ++++++++----------- 2 files changed, 16 insertions(+), 22 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 695a7c7594472..347a424d8f4fe 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -25,6 +25,7 @@ import matplotlib.pyplot as plt import pandas as pd +from sklearn.datasets import fetch_openml from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, LinearRegression from sklearn.model_selection import train_test_split @@ -46,20 +47,16 @@ def load_mtpl2(n_samples=100000): number of samples to select (for faster run time). 
""" - # Note: this should use the OpenML DataFrame fetcher in the future - df_freq = pd.read_csv( - "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", - dtype={"IDpol": np.int}, - index_col=0, - ) + # freMTPL2freq dataset from https://www.openml.org/d/41214 + df_freq = fetch_openml(data_id=41214, as_frame=True)['data'] + df_freq['IDpol'] = df_freq['IDpol'].astype(np.int) + df_freq.set_index('IDpol', inplace=True) - df_sev = pd.read_csv( - "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", - index_col=0, - ) + # freMTPL2sev dataset from https://www.openml.org/d/41215 + df_sev = fetch_openml(data_id=41215, as_frame=True)['data'] # sum ClaimAmount over identical IDs - df_sev = df_sev.groupby(level=0).sum() + df_sev = df_sev.groupby('IDpol').sum() df = df_freq.join(df_sev, how="left") df["ClaimAmount"].fillna(0, inplace=True) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 55d0ca24ce477..418127699dc1a 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -38,6 +38,7 @@ import matplotlib.pyplot as plt import pandas as pd +from sklearn.datasets import fetch_openml from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, GammaRegressor from sklearn.linear_model._glm import GeneralizedLinearRegressor @@ -59,20 +60,16 @@ def load_mtpl2(n_samples=100000): number of samples to select (for faster run time). """ - # Note: this should use the OpenML DataFrame fetcher in the future - df_freq = pd.read_csv( - "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", - dtype={"IDpol": np.int}, - index_col=0, - ) + # freMTPL2freq dataset from https://www.openml.org/d/41214 + df_freq = fetch_openml(data_id=41214, as_frame=True)['data'] + df_freq['IDpol'] = df_freq['IDpol'].astype(np.int) + df_freq.set_index('IDpol', inplace=True) - df_sev = pd.read_csv( - "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", - index_col=0, - ) + # freMTPL2sev dataset from https://www.openml.org/d/41215 + df_sev = fetch_openml(data_id=41215, as_frame=True)['data'] # sum ClaimAmount over identical IDs - df_sev = df_sev.groupby(level=0).sum() + df_sev = df_sev.groupby('IDpol').sum() df = df_freq.join(df_sev, how="left") df["ClaimAmount"].fillna(0, inplace=True) From 49a3a8e116322c59f56b9d0b3fdb77028b144dd3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 12:00:26 +0200 Subject: [PATCH 102/269] Refactor distibution bounds --- sklearn/linear_model/_glm/distribution.py | 111 +++++------------- .../_glm/tests/test_distribution.py | 21 +++- 2 files changed, 50 insertions(+), 82 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index b99e1b40b2871..e6635d2003850 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -6,6 +6,7 @@ # License: BSD 3 clause from abc import ABCMeta, abstractmethod +from collections import namedtuple import numbers import numpy as np @@ -20,6 +21,10 @@ def _safe_lin_pred(X, coef): return X @ coef +DistributionBoundary = namedtuple("DistributionBoundary", + ("value", "inclusive")) + + class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). 
@@ -35,13 +40,6 @@ class ExponentialDispersionModel(metaclass=ABCMeta): unit variance :math:`v(\mu)` and unit deviance :math:`d(y,\mu)`. - Attributes - ---------- - lower_bound - upper_bound - include_lower_bound - include_upper_bound - Methods ------- deviance @@ -52,55 +50,33 @@ class ExponentialDispersionModel(metaclass=ABCMeta): unit_variance unit_variance_derivative - _mu_deviance_derivative - References ---------- - https://en.wikipedia.org/wiki/Exponential_dispersion_model. """ - @property - def lower_bound(self): - """Get the lower bound of values for Y~EDM.""" - return self._lower_bound - @property - def upper_bound(self): - """Get the upper bound of values for Y~EDM.""" - return self._upper_bound - - @property - def include_lower_bound(self): - """Get True if lower bound for y is included: y >= lower_bound.""" - return self._include_lower_bound - - @property - def include_upper_bound(self): - """Get True if upper bound for y is included: y <= upper_bound.""" - return self._include_upper_bound - - def in_y_range(self, x): - """Returns ``True`` if x is in the valid range of Y~EDM. + def in_y_range(self, y): + """Returns ``True`` if y is in the valid range of Y~EDM. Parameters ---------- - x : array, shape (n_samples,) + y : array, shape (n_samples,) Target values. """ - if self.include_lower_bound: - if self.include_upper_bound: - return np.logical_and(np.greater_equal(x, self.lower_bound), - np.less_equal(x, self.upper_bound)) - else: - return np.logical_and(np.greater_equal(x, self.lower_bound), - np.less(x, self.upper_bound)) + if hasattr(self, '_upper_bound'): + # All currently supported distributions have an upper bound at + # +inf, however this may need to be implemented for other + # distributions + raise NotImplementedError + + if not isinstance(self._lower_bound, DistributionBoundary): + raise TypeError('_lower_bound attribute must be of type ' + 'DistributionBoundary') + + if self._lower_bound.inclusive: + return np.greater_equal(y, self._lower_bound.value) else: - if self.include_upper_bound: - return np.logical_and(np.greater(x, self.lower_bound), - np.less_equal(x, self.upper_bound)) - else: - return np.logical_and(np.greater(x, self.lower_bound), - np.less(x, self.upper_bound)) + return np.greater(y, self._lower_bound.value) @abstractmethod def unit_variance(self, mu): @@ -265,42 +241,17 @@ def power(self, power): raise TypeError('power must be a real number, input was {0}' .format(power)) - self._upper_bound = np.Inf - self._include_upper_bound = False - if power < 0: - # Extreme Stable - self._lower_bound = -np.Inf - self._include_lower_bound = False - elif power == 0: - # NormalDistribution - self._lower_bound = -np.Inf - self._include_lower_bound = False - elif (power > 0) and (power < 1): + if power <= 0: + # Extreme Stable or Normal distribution + self._lower_bound = DistributionBoundary(-np.Inf, inclusive=False) + elif 0 < power < 1: raise ValueError('For 0 1) and (power < 2): - # Compound Poisson - self._lower_bound = 0 - self._include_lower_bound = True - elif power == 2: - # GammaDistribution - self._lower_bound = 0 - self._include_lower_bound = False - elif (power > 2) and (power < 3): - # Positive Stable - self._lower_bound = 0 - self._include_lower_bound = False - elif power == 3: - # InverseGaussianDistribution - self._lower_bound = 0 - self._include_lower_bound = False - elif power > 3: - # Positive Stable - self._lower_bound = 0 - self._include_lower_bound = False + elif 1 <= power < 2: + # Poisson or Compound Poisson distribution + self._lower_bound = 
DistributionBoundary(0, inclusive=True) + elif power >= 2: + # Gamma, Positive Stable, Inverse Gaussian distributions + self._lower_bound = DistributionBoundary(0, inclusive=False) else: # pragma: no cover # this branch should be unreachable. raise ValueError diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index f984077bed5cb..f9c329f35caa4 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -13,6 +13,7 @@ TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, + DistributionBoundary ) @@ -30,6 +31,19 @@ def test_family_bounds(family, expected): assert_array_equal(result, expected) +def test_invalid_distribution_bound(): + dist = TweedieDistribution() + dist._lower_bound = 0 + with pytest.raises(TypeError, + match="must be of type DistributionBoundary"): + dist.in_y_range([-1, 0, 1]) + + dist = TweedieDistribution() + dist._upper_bound = None + with pytest.raises(NotImplementedError): + dist.in_y_range([-1, 0, 1]) + + def test_tweedie_distribution_power(): with pytest.raises(ValueError, match="no distribution exists"): TweedieDistribution(power=0.5) @@ -42,9 +56,12 @@ def test_tweedie_distribution_power(): dist.power = 1j dist = TweedieDistribution() - assert dist._include_lower_bound is False + assert isinstance(dist._lower_bound, DistributionBoundary) + + assert dist._lower_bound.inclusive is False dist.power = 1 - assert dist._include_lower_bound is True + assert dist._lower_bound.value == 0.0 + assert dist._lower_bound.inclusive is True @pytest.mark.parametrize( From 228e8c8633d3bae7bbe9a36fa3fccb97567fd653 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 15:13:58 +0200 Subject: [PATCH 103/269] Move deviance checks under destribution --- sklearn/linear_model/_glm/distribution.py | 70 +++++++++++++++++-- .../_glm/tests/test_distribution.py | 3 +- sklearn/metrics/regression.py | 35 +--------- sklearn/metrics/tests/test_regression.py | 12 ++-- 4 files changed, 74 insertions(+), 46 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index e6635d2003850..c30996662114e 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -113,7 +113,7 @@ def unit_variance_derivative(self, mu): pass # pragma: no cover @abstractmethod - def unit_deviance(self, y, mu): + def unit_deviance(self, y, mu, check_input=False): r"""Compute the unit deviance. The unit_deviance :math:`d(y,\mu)` can be defined by the @@ -128,6 +128,14 @@ def unit_deviance(self, y, mu): mu : array, shape (n_samples,) Predicted mean. + + check_input : bool, default=False + If True raise an exception on invalid y or mu values, otherwise + they will be propagated as NaN. 
+ Returns + ------- + deviance: array, shape (n_samples,) + Computed deviance """ pass # pragma: no cover @@ -245,7 +253,8 @@ def power(self, power): # Extreme Stable or Normal distribution self._lower_bound = DistributionBoundary(-np.Inf, inclusive=False) elif 0 < power < 1: - raise ValueError('For 0=1.') elif 1 <= power < 2: # Poisson or Compound Poisson distribution self._lower_bound = DistributionBoundary(0, inclusive=True) @@ -279,15 +288,66 @@ def unit_variance_derivative(self, mu): """ return self.power * np.power(mu, self.power - 1) - def unit_deviance(self, y, mu): + def unit_deviance(self, y, mu, check_input=False): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + check_input : bool, default=False + If True raise an exception on invalid y or mu values, otherwise + they will be propagated as NaN. + Returns + ------- + deviance: array, shape (n_samples,) + Computed deviance + """ p = self.power + + if check_input: + message = ("Mean Tweedie deviance error with p={} can only be " + "used on ".format(p)) + if p < 0: + # 'Extreme stable', y any realy number, mu > 0 + if (mu <= 0).any(): + raise ValueError(message + "strictly positive mu.") + elif p == 0: + # Normal, y and mu can be any real number + pass + elif 0 < p < 1: + raise ValueError("Tweedie deviance is only defined for p<=0 " + "and p>=1.") + elif 1 <= p < 2: + # Poisson and Compount poisson distribution, y >= 0, mu > 0 + if (y < 0).any() or (mu <= 0).any(): + raise ValueError(message + "non-negative y and strictly " + "positive mu.") + elif p >= 2: + # Gamma and Extreme stable distribution, y and mu > 0 + if (y <= 0).any() or (mu <= 0).any(): + raise ValueError(message + "strictly positive y and mu.") + else: # pragma: nocover + # Unreachable statement + raise ValueError + if p < 0: - # 'Extreme stable', y_true any realy number, y_pred > 0 + # 'Extreme stable', y any realy number, mu > 0 dev = 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p) * (2-p)) - y * np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p)) + elif p == 0: - # Normal distribution, y_true and y_pred any real number + # Normal distribution, y and mu any real number dev = (y - mu)**2 elif p < 1: raise ValueError("Tweedie deviance is only defined for p<=0 and " diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index f9c329f35caa4..82e493b7a2149 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -45,7 +45,8 @@ def test_invalid_distribution_bound(): def test_tweedie_distribution_power(): - with pytest.raises(ValueError, match="no distribution exists"): + msg = "distribution is only defined for p<=0 and p>=1" + with pytest.raises(ValueError, match=msg): TweedieDistribution(power=0.5) with pytest.raises(TypeError, match="must be a real number"): diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 2f6c442c1f824..538fd7eec4631 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -22,9 +22,7 @@ # Christian Lorentzen # License: BSD 3 clause - import numpy as np -from scipy.special import xlogy import warnings from ..utils.validation import (check_array, check_consistent_length, @@ -676,39 
+674,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): sample_weight = column_or_1d(sample_weight) sample_weight = sample_weight[:, np.newaxis] - message = ("Mean Tweedie deviance error with p={} can only be used on " - .format(p)) - if p < 0: - # 'Extreme stable', y_true any realy number, y_pred > 0 - if (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_pred.") - elif p == 0: - pass - elif p < 1: - raise ValueError("Tweedie deviance is only defined for p<=0 and " - "p>=1.") - elif p == 1: - # Poisson distribution, y_true >= 0, y_pred > 0 - if (y_true < 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "non-negative y_true and strictly " - "positive y_pred.") - elif p == 2: - # Gamma distribution, y_true and y_pred > 0 - if (y_true <= 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_true and y_pred.") - else: - if p < 2: - # 1 < p < 2 is Compound Poisson, y_true >= 0, y_pred > 0 - if (y_true < 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "non-negative y_true and strictly " - "positive y_pred.") - else: - if (y_true <= 0).any() or (y_pred <= 0).any(): - raise ValueError(message + "strictly positive y_true and " - "y_pred.") - dist = TweedieDistribution(power=p) - dev = dist.unit_deviance(y_true, y_pred) + dev = dist.unit_deviance(y_true, y_pred, check_input=True) return np.average(dev, weights=sample_weight) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 526c27f0a036c..d46bca1301b1e 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -101,35 +101,35 @@ def test_regression_metrics_at_limits(): assert_allclose(mean_tweedie_deviance([0], [1.], p=p), 2./(2.-p), rtol=1e-3) with pytest.raises(ValueError, - match="can only be used on strictly positive y_pred."): + match="can only be used on strictly positive mu."): mean_tweedie_deviance([0.], [0.], p=p) assert_almost_equal(mean_tweedie_deviance([0.], [0.], p=0), 0.00, 2) - msg = "only be used on non-negative y_true and strictly positive y_pred." + msg = "only be used on non-negative y and strictly positive mu." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=1.0) p = 1.5 assert_allclose(mean_tweedie_deviance([0.], [1.], p=p), 2./(2.-p)) - msg = "only be used on non-negative y_true and strictly positive y_pred." + msg = "only be used on non-negative y and strictly positive mu." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=p) p = 2. assert_allclose(mean_tweedie_deviance([1.], [1.], p=p), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y_true and y_pred." + msg = "can only be used on strictly positive y and mu." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=p) p = 3. assert_allclose(mean_tweedie_deviance([1.], [1.], p=p), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y_true and y_pred." + msg = "can only be used on strictly positive y and mu." 
with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], p=p) with pytest.raises(ValueError, - match="deviance is only defined for p<=0 and p>=1."): + match="is only defined for p<=0 and p>=1"): mean_tweedie_deviance([0.], [0.], p=0.5) From 09a57c9cfedfebb2452c50921260c310987a0057 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 18:12:55 +0200 Subject: [PATCH 104/269] Expose TweedieRegressor --- doc/modules/linear_model.rst | 6 +- ...lot_tweedie_regression_insurance_claims.py | 26 ++- sklearn/linear_model/__init__.py | 7 +- sklearn/linear_model/_glm/__init__.py | 14 +- sklearn/linear_model/_glm/glm.py | 154 ++++++++++++++++++ sklearn/linear_model/_glm/tests/test_glm.py | 38 ++++- 6 files changed, 221 insertions(+), 24 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 0eaaab2ecded5..cab918b06f3b0 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -949,10 +949,10 @@ Note that the feature matrix ``X`` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as follows: - >>> from sklearn.linear_model import GeneralizedLinearRegressor - >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + >>> from sklearn.linear_model import TweedieRegressor + >>> reg = TweedieRegressor(alpha=0.5, family='poisson', link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) - GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log') + TweedieRegressor(alpha=0.5, family='poisson', link='log') >>> reg.coef_ array([0.2463..., 0.4337...]) >>> reg.intercept_ diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 418127699dc1a..55a21c8d8723d 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -41,8 +41,7 @@ from sklearn.datasets import fetch_openml from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, GammaRegressor -from sklearn.linear_model._glm import GeneralizedLinearRegressor -from sklearn.linear_model._glm.distribution import TweedieDistribution +from sklearn.linear_model import TweedieRegressor from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder @@ -418,7 +417,6 @@ class ClaimProdEstimator: def __init__(self, est_freq, est_sev): self.est_freq = est_freq self.est_sev = est_sev - self._family_instance = TweedieDistribution(power=1.5) def predict(self, X, exposure): """Predict the total claim amount. @@ -429,11 +427,14 @@ def predict(self, X, exposure): def score(self, X, y, sample_weight=None): """Compute D², the percentage of deviance explained.""" + # TODO: remove this private import once d2_score is available + from sklearn.linear_model._glm.distribution import TweedieDistribution + mu = self.predict(X, exposure=sample_weight) - dev = self._family_instance.deviance(y, mu, weights=sample_weight) + family = TweedieDistribution(power=1.5) + dev = family.deviance(y, mu, weights=sample_weight) y_mean = np.average(y, weights=sample_weight) - dev_null = self._family_instance.deviance(y, y_mean, - weights=sample_weight) + dev_null = family.deviance(y, y_mean, weights=sample_weight) return 1. 
- dev / dev_null @@ -459,18 +460,13 @@ def score(self, X, y, sample_weight=None): from sklearn.model_selection import GridSearchCV -# this takes a while -params = { - "family": [ - TweedieDistribution(power=power) - # exclude upper bound as power=2 does not support null y samples. - for power in np.linspace(1 + 1e-4, 2 - 1e-4, 8) - ] -} +# exclude upper bound as power=2 does not support null y values. +params = {"power": np.linspace(1 + 1e-4, 2 - 1e-4, 8)} +# this takes a while glm_total = GridSearchCV( - GeneralizedLinearRegressor(tol=1e-3, max_iter=500), cv=3, + TweedieRegressor(tol=1e-3, max_iter=500), cv=3, param_grid=params, n_jobs=-1 ) glm_total.fit( diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 63a52a9c9898e..46d1efe63de2e 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,8 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) -from ._glm import (GeneralizedLinearRegressor, PoissonRegressor, - GammaRegressor) +from ._glm import (PoissonRegressor, + GammaRegressor, TweedieRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -83,4 +83,5 @@ 'RANSACRegressor', 'GeneralizedLinearRegressor', 'PoissonRegressor', - 'GammaRegressor'] + 'GammaRegressor', + 'TweedieRegressor'] diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py index a7d4b67817730..3b5c0d95d6124 100644 --- a/sklearn/linear_model/_glm/__init__.py +++ b/sklearn/linear_model/_glm/__init__.py @@ -1,5 +1,15 @@ # License: BSD 3 clause -from .glm import GeneralizedLinearRegressor, PoissonRegressor, GammaRegressor +from .glm import ( + GeneralizedLinearRegressor, + PoissonRegressor, + GammaRegressor, + TweedieRegressor +) -__all__ = ["GeneralizedLinearRegressor", "PoissonRegressor", "GammaRegressor"] +__all__ = [ + "GeneralizedLinearRegressor", + "PoissonRegressor", + "GammaRegressor", + "TweedieRegressor" +] diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 12486e188951a..ca33c45c76292 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -623,6 +623,15 @@ def __init__(self, alpha=1.0, fit_intercept=True, link='log', max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) + @property + def family(self): + return "poisson" + + @family.setter + def family(self, value): + if value != "poisson": + raise ValueError("PoissonRegressor.family must be 'poisson'!") + class GammaRegressor(GeneralizedLinearRegressor): """Regression with the response variable y following a Gamma distribution @@ -729,3 +738,148 @@ def __init__(self, alpha=1.0, fit_intercept=True, link='log', fit_dispersion=fit_dispersion, solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) + + @property + def family(self): + return "gamma" + + @family.setter + def family(self, value): + if value != "gamma": + raise ValueError("GammaRegressor.family must be 'gamma'!") + + +class TweedieRegressor(GeneralizedLinearRegressor): + """Regression with the response variable y following a Tweedie distribution + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). 
+ The fit minimizes the following objective function with L2 regularization:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + power : float (default=0) + The variance power: :math:`v(\mu) = \mu^{power}`. + For ``0`. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. + + + References + ---------- + .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + + .. Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. + `_ + """ + def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='log', + fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, check_input=True, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family=TweedieDistribution(power=power), link=link, + fit_dispersion=fit_dispersion, solver=solver, + max_iter=max_iter, tol=tol, warm_start=warm_start, + copy_X=copy_X, verbose=verbose) + + @property + def family(self): + dist = TweedieDistribution(power=self.power) + # TODO: make the returned object immutable + return dist + + @family.setter + def family(self, value): + if isinstance(value, TweedieDistribution): + self.power = value.power + else: + raise TypeError("TweedieRegressor.family must be of type " + "TweedieDistribution!") diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 542c18b65cad2..a56155fe03f22 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -7,7 +7,12 @@ import pytest from sklearn.datasets import make_regression -from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import GeneralizedLinearRegressor +from sklearn.linear_model import ( + TweedieRegressor, + PoissonRegressor, + GammaRegressor +) from sklearn.linear_model._glm.link import ( IdentityLink, LogLink, @@ -353,3 +358,34 @@ def test_convergence_warning(solver, regression_data): max_iter=1, tol=1e-20) with pytest.warns(ConvergenceWarning): est.fit(X, y) + + +def test_poisson_regression_family(regression_data): + est = PoissonRegressor() + est.family == "poisson" + + msg = "PoissonRegressor.family must be 'poisson'!" + with pytest.raises(ValueError, match=msg): + est.family = 0 + + +def test_gamma_regression_family(regression_data): + est = GammaRegressor() + est.family == "gamma" + + msg = "GammaRegressor.family must be 'gamma'!" + with pytest.raises(ValueError, match=msg): + est.family = 0 + + +def test_tweedie_regression_family(regression_data): + power = 2.0 + est = TweedieRegressor(power=power) + assert isinstance(est.family, TweedieDistribution) + assert est.family.power == power + msg = "TweedieRegressor.family must be of type TweedieDistribution!" 
+ with pytest.raises(TypeError, match=msg): + est.family = None + + # TODO: the following should not be allowed + # est.family.power = 2 From 4b485cac821d30ece504a43d0c3316140cec5d33 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 18:32:07 +0200 Subject: [PATCH 105/269] Improve documentation --- doc/modules/classes.rst | 2 +- doc/modules/linear_model.rst | 38 ++++++++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 177cd0780f9be..e5b61faf352e4 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -751,7 +751,6 @@ Kernels: linear_model.ElasticNet linear_model.ElasticNetCV linear_model.GammaRegressor - linear_model.GeneralizedLinearRegressor linear_model.HuberRegressor linear_model.Lars linear_model.LarsCV @@ -781,6 +780,7 @@ Kernels: linear_model.SGDClassifier linear_model.SGDRegressor linear_model.TheilSenRegressor + linear_model.TweedieRegressor .. autosummary:: :toctree: generated/ diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index cab918b06f3b0..f4a48fcaf3acd 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -916,14 +916,14 @@ likelihood as - loglike(y,y,\phi)\right) \\ D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) -===================================== =============================== ================================= ============================================ -Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` -===================================== =============================== ================================= ============================================ -Normal ("normal") :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson ("poisson") :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` -Gamma ("gamma") :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` -Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` -===================================== =============================== ================================= ============================================ +================= =============================== ================================= ============================================ +Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` +================= =============================== ================================= ============================================ +Normal :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` +Gamma :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +================= =============================== ================================= ============================================ In the following use cases, a loss different from the squared loss might be @@ -945,14 +945,32 @@ it is convenient to apply a link function different from the identity link :math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the log-link with :math:`h(x^\top w)=\exp(x^\top w)`. 
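The point about the log link guaranteeing positive predictions is easy to see numerically (a minimal NumPy sketch with arbitrary toy coefficients, not tied to any estimator in this series):

import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(5, 2))
w = np.array([0.5, -2.0])   # toy coefficients (assumed values)

lin_pred = X @ w            # identity link: predictions can take either sign
mu = np.exp(lin_pred)       # inverse log link h(z) = exp(z)
print(lin_pred)             # mixes signs in general
print(np.all(mu > 0))       # True: exp keeps every prediction strictly positive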
+:class:`linear_model.TweedieRegressor` implements a generalized linear model +for the Tweedie distribution, that allows to model any of the above mentionned +distribution using the appropriate power parameter `p`, + + - `p = 0`: Normal distribution. Specialized solvers such as + :class:`linear_model.Ridge`, :class:`linear_model.ElasticNet` are generally + more appropriate in this case. + + - `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for + convinience however it is strictly equivalent to `TweedieRegressor(power=1)`. + + - `p = 2`: Gamma distribution. :class:`GammaRegressor` is exposed for + convinience however it is also strictly equivalent to + `TweedieRegressor(power=2)`. + + - `p = 3`: Inverse Gamma distribution. + + Note that the feature matrix ``X`` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as follows: >>> from sklearn.linear_model import TweedieRegressor - >>> reg = TweedieRegressor(alpha=0.5, family='poisson', link='log') + >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) - TweedieRegressor(alpha=0.5, family='poisson', link='log') + TweedieRegressor(alpha=0.5, power=1) >>> reg.coef_ array([0.2463..., 0.4337...]) >>> reg.intercept_ From aa0adf1e6304d9f8aa3497e2fa406244e8d3405b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 26 Jul 2019 18:34:13 +0200 Subject: [PATCH 106/269] Lint --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index ca33c45c76292..7d7ef099cd04a 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -766,7 +766,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: :math:`v(\mu) = \mu^{power}`. + The variance power: :math:`v(\\mu) = \\mu^{power}`. For ``0 Date: Tue, 30 Jul 2019 14:13:46 +0200 Subject: [PATCH 107/269] Fix __init__ --- sklearn/linear_model/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 46d1efe63de2e..9f696a14985c3 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -81,7 +81,6 @@ 'orthogonal_mp_gram', 'ridge_regression', 'RANSACRegressor', - 'GeneralizedLinearRegressor', 'PoissonRegressor', 'GammaRegressor', 'TweedieRegressor'] From 7a9d0674174af423b5bfe8290631db10c53a946b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 2 Aug 2019 15:39:19 +0200 Subject: [PATCH 108/269] Update doc/modules/linear_model.rst Co-Authored-By: Joel Nothman --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index f4a48fcaf3acd..8123eac5fc4a2 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -954,7 +954,7 @@ distribution using the appropriate power parameter `p`, more appropriate in this case. - `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for - convinience however it is strictly equivalent to `TweedieRegressor(power=1)`. + convenience however it is strictly equivalent to `TweedieRegressor(power=1)`. - `p = 2`: Gamma distribution. 
:class:`GammaRegressor` is exposed for convinience however it is also strictly equivalent to From 18b45037fa6cb4d172c4e1b3cb92c1c33c0700c4 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 2 Aug 2019 15:39:29 +0200 Subject: [PATCH 109/269] Update doc/modules/linear_model.rst Co-Authored-By: Joel Nothman --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 8123eac5fc4a2..d159a8ad77039 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -957,7 +957,7 @@ distribution using the appropriate power parameter `p`, convenience however it is strictly equivalent to `TweedieRegressor(power=1)`. - `p = 2`: Gamma distribution. :class:`GammaRegressor` is exposed for - convinience however it is also strictly equivalent to + convenience however it is strictly equivalent to `TweedieRegressor(power=2)`. - `p = 3`: Inverse Gamma distribution. From 29658d66ff34eb633f2728d69a956cde760c271e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 2 Aug 2019 15:39:40 +0200 Subject: [PATCH 110/269] Update doc/modules/linear_model.rst Co-Authored-By: Joel Nothman --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d159a8ad77039..02c01b674f467 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -965,7 +965,7 @@ distribution using the appropriate power parameter `p`, Note that the feature matrix ``X`` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as -follows: +follows:: >>> from sklearn.linear_model import TweedieRegressor >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') From 1ea70d3a0c2da89ff164aef9975080de66a1cebd Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 7 Aug 2019 08:22:56 +0200 Subject: [PATCH 111/269] Fix typos in documentation --- doc/modules/linear_model.rst | 13 +++++++------ sklearn/linear_model/_glm/distribution.py | 2 +- sklearn/linear_model/_glm/glm.py | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 02c01b674f467..ce160d341af5f 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -733,8 +733,8 @@ of a single trial are modeled using a `logistic function `_. Logistic regression is implemented in :class:`LogisticRegression`. -This implementation can fit binary, One-vs-Rest, or multinomial logistic -regression with optional :math:`\ell_1`, :math:`\ell_2` or Elastic-Net +This implementation can fit binary, One-vs-Rest, or multinomial logistic +regression with optional :math:`\ell_1`, :math:`\ell_2` or Elastic-Net regularization. .. note:: @@ -946,18 +946,19 @@ it is convenient to apply a link function different from the identity link log-link with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`linear_model.TweedieRegressor` implements a generalized linear model -for the Tweedie distribution, that allows to model any of the above mentionned -distribution using the appropriate power parameter `p`, +for the Tweedie distribution, that allows to model any of the above mentioned +distributions using the appropriate power parameter `p`, - `p = 0`: Normal distribution. Specialized solvers such as :class:`linear_model.Ridge`, :class:`linear_model.ElasticNet` are generally more appropriate in this case. 
- `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for - convenience however it is strictly equivalent to `TweedieRegressor(power=1)`. + convenience. However, it is strictly equivalent to + `TweedieRegressor(power=1)`. - `p = 2`: Gamma distribution. :class:`GammaRegressor` is exposed for - convenience however it is strictly equivalent to + convenience. However, it is strictly equivalent to `TweedieRegressor(power=2)`. - `p = 3`: Inverse Gamma distribution. diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index c30996662114e..c7b4ec6c7836d 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -224,7 +224,7 @@ class TweedieDistribution(ExponentialDispersionModel): ===== ================ 0 Normal 1 Poisson - (0,1) Compound Poisson + (1,2) Compound Poisson 2 Gamma 3 Inverse Gaussian diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 7d7ef099cd04a..af411c38f87ca 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -776,7 +776,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): ===== ================ 0 Normal 1 Poisson - (0,1) Compound Poisson + (1,2) Compound Poisson 2 Gamma 3 Inverse Gaussian From efdcb5be1ed18d681c9b7c358b6f23adb6d0f795 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 16:58:46 +0300 Subject: [PATCH 112/269] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index ce160d341af5f..67e4fbbde182c 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -964,7 +964,7 @@ distributions using the appropriate power parameter `p`, - `p = 3`: Inverse Gamma distribution. -Note that the feature matrix ``X`` should be standardized before fitting. This +Note that the feature matrix `X` should be standardized before fitting. This ensures that the penalty treats features equally. The estimator can be used as follows:: From ef0d063ec406c73aaee715af4cf06e5ca1f3b78c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 16:59:26 +0300 Subject: [PATCH 113/269] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 67e4fbbde182c..9645690d594f6 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -945,7 +945,7 @@ it is convenient to apply a link function different from the identity link :math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the log-link with :math:`h(x^\top w)=\exp(x^\top w)`. 
-:class:`linear_model.TweedieRegressor` implements a generalized linear model +:class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned distributions using the appropriate power parameter `p`, From 0125e1cbe9105f8d4f4cf08ee652add64deee384 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 16:59:59 +0300 Subject: [PATCH 114/269] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 9645690d594f6..fe52cad236392 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -950,7 +950,7 @@ for the Tweedie distribution, that allows to model any of the above mentioned distributions using the appropriate power parameter `p`, - `p = 0`: Normal distribution. Specialized solvers such as - :class:`linear_model.Ridge`, :class:`linear_model.ElasticNet` are generally + :class:`Ridge`, :class:`ElasticNet` are generally more appropriate in this case. - `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for From 6a8a600258c8be783f02dad7519fc2aa012418eb Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 17:02:46 +0300 Subject: [PATCH 115/269] Update examples/linear_model/plot_poisson_regression_non_normal_loss.py Co-Authored-By: Nicolas Hug --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 347a424d8f4fe..6cee852866018 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -238,7 +238,7 @@ def load_mtpl2(n_samples=100000): y_pred = model.predict(X_train) pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=ax[idx+1]) - ax[idx+1].set_title(model.__class__.__name__) + ax[idx + 1].set_title(model.__class__.__name__) for axi in ax: axi.set( From 73f3bd1f646ec1dc1b9a3808149cc185d3fdea0d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 17:13:00 +0300 Subject: [PATCH 116/269] Rename inverse.gaussian to inverse-gaussian --- sklearn/linear_model/_glm/distribution.py | 2 +- sklearn/linear_model/_glm/glm.py | 6 +++--- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index c7b4ec6c7836d..950fa3fbb03e7 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -393,5 +393,5 @@ def __init__(self): 'normal': NormalDistribution, 'poisson': PoissonDistribution, 'gamma': GammaDistribution, - 'inverse.gaussian': InverseGaussianDistribution, + 'inverse-gaussian': InverseGaussianDistribution, } diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index af411c38f87ca..86f4c544d8f84 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -57,7 +57,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). 
- family : {'normal', 'poisson', 'gamma', 'inverse.gaussian'} \ + family : {'normal', 'poisson', 'gamma', 'inverse-gaussian'} \ or an instance of class ExponentialDispersionModel, \ optional(default='normal') The distributional assumption of the GLM, i.e. which distribution from @@ -71,7 +71,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'identity' for family 'normal' - - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' + - 'log' for families 'poisson', 'gamma', 'inverse-gaussian' fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) Method for estimation of the dispersion parameter phi. Whether to use @@ -217,7 +217,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError( "The family must be an instance of class" " ExponentialDispersionModel or an element of" - " ['normal', 'poisson', 'gamma', 'inverse.gaussian']" + " ['normal', 'poisson', 'gamma', 'inverse-gaussian']" "; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index a56155fe03f22..12edbefbf2833 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -65,7 +65,7 @@ def test_sample_weights_validation(): [('normal', NormalDistribution()), ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), - ('inverse.gaussian', InverseGaussianDistribution())]) + ('inverse-gaussian', InverseGaussianDistribution())]) def test_glm_family_argument(f, fam): """Test GLM family argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions From 11b178fffc7d279596acd122c7b6c33ba4b84a2f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 17:17:42 +0300 Subject: [PATCH 117/269] Remove sample_weight parameter from predict --- sklearn/linear_model/_glm/glm.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 86f4c544d8f84..921317cd0965a 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -388,32 +388,25 @@ def _linear_predictor(self, X): allow_nd=False) return X @ self.coef_ + self.intercept_ - def predict(self, X, sample_weight=None): + def predict(self, X): """Predict using GLM with feature matrix X. - If sample_weight is given, returns prediction*sample_weight. - Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) Samples. - sample_weight : {None, array-like}, shape (n_samples,), optional \ - (default=None) - Returns ------- C : array, shape (n_samples,) - Returns predicted values times sample_weight. + Returns predicted values. """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', ensure_2d=True, allow_nd=False) eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) - weights = _check_sample_weight(sample_weight, X) - - return mu*weights + return mu def estimate_phi(self, X, y, sample_weight=None): """Estimate/fit the dispersion parameter phi. 
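A minimal sketch of the resulting ``predict`` behaviour, reusing the estimator from the documentation example above; the ``exposure`` array is made up for illustration only.

    import numpy as np
    from sklearn.linear_model import TweedieRegressor

    reg = TweedieRegressor(power=1, alpha=0.5, link='log')
    reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2])

    # predict now returns only the estimated mean mu = h(X w + intercept);
    # it no longer accepts or applies sample_weight.
    mu = reg.predict([[0, 0], [0, 1], [2, 2]])

    # If weighted predictions are needed (e.g. expected counts per row),
    # scale the output explicitly with an exposure vector.
    exposure = np.array([1.0, 2.0, 0.5])
    expected_counts = mu * exposure
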
From 3806fbe05c164d3fd74aada0b8a068c43d3998cd Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 9 Aug 2019 17:22:20 +0300 Subject: [PATCH 118/269] Remove redundant check_array in predict --- sklearn/linear_model/_glm/glm.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 921317cd0965a..0ab9e8c1db777 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -384,7 +384,7 @@ def _linear_predictor(self, X): """ check_is_fitted(self, "coef_") X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype='numeric', copy=True, ensure_2d=True, + dtype='numeric', ensure_2d=True, allow_nd=False) return X @ self.coef_ + self.intercept_ @@ -401,9 +401,7 @@ def predict(self, X): C : array, shape (n_samples,) Returns predicted values. """ - X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype='numeric', ensure_2d=True, - allow_nd=False) + # check_array is done in _linear_predictor eta = self._linear_predictor(X) mu = self._link_instance.inverse(eta) return mu From ae1c6721876fe5e9f1f14cd092fb198067899457 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 11 Aug 2019 23:11:52 +0200 Subject: [PATCH 119/269] Update doc/modules/linear_model.rst --- doc/modules/linear_model.rst | 66 +++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index fe52cad236392..09b657c26b915 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -900,30 +900,31 @@ combination of the input variables :math:`X` via an inverse link function .. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). -Secondly, the squared loss function is replaced by the deviance :math:`D` of an -exponential dispersion model (EDM) [11]_. The minimized objective function is -the penalized negative log likelihood, +Secondly, the squared loss function is replaced by the unit deviance :math:`d` +of a reproductive exponential dispersion model (EDM) [11]_. The minimization +problem becomes -.. math:: \frac{1}{2 \sum s_i}D(y, \hat{y}; s) +\frac{\alpha}{2} ||w||_2 +.. math:: \min_{w} \frac{1}{2 \sum_i s_i} \sum_i s_i \cdot d(y_i, \hat{y}(w, x_i)) + \frac{\alpha}{2} ||w||_2 with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. - -The deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` +The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as .. math:: d(y, \mu) = -2\phi\cdot \left(loglike(y,\mu,\phi) - - loglike(y,y,\phi)\right) \\ - D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i) + - loglike(y,y,\phi)\right) + +The following table lists some specific EDM distributions—all are Tweedie +distributions—and some properties. 
-================= =============================== ================================= ============================================ -Distribution Target Domain Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` -================= =============================== ================================= ============================================ -Normal :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` -Gamma :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` -Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` -================= =============================== ================================= ============================================ +================= =============================== ====================================== ============================================ +Distribution Target Domain Unit Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` +================= =============================== ====================================== ============================================ +Normal :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` +Poisson :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` +Gamma :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` +Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` +================= =============================== ====================================== ============================================ In the following use cases, a loss different from the squared loss might be @@ -947,7 +948,8 @@ log-link with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned -distributions using the appropriate power parameter `p`, +distributions using the appropriate power parameter `p`, i.e. the exponent of +the unit variance function, - `p = 0`: Normal distribution. Specialized solvers such as :class:`Ridge`, :class:`ElasticNet` are generally @@ -964,9 +966,16 @@ distributions using the appropriate power parameter `p`, - `p = 3`: Inverse Gamma distribution. -Note that the feature matrix `X` should be standardized before fitting. This -ensures that the penalty treats features equally. The estimator can be used as -follows:: +Note: +* The feature matrix `X` should be standardized before fitting. This + ensures that the penalty treats features equally. +* If you want to model a relative frequency, i.e. counts per exposure (time, + volume, ...) you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together + with :math:`s=\mathrm{exposure}` as sample weights. This is done in both + examples linked below. + +The estimator can be used as follows:: >>> from sklearn.linear_model import TweedieRegressor >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') @@ -993,7 +1002,7 @@ In the unpenalized case, the assumptions are the following: with expectation :math:`\mu_i=\mathrm{E}[Y]`, dispersion parameter :math:`\phi` and sample weights :math:`s_i`. * The aim is to predict the expectation :math:`\mu_i` with - :math:`\hat{y_i} = h(\eta_i)`, linear predictor + :math:`\hat{y}_i = h(\eta_i)`, linear predictor :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. 
Note that the first assumption implies @@ -1001,21 +1010,16 @@ Note that the first assumption implies function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the same as specifying a unit variance function (they are one-to-one). -The objective function (the penalized negative log likelihood) is -independent of :math:`\phi` and is minimized with respect to the -coefficients :math:`w`. - -Two remarks: +A few remarks: +* The deviance is independent of :math:`\phi`. Therefore, also the estimation + of the coefficients :math:`w` is independent of the dispersion parameter of + the EDM. +* The minimization is equivalent to (penalized) maximum likelihood estimation. * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. (19)-(20) in [12]_. -* If you want to model a frequency, i.e. counts per exposure (time, volume, ...) - you can do so by a Poisson distribution and passing - :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together - with :math:`s=\mathrm{exposure}` as sample weights. - .. topic:: References: From f07c831c29bc37ab0b922e1482d9f027f84049c1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 11 Aug 2019 23:32:57 +0200 Subject: [PATCH 120/269] Remove dispersion --- sklearn/linear_model/_glm/glm.py | 116 +++----------------- sklearn/linear_model/_glm/tests/test_glm.py | 18 --- 2 files changed, 15 insertions(+), 119 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 0ab9e8c1db777..cf9b00527c8cf 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -73,11 +73,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'log' for families 'poisson', 'gamma', 'inverse-gaussian' - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) - Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statistic or the deviance statistic. If None, the - dispersion is not estimated. - solver : {'auto', 'lbfgs'}, optional (default='auto') Algorithm to use in the optimization problem: @@ -124,9 +119,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. - dispersion_ : float - The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. - n_iter_ : int Actual number of iterations used in solver. @@ -169,14 +161,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', - fit_dispersion=None, solver='auto', max_iter=100, - tol=1e-4, warm_start=False, + solver='auto', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.fit_intercept = fit_intercept self.family = family self.link = link - self.fit_dispersion = fit_dispersion self.solver = solver self.max_iter = max_iter self.tol = tol @@ -310,8 +300,8 @@ def fit(self, X, y, sample_weight=None): weights = weights/weights_sum # initialization of coef = (intercept_, coef) - # Note: Since phi=self.dispersion_ does not enter the estimation - # of mu_i=E[y_i], set it to 1. + # Note: The dispersion parameter phi does not enter the estimation + # of mu_i=E[y_i]. if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: @@ -363,10 +353,6 @@ def func(coef, X, y, weights, alpha, family, link): self.intercept_ = 0. 
self.coef_ = coef - if self.fit_dispersion in ['chisqr', 'deviance']: - # attention because of rescaling of weights - self.dispersion_ = self.estimate_phi(X, y, weights)*weights_sum - return self def _linear_predictor(self, X): @@ -406,51 +392,6 @@ def predict(self, X): mu = self._link_instance.inverse(eta) return mu - def estimate_phi(self, X, y, sample_weight=None): - """Estimate/fit the dispersion parameter phi. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Training data. - - y : array-like, shape (n_samples,) - Target values. - - sample_weight : {None, array-like}, shape (n_samples,), optional \ - (default=None) - Sample weights. - - Returns - ------- - phi : float - Dispersion parameter. - """ - check_is_fitted(self, "coef_") - _dtype = [np.float64, np.float32] - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], - dtype=_dtype, y_numeric=True, multi_output=False) - n_samples, n_features = X.shape - weights = _check_sample_weight(sample_weight, X) - eta = X @ self.coef_ - if self.fit_intercept is True: - eta += self.intercept_ - n_features += 1 - if n_samples <= n_features: - raise ValueError("Estimation of dispersion parameter phi requires" - " more samples than features, got" - " samples=X.shape[0]={0} and" - " n_features=X.shape[1]+fit_intercept={1}." - .format(n_samples, n_features)) - mu = self._link_instance.inverse(eta) - if self.fit_dispersion == 'chisqr': - chisq = np.sum(weights*(y-mu)**2 / - self._family_instance.unit_variance(mu)) - return chisq/(n_samples - n_features) - elif self.fit_dispersion == 'deviance': - dev = self._family_instance.deviance(y, mu, weights) - return dev/(n_samples - n_features) - def score(self, X, y, sample_weight=None): """Compute D^2, the percentage of deviance explained. @@ -525,11 +466,6 @@ class PoissonRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) - Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statistic or the deviance statistic. If None, the - dispersion is not estimated. - solver : {'lbfgs'}, optional (default='lbfgs') Algorithm to use in the optimization problem: @@ -564,9 +500,6 @@ class PoissonRegressor(GeneralizedLinearRegressor): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. - dispersion_ : float - The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. - n_iter_ : int Actual number of iterations used in solver. @@ -605,14 +538,13 @@ class PoissonRegressor(GeneralizedLinearRegressor): `_ """ def __init__(self, alpha=1.0, fit_intercept=True, link='log', - fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, check_input=True, verbose=0): + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link=link, - fit_dispersion=fit_dispersion, solver=solver, - max_iter=max_iter, tol=tol, warm_start=warm_start, - copy_X=copy_X, verbose=verbose) + solver=solver, max_iter=max_iter, tol=tol, + warm_start=warm_start, copy_X=copy_X, verbose=verbose) @property def family(self): @@ -652,11 +584,6 @@ class GammaRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). 
- fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) - Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statistic or the deviance statistic. If None, the - dispersion is not estimated. - solver : {'lbfgs'}, optional (default='lbfgs') Algorithm to use in the optimization problem: @@ -691,9 +618,6 @@ class GammaRegressor(GeneralizedLinearRegressor): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. - dispersion_ : float - The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. - n_iter_ : int Actual number of iterations used in solver. @@ -721,14 +645,13 @@ class GammaRegressor(GeneralizedLinearRegressor): `_ """ def __init__(self, alpha=1.0, fit_intercept=True, link='log', - fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, check_input=True, verbose=0): + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="gamma", link=link, - fit_dispersion=fit_dispersion, solver=solver, - max_iter=max_iter, tol=tol, warm_start=warm_start, - copy_X=copy_X, verbose=verbose) + solver=solver, max_iter=max_iter, tol=tol, + warm_start=warm_start, copy_X=copy_X, verbose=verbose) @property def family(self): @@ -783,11 +706,6 @@ class TweedieRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) - Method for estimation of the dispersion parameter phi. Whether to use - the chi squared statistic or the deviance statistic. If None, the - dispersion is not estimated. - solver : {'lbfgs'}, optional (default='lbfgs') Algorithm to use in the optimization problem: @@ -822,9 +740,6 @@ class TweedieRegressor(GeneralizedLinearRegressor): intercept_ : float Intercept (a.k.a. bias) added to linear predictor. - dispersion_ : float - The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. - n_iter_ : int Actual number of iterations used in solver. 
@@ -852,14 +767,13 @@ class TweedieRegressor(GeneralizedLinearRegressor): `_ """ def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='log', - fit_dispersion=None, solver='lbfgs', max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, check_input=True, verbose=0): + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, - fit_dispersion=fit_dispersion, solver=solver, - max_iter=max_iter, tol=tol, warm_start=warm_start, - copy_X=copy_X, verbose=verbose) + solver=solver, max_iter=max_iter, tol=tol, + warm_start=warm_start, copy_X=copy_X, verbose=verbose) @property def family(self): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 12edbefbf2833..5bd80cbf76fcf 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -332,24 +332,6 @@ def test_solver_equivalence(params, regression_data): ) -def test_fit_dispersion(regression_data): - X, y = regression_data - - est1 = GeneralizedLinearRegressor() - est1.fit(X, y) - assert not hasattr(est1, "dispersion_") - - est2 = GeneralizedLinearRegressor(fit_dispersion="chisqr") - est2.fit(X, y) - assert isinstance(est2.dispersion_, float) - - est3 = GeneralizedLinearRegressor(fit_dispersion="deviance") - est3.fit(X, y) - assert isinstance(est3.dispersion_, float) - - assert_allclose(est2.dispersion_, est3.dispersion_) - - @pytest.mark.parametrize("solver", GLM_SOLVERS) def test_convergence_warning(solver, regression_data): X, y = regression_data From ebbbe9cd1c670e15aca835b92730f570c49e6e4c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 13 Aug 2019 08:44:50 +0200 Subject: [PATCH 121/269] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 09b657c26b915..f352860b6826f 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1003,7 +1003,7 @@ In the unpenalized case, the assumptions are the following: :math:`\phi` and sample weights :math:`s_i`. * The aim is to predict the expectation :math:`\mu_i` with :math:`\hat{y}_i = h(\eta_i)`, linear predictor - :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`. + :math:`\eta_i=(Xw)_i` and inverse link function :math:`h`. 
Note that the first assumption implies :math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance From 918e2574ad99724d4d4a437af5283f32442d5752 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 13 Aug 2019 08:53:57 +0200 Subject: [PATCH 122/269] Update doc/modules/linear_model.rst --- doc/modules/linear_model.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 09b657c26b915..32b7494ba7c00 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -927,6 +927,9 @@ Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` ================= =============================== ====================================== ============================================ +Usage +----- + In the following use cases, a loss different from the squared loss might be appropriate, @@ -944,7 +947,7 @@ Since the linear predictor :math:`Xw` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link :math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the -log-link with :math:`h(x^\top w)=\exp(x^\top w)`. +log-link `link='log'` with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned @@ -1018,7 +1021,9 @@ A few remarks: * The minimization is equivalent to (penalized) maximum likelihood estimation. * The deviances for at least Normal, Poisson and Gamma distributions are strictly consistent scoring functions for the mean :math:`\mu`, see Eq. - (19)-(20) in [12]_. + (19)-(20) in [12]_. This means that, given an appropriate feature matrix `X`, + you get good (asymptotic) estimators for the expectation when using these + deviances. .. topic:: References: From 37d0f47bac45c27d01f949a835ecddad471b8d42 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:14:20 +0300 Subject: [PATCH 123/269] Use double `` when necessary --- doc/modules/linear_model.rst | 10 +++++----- sklearn/linear_model/_glm/glm.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d0e3f9542a641..7bb684e8bcf87 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -951,22 +951,22 @@ log-link `link='log'` with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned -distributions using the appropriate power parameter `p`, i.e. the exponent of +distributions using the appropriate power parameter ``p``, i.e. the exponent of the unit variance function, - - `p = 0`: Normal distribution. Specialized solvers such as + - ``p = 0``: Normal distribution. Specialized solvers such as :class:`Ridge`, :class:`ElasticNet` are generally more appropriate in this case. - - `p = 1`: Poisson distribution. :class:`PoissonRegressor` is exposed for + - ``p = 1``: Poisson distribution. :class:`PoissonRegressor` is exposed for convenience. However, it is strictly equivalent to `TweedieRegressor(power=1)`. - - `p = 2`: Gamma distribution. :class:`GammaRegressor` is exposed for + - ``p = 2``: Gamma distribution. :class:`GammaRegressor` is exposed for convenience. However, it is strictly equivalent to `TweedieRegressor(power=2)`. 
- - `p = 3`: Inverse Gamma distribution. + - ``p = 3``: Inverse Gamma distribution. Note: diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index cf9b00527c8cf..baed5fecb2b97 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -397,7 +397,7 @@ def score(self, X, y, sample_weight=None): D^2 is a generalization of the coefficient of determination R^2. R^2 uses squared error and D^2 deviance. Note that those two are equal - for family='normal'. + for ``family='normal'``. D^2 is defined as :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, From 9c337f25ea389c15e32ce943292fbe7b953a95b3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:16:59 +0300 Subject: [PATCH 124/269] ax -> axes in plot_poisson_regression_non_normal_loss.py --- .../plot_poisson_regression_non_normal_loss.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 6cee852866018..5c0b64faea255 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -228,19 +228,19 @@ def load_mtpl2(n_samples=100000): # histogram of observed target values with that of predicted values, -fig, ax = plt.subplots(1, 4, figsize=(16, 3)) +fig, axes = plt.subplots(1, 4, figsize=(16, 3)) -df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=ax[0]) +df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=axes[0]) -ax[0].set_title('Experimental data') +axes[0].set_title('Experimental data') for idx, model in enumerate([linregr, glm_freq, gbr]): y_pred = model.predict(X_train) - pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=ax[idx+1]) - ax[idx + 1].set_title(model.__class__.__name__) + pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=axes[idx+1]) + axes[idx + 1].set_title(model.__class__.__name__) -for axi in ax: +for axi in axes: axi.set( yscale='log', xlabel="y (Frequency)" From 5e05935caf29ff6bffb25ab5c83a1e4c855b83b0 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:21:13 +0300 Subject: [PATCH 125/269] Update sklearn/linear_model/_glm/distribution.py Co-Authored-By: Nicolas Hug --- sklearn/linear_model/_glm/distribution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 950fa3fbb03e7..b1845eb13e921 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -206,7 +206,7 @@ def _mu_deviance_derivative(self, coef, X, y, weights, link): if coef.size == X.shape[1] + 1: devp = np.concatenate(([temp.sum()], temp @ X)) else: - devp = temp @ X # sampe as X.T @ temp + devp = temp @ X # same as X.T @ temp return mu, devp From 4a6821393715e42150fb8673f3d45840002d0114 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:26:47 +0300 Subject: [PATCH 126/269] Remove solver=auto --- sklearn/linear_model/_glm/glm.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index baed5fecb2b97..4786d231336b6 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -73,19 +73,12 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - 'log' for families 'poisson', 'gamma', 
'inverse-gaussian' - solver : {'auto', 'lbfgs'}, optional (default='auto') + solver : 'lbfgs', optional (default='lbfgs') Algorithm to use in the optimization problem: - 'auto' - Sets 'lbfgs' - 'lbfgs' Calls scipy's L-BFGS-B optimizer. - - Note that all solvers except lbfgs use the fisher matrix, i.e. the - expected Hessian instead of the Hessian matrix. - max_iter : int, optional (default=100) The maximal number of iterations for solver algorithms. @@ -161,7 +154,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', - solver='auto', max_iter=100, tol=1e-4, warm_start=False, + solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): self.alpha = alpha self.fit_intercept = fit_intercept @@ -243,14 +236,11 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.fit_intercept, bool): raise ValueError("The argument fit_intercept must be bool;" " got {0}".format(self.fit_intercept)) - if self.solver not in ['auto', 'lbfgs']: + if self.solver not in ['lbfgs']: raise ValueError("GeneralizedLinearRegressor supports only solvers" - "'auto', 'lbfgs';" - " got {0}".format(self.solver)) + "'lbfgs'; got {0}".format(self.solver)) solver = self.solver - if self.solver == 'auto': - solver = 'lbfgs' - if (not isinstance(self.max_iter, int) + if (not isinstance(self.max_iter, numbers.Integral) or self.max_iter <= 0): raise ValueError("Maximum number of iteration must be a positive " "integer;" From 8ee5c85a0140175e354ca9f8dd4db08f23606b9f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:38:56 +0300 Subject: [PATCH 127/269] Update sklearn/linear_model/_glm/glm.py Co-Authored-By: Nicolas Hug --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index baed5fecb2b97..f770337e40e62 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -384,7 +384,7 @@ def predict(self, X): Returns ------- - C : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Returns predicted values. 
""" # check_array is done in _linear_predictor From a1f8aabb6042aac39929a65faaac78e3b6b68e32 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:39:53 +0300 Subject: [PATCH 128/269] More review comments --- sklearn/linear_model/_glm/glm.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index f81408a83a5cf..25f018c3e0eb4 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -261,16 +261,14 @@ def fit(self, X, y, sample_weight=None): family = self._family_instance link = self._link_instance - _dtype = [np.float64, np.float32] - _stype = ['csc', 'csr'] - X, y = check_X_y(X, y, accept_sparse=_stype, - dtype=_dtype, y_numeric=True, multi_output=False, - copy=self.copy_X) + X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], + dtype=[np.float64, np.float32], + y_numeric=True, multi_output=False, copy=self.copy_X) y = np.asarray(y, dtype=np.float64) weights = _check_sample_weight(sample_weight, X) - n_samples, n_features = X.shape + _, n_features = X.shape if self.check_input: if not np.all(family.in_y_range(y)): @@ -287,7 +285,7 @@ def fit(self, X, y, sample_weight=None): # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L2 with deviance=sum(weights * unit_deviance) weights_sum = np.sum(weights) - weights = weights/weights_sum + weights = weights / weights_sum # initialization of coef = (intercept_, coef) # Note: The dispersion parameter phi does not enter the estimation @@ -355,7 +353,7 @@ def _linear_predictor(self, X): Returns ------- - C : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Returns predicted values of linear predictor. """ check_is_fitted(self, "coef_") From c0999ead8e8e834456f4a0f61ec77cec94790f4d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:44:36 +0300 Subject: [PATCH 129/269] Addressing reviews in tests --- sklearn/linear_model/_glm/glm.py | 4 ---- sklearn/linear_model/_glm/tests/test_glm.py | 16 ++++++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 25f018c3e0eb4..c1f7aecd32e4f 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -287,10 +287,6 @@ def fit(self, X, y, sample_weight=None): weights_sum = np.sum(weights) weights = weights / weights_sum - # initialization of coef = (intercept_, coef) - # Note: The dispersion parameter phi does not enter the estimation - # of mu_i=E[y_i]. 
- if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: coef = np.concatenate((np.array([self.intercept_]), diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 5bd80cbf76fcf..c4e8c883ff379 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -61,17 +61,17 @@ def test_sample_weights_validation(): glm.fit(X, y, weights) -@pytest.mark.parametrize('f, fam', +@pytest.mark.parametrize('name, instance', [('normal', NormalDistribution()), ('poisson', PoissonDistribution()), ('gamma', GammaDistribution()), ('inverse-gaussian', InverseGaussianDistribution())]) -def test_glm_family_argument(f, fam): +def test_glm_family_argument(name, instance): """Test GLM family argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) - assert isinstance(glm._family_instance, fam.__class__) + glm = GeneralizedLinearRegressor(family=name, alpha=0).fit(X, y) + assert isinstance(glm._family_instance, instance.__class__) glm = GeneralizedLinearRegressor(family='not a family', fit_intercept=False) @@ -79,15 +79,15 @@ def test_glm_family_argument(f, fam): glm.fit(X, y) -@pytest.mark.parametrize('l, link', +@pytest.mark.parametrize('name, instance', [('identity', IdentityLink()), ('log', LogLink())]) -def test_glm_link_argument(l, link): +def test_glm_link_argument(name, instance): """Test GLM link argument set as string.""" y = np.array([0.1, 0.5]) # in range of all distributions X = np.array([[1], [2]]) - glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) - assert isinstance(glm._link_instance, link.__class__) + glm = GeneralizedLinearRegressor(family='normal', link=name).fit(X, y) + assert isinstance(glm._link_instance, instance.__class__) glm = GeneralizedLinearRegressor(family='normal', link='not a link') with pytest.raises(ValueError, match="link must be"): From e09e3368d5e3c173569dba1669cbf97334d20db0 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 17 Aug 2019 00:50:37 +0300 Subject: [PATCH 130/269] More comments in tests --- sklearn/linear_model/_glm/tests/test_glm.py | 52 ++++----------------- 1 file changed, 9 insertions(+), 43 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index c4e8c883ff379..ebcab6395e5b4 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -23,7 +23,6 @@ GammaDistribution, InverseGaussianDistribution, ) from sklearn.linear_model import Ridge -from sklearn.metrics import mean_absolute_error from sklearn.exceptions import ConvergenceWarning GLM_SOLVERS = ['lbfgs'] @@ -183,8 +182,7 @@ def test_glm_identity_regression(solver): X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', - fit_intercept=False, solver=solver, - tol=1e-7) + fit_intercept=False, solver=solver) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=1e-6) @@ -242,18 +240,13 @@ def test_warm_start(fit_intercept): @pytest.mark.parametrize('fit_intercept', [True, False]) @pytest.mark.parametrize('solver', GLM_SOLVERS) def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): - """Test ridge regression for Normal distributions. - - Case n_samples >> n_features - - Compare to test_ridge in test_ridge.py. 
- """ + """Compare with Ridge regression for Normal distributions.""" alpha = 1.0 n_predict = 10 - X, y, coef = make_regression(n_samples=n_samples+n_predict, - n_features=n_features, - n_informative=n_features-2, noise=0.5, - coef=True, random_state=42) + X, y, _ = make_regression(n_samples=n_samples+n_predict, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + coef=True, random_state=42) y = y[0:n_samples] X, T = X[0:n_samples], X[n_samples:] @@ -279,10 +272,9 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): @pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-7)]) -def test_poisson_ridge(solver, tol): - """Test ridge regression with poisson family and LogLink. - - Compare to R's glmnet""" +def test_poisson_glmnet(solver, tol): + """Compare Poisson regression with L2 regularization and LogLink to glmnet + """ # library("glmnet") # options(digits=10) # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) @@ -306,32 +298,6 @@ def test_poisson_ridge(solver, tol): assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5) -@pytest.mark.parametrize( - "params", - [ - {"solver": "lbfgs"}, - ], - ids=lambda params: ', '.join("%s=%s" % (key, val) - for key, val in params.items()) -) -def test_solver_equivalence(params, regression_data): - X, y = regression_data - est_ref = GeneralizedLinearRegressor() - est_ref.fit(X, y) - - estimator = GeneralizedLinearRegressor(**params) - - estimator.fit(X, y) - - assert_allclose(estimator.intercept_, est_ref.intercept_, rtol=1e-4) - assert_allclose(estimator.coef_, est_ref.coef_, rtol=1e-4) - assert_allclose( - mean_absolute_error(estimator.predict(X), y), - mean_absolute_error(est_ref.predict(X), y), - rtol=1e-4 - ) - - @pytest.mark.parametrize("solver", GLM_SOLVERS) def test_convergence_warning(solver, regression_data): X, y = regression_data From 6601d3049d3637c1823ff709d5416b306386d46a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 15:49:25 +0200 Subject: [PATCH 131/269] Update linear_model.rst --- doc/modules/linear_model.rst | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 7bb684e8bcf87..8b3764e7cefd0 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -914,8 +914,8 @@ likelihood as \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right) -The following table lists some specific EDM distributions—all are Tweedie -distributions—and some properties. +The following table lists some specific EDM distributions—all are Tweedie +distributions—and some of their properties. ================= =============================== ====================================== ============================================ Distribution Target Domain Unit Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` @@ -943,7 +943,7 @@ appropriate, you might try an Inverse Gaussian deviance (or even higher variance powers of the Tweedie family). -Since the linear predictor :math:`Xw` can be negative and +Since the linear predictor :math:`x^\top w` can be negative and Poisson, Gamma and Inverse Gaussian distributions don't support negative values, it is convenient to apply a link function different from the identity link :math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the @@ -951,32 +951,33 @@ log-link `link='log'` with :math:`h(x^\top w)=\exp(x^\top w)`. 
:class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned -distributions using the appropriate power parameter ``p``, i.e. the exponent of -the unit variance function, +distributions using the appropriate ``power`` parameter, i.e. the exponent +of the unit variance function, - - ``p = 0``: Normal distribution. Specialized solvers such as + - ``power = 0``: Normal distribution. Specialized solvers such as :class:`Ridge`, :class:`ElasticNet` are generally more appropriate in this case. - - ``p = 1``: Poisson distribution. :class:`PoissonRegressor` is exposed for + - ``power = 1``: Poisson distribution. :class:`PoissonRegressor` is exposed for convenience. However, it is strictly equivalent to `TweedieRegressor(power=1)`. - - ``p = 2``: Gamma distribution. :class:`GammaRegressor` is exposed for + - ``power = 2``: Gamma distribution. :class:`GammaRegressor` is exposed for convenience. However, it is strictly equivalent to `TweedieRegressor(power=2)`. - - ``p = 3``: Inverse Gamma distribution. + - ``power = 3``: Inverse Gamma distribution. -Note: -* The feature matrix `X` should be standardized before fitting. This - ensures that the penalty treats features equally. -* If you want to model a relative frequency, i.e. counts per exposure (time, - volume, ...) you can do so by a Poisson distribution and passing - :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together - with :math:`s=\mathrm{exposure}` as sample weights. This is done in both - examples linked below. +.. note:: + + * The feature matrix `X` should be standardized before fitting. This + ensures that the penalty treats features equally. + * If you want to model a relative frequency, i.e. counts per exposure (time, + volume, ...) you can do so by a Poisson distribution and passing + :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values + together with :math:`s=\mathrm{exposure}` as sample weights. This is done + in both examples linked below. The estimator can be used as follows:: From 5174dae1cbc7ea9442243f15f3887f8252520f46 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 18:55:24 +0200 Subject: [PATCH 132/269] Address check_is_fitted deprication of attributes --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index c1f7aecd32e4f..13ca3673c8b3d 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -352,7 +352,7 @@ def _linear_predictor(self, X): y_pred : array, shape (n_samples,) Returns predicted values of linear predictor. 
""" - check_is_fitted(self, "coef_") + check_is_fitted(self) X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype='numeric', ensure_2d=True, allow_nd=False) From 61dc13fdd5b71fe0e1d30e5a9b58934441a9dcb1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 20:18:21 +0200 Subject: [PATCH 133/269] No LaTeX in docstrings --- sklearn/linear_model/_glm/distribution.py | 89 ++++++++++------------- sklearn/linear_model/_glm/glm.py | 66 ++++++++--------- 2 files changed, 70 insertions(+), 85 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index b1845eb13e921..1baf99a465326 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -26,19 +26,15 @@ def _safe_lin_pred(X, coef): class ExponentialDispersionModel(metaclass=ABCMeta): - r"""Base class for reproductive Exponential Dispersion Models (EDM). + """Base class for reproductive Exponential Dispersion Models (EDM). - The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + The pdf of Y∼EDM(μ, φ) is given by:: - .. math:: p(y| \theta, \phi) = c(y, \phi) - \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) - = \tilde{c}(y, \phi) - \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + p(y| θ, φ) = c1(y, φ) * exp((θy-A(θ))/φ) + = c2(y, φ) * exp(-d(y, μ)/(2φ)) - with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, - variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, - unit variance :math:`v(\mu)` and - unit deviance :math:`d(y,\mu)`. + with mean E[Y] = A'(θ) = μ, variance Var[Y] = φ * v(μ), + unit variance v(μ), unit deviance d(y,μ) and dispersion parameter φ. Methods ------- @@ -56,7 +52,7 @@ class ExponentialDispersionModel(metaclass=ABCMeta): """ def in_y_range(self, y): - """Returns ``True`` if y is in the valid range of Y~EDM. + """Returns ``True`` if y is in the valid range of Y∼EDM. Parameters ---------- @@ -80,17 +76,13 @@ def in_y_range(self, y): @abstractmethod def unit_variance(self, mu): - r"""Compute the unit variance function. + """Compute the unit variance function. - The unit variance :math:`v(\mu)` determines the variance as - a function of the mean :math:`\mu` by - :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. - It can also be derived from the unit deviance :math:`d(y,\mu)` as + The unit variance v(μ) determines the variance as a function of the + mean μ by Var[Y_i] = φ/s_i * v(μ_i). + It can also be derived from the unit deviance d(y,μ) as:: - .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ - \partial\mu^2}}\big|_{y=\mu} - - See also :func:`variance`. + v(μ) = 2/(∂^2 d(y,μ)/(∂ μ^2))|_{y=μ} Parameters ---------- @@ -101,9 +93,9 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - r"""Compute the derivative of the unit variance w.r.t. mu. + """Compute the derivative of the unit variance w.r.t. mu. - Return :math:`v'(\mu)`. + Return v'(μ). Parameters ---------- @@ -114,12 +106,11 @@ def unit_variance_derivative(self, mu): @abstractmethod def unit_deviance(self, y, mu, check_input=False): - r"""Compute the unit deviance. + """Compute the unit deviance. 
+ + The unit_deviance d(y,μ) can be defined by the log-likelihood as:: - The unit_deviance :math:`d(y,\mu)` can be defined by the - log-likelihood as - :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + d(y,μ) = -2φ * (loglike(y,μ,φ) - loglike(y,y,φ)) Parameters ---------- @@ -140,11 +131,10 @@ def unit_deviance(self, y, mu, check_input=False): pass # pragma: no cover def unit_deviance_derivative(self, y, mu): - r"""Compute the derivative of the unit deviance w.r.t. mu. + """Compute the derivative of the unit deviance w.r.t. mu. The derivative of the unit deviance is given by - :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` - with unit variance :math:`v(\mu)`. + ∂ d(y,μ)/(∂ μ) = -2(y-μ)/v(μ) with unit variance v(μ). Parameters ---------- @@ -157,14 +147,13 @@ def unit_deviance_derivative(self, y, mu): return -2 * (y - mu) / self.unit_variance(mu) def deviance(self, y, mu, weights=1): - r"""Compute the deviance. + """Compute the deviance. The deviance is a weighted sum of the per sample unit deviances, - :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` - with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. - In terms of the log-likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,\mu,\frac{phi}{s}) - - loglike(y,y,\frac{phi}{s})\right)`. + D = sum_i s_i * d(y_i,μ_i) + with weights s_i and unit deviance d(y,μ). + In terms of the log-likelihood it is + D = -2φ * (loglike(y,μ,φ/s) - loglike(y,y,φ/s)). Parameters ---------- @@ -182,7 +171,7 @@ def deviance(self, y, mu, weights=1): def deviance_derivative(self, y, mu, weights=1): """Compute the derivative of the deviance w.r.t. mu. - It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. + It gives ∂ D(y, μ; weights)/(∂ μ). Parameters ---------- @@ -211,11 +200,10 @@ def _mu_deviance_derivative(self, coef, X, y, weights, link): class TweedieDistribution(ExponentialDispersionModel): - r"""A class for the Tweedie distribution. + """A class for the Tweedie distribution. - A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely - defined by it's mean-variance relationship - :math:`\mathrm{Var}[Y] \propto \mu^power`. + A Tweedie distribution with mean μ=E[Y] is uniquely defined by it's + mean-variance relationship Var[Y] ∝ μ^power. Special cases are: @@ -231,8 +219,7 @@ class TweedieDistribution(ExponentialDispersionModel): Parameters ---------- power : float (default=0) - The variance power of the `unit_variance` - :math:`v(\mu) = \mu^{power}`. + The variance power of the unit variance v(μ) = μ^power. For ``0=1.') + raise ValueError('Tweedie distribution is only defined for ' + 'power<=0 and p>=1.') elif 1 <= power < 2: # Poisson or Compound Poisson distribution self._lower_bound = DistributionBoundary(0, inclusive=True) @@ -279,7 +266,7 @@ def unit_variance(self, mu): def unit_variance_derivative(self, mu): """Compute the derivative of the unit variance of a Tweedie - distribution v(mu)=power*mu**(power-1). + distribution v(mu)=power * mu**(power-1). Parameters ---------- @@ -289,12 +276,10 @@ def unit_variance_derivative(self, mu): return self.power * np.power(mu, self.power - 1) def unit_deviance(self, y, mu, check_input=False): - r"""Compute the unit deviance. + """Compute the unit deviance. 
- The unit_deviance :math:`d(y,\mu)` can be defined by the - log-likelihood as - :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + The unit deviance d(y,μ) can be defined by the log-likelihood as + d(y,μ) = -2φ * (loglike(y,μ,φ) - loglike(y,y,φ)). Parameters ---------- @@ -328,7 +313,7 @@ def unit_deviance(self, y, mu, check_input=False): raise ValueError("Tweedie deviance is only defined for p<=0 " "and p>=1.") elif 1 <= p < 2: - # Poisson and Compount poisson distribution, y >= 0, mu > 0 + # Poisson and Compound poisson distribution, y >= 0, mu > 0 if (y < 0).any() or (mu <= 0).any(): raise ValueError(message + "non-negative y and strictly " "positive mu.") diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 13ca3673c8b3d..fb653b5e4361f 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -31,7 +31,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). Therefore, + fitting and predicting the mean of the target y as μ=h(X*w). Therefore, the fit minimizes the following objective function with L2 priors as regularizer:: @@ -118,16 +118,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and + Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of + and given by the specific EDM, see :ref:`User Guide `. - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + The parameters w (``coef_`` and ``intercept_``) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. - For alpha > 0, the feature matrix X should be standardized in order to + For ``alpha > 0``, the feature matrix X should be standardized in order to penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. @@ -138,7 +138,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + Consider ȳ = (sum_i s_i y_i)(sum_i s_i), in this case one might say that y has a 'scaled' Poisson distributions. The same holds for other distributions. @@ -183,9 +183,9 @@ def fit(self, X, y, sample_weight=None): optional (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has - Var[Y_i]=phi/w_i * v(mu). - If Y_i ~ EDM(mu, phi/w_i), then - sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + Var[Y_i]=φ/w_i * v(mu). + If Y_i ~ EDM(mu, φ/w_i), then + sum(w*Y)/sum(w) ~ EDM(mu, φ/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. Returns @@ -218,7 +218,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("No default link known for the " "specified distribution family. 
Please " "set link manually, i.e. not to 'auto'; " - "got (link='auto', family={}" + "got (link='auto', family={})" .format(self.family)) elif self.link == 'identity': self._link_instance = IdentityLink() @@ -383,11 +383,10 @@ def score(self, X, y, sample_weight=None): R^2 uses squared error and D^2 deviance. Note that those two are equal for ``family='normal'``. - D^2 is defined as - :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, - :math:`D_{null}` is the null deviance, i.e. the deviance of a model - with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. - The mean :math:`\\bar{y}` is averaged by sample_weight. + D^2 is defined as D^2 = 1 - D(y_true,y_pred) / D_null, + D_null is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to y_pred = ȳ. + The mean ȳ is averaged by sample_weight. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). @@ -490,17 +489,18 @@ class PoissonRegressor(GeneralizedLinearRegressor): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and + Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of + and given by the specific EDM, see :ref:`User Guide `. - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + The parameters w (``coef_`` and ``intercept_``) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. - For alpha > 0, the feature matrix X should be standardized in order to - penalize features equally strong. + For ``alpha > 0``, the feature matrix X should be standardized in order to + penalize features equally strong. Call + :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. If the target y is a ratio, appropriate sample weights s should be provided. @@ -508,7 +508,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + Consider ȳ = (sum_i s_i y_i)(sum_i s_i), in this case one might say that y has a 'scaled' Poisson distributions. References @@ -608,12 +608,12 @@ class GammaRegressor(GeneralizedLinearRegressor): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and + Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of + and given by the specific EDM, see :ref:`User Guide `. - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + The parameters w (``coef_`` and ``intercept_``) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. 
@@ -664,7 +664,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: :math:`v(\\mu) = \\mu^{power}`. + The variance power: v(μ) = μ^{power}. For ``0`. - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + The parameters w (``coef_`` and ``intercept_``) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. From 44524cafc0aac0289267f1099b2ec2f8d53fe6c7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 20:28:54 +0200 Subject: [PATCH 134/269] Replace Tweedie p->power --- sklearn/linear_model/_glm/distribution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 1baf99a465326..e82f787d7e710 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -241,7 +241,7 @@ def power(self, power): self._lower_bound = DistributionBoundary(-np.Inf, inclusive=False) elif 0 < power < 1: raise ValueError('Tweedie distribution is only defined for ' - 'power<=0 and p>=1.') + 'power<=0 and power>=1.') elif 1 <= power < 2: # Poisson or Compound Poisson distribution self._lower_bound = DistributionBoundary(0, inclusive=True) From 58d240973a175ee705d4e8cfbd4e9b6495eca609 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 17 Aug 2019 20:34:11 +0200 Subject: [PATCH 135/269] Replace Tweedie p->power --- sklearn/linear_model/_glm/distribution.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index e82f787d7e710..e0b3511734cee 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -300,7 +300,7 @@ def unit_deviance(self, y, mu, check_input=False): p = self.power if check_input: - message = ("Mean Tweedie deviance error with p={} can only be " + message = ("Mean Tweedie deviance error with power={} can only be " "used on ".format(p)) if p < 0: # 'Extreme stable', y any realy number, mu > 0 @@ -310,8 +310,8 @@ def unit_deviance(self, y, mu, check_input=False): # Normal, y and mu can be any real number pass elif 0 < p < 1: - raise ValueError("Tweedie deviance is only defined for p<=0 " - "and p>=1.") + raise ValueError("Tweedie deviance is only defined for " + "power<=0 and power>=1.") elif 1 <= p < 2: # Poisson and Compound poisson distribution, y >= 0, mu > 0 if (y < 0).any() or (mu <= 0).any(): @@ -335,8 +335,8 @@ def unit_deviance(self, y, mu, check_input=False): # Normal distribution, y and mu any real number dev = (y - mu)**2 elif p < 1: - raise ValueError("Tweedie deviance is only defined for p<=0 and " - "p>=1.") + raise ValueError("Tweedie deviance is only defined for power<=0 " + "and power>=1.") elif p == 1: # Poisson distribution dev = 2 * (xlogy(y, y/mu) - y + mu) From ee351e1f7137bc225d8f964558009612303b37db Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 00:11:57 +0200 Subject: [PATCH 136/269] Fix tests due to Tweedie p->power --- sklearn/linear_model/_glm/tests/test_distribution.py | 2 +- sklearn/metrics/tests/test_regression.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/linear_model/_glm/tests/test_distribution.py index 82e493b7a2149..ed81c8328c87a 100644 --- 
a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/linear_model/_glm/tests/test_distribution.py @@ -45,7 +45,7 @@ def test_invalid_distribution_bound(): def test_tweedie_distribution_power(): - msg = "distribution is only defined for p<=0 and p>=1" + msg = "distribution is only defined for power<=0 and power>=1" with pytest.raises(ValueError, match=msg): TweedieDistribution(power=0.5) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 8e18321f80303..8febf9064e58e 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -133,7 +133,7 @@ def test_regression_metrics_at_limits(): mean_tweedie_deviance([0.], [0.], p=p) with pytest.raises(ValueError, - match="is only defined for p<=0 and p>=1"): + match="is only defined for power<=0 and power>=1"): mean_tweedie_deviance([0.], [0.], p=0.5) From 33fe9be737001045eb9fad5616f678fb27fb8d79 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 12:09:24 +0200 Subject: [PATCH 137/269] Simplify super(...) --- sklearn/linear_model/_glm/distribution.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index e0b3511734cee..7ce51deb0921d 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -353,25 +353,25 @@ def unit_deviance(self, y, mu, check_input=False): class NormalDistribution(TweedieDistribution): """Class for the Normal (aka Gaussian) distribution""" def __init__(self): - super(NormalDistribution, self).__init__(power=0) + super().__init__(power=0) class PoissonDistribution(TweedieDistribution): """Class for the scaled Poisson distribution""" def __init__(self): - super(PoissonDistribution, self).__init__(power=1) + super().__init__(power=1) class GammaDistribution(TweedieDistribution): """Class for the Gamma distribution""" def __init__(self): - super(GammaDistribution, self).__init__(power=2) + super().__init__(power=2) class InverseGaussianDistribution(TweedieDistribution): """Class for the scaled InverseGaussianDistribution distribution""" def __init__(self): - super(InverseGaussianDistribution, self).__init__(power=3) + super().__init__(power=3) EDM_DISTRIBUTIONS = { From 94272e79d565672416fbc6772aa8c0eb9e7ce519 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 12:26:12 +0200 Subject: [PATCH 138/269] Replace Link.link(..) by __call__(..) --- sklearn/linear_model/_glm/glm.py | 8 ++++---- sklearn/linear_model/_glm/link.py | 8 ++++---- sklearn/linear_model/_glm/tests/test_link.py | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index fb653b5e4361f..4d7a05095cb27 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -49,7 +49,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Constant that multiplies the penalty terms and thus determines the regularization strength. See the notes for the exact mathematical meaning of this - parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this case, the design matrix X must have full column rank (no collinearities). 
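Since ``alpha = 0`` makes the penalty vanish, a GLM with ``family='normal'``
and ``link='identity'`` reduces to ordinary least squares, so its coefficients
should closely match those of ``LinearRegression`` on a full-rank design. A
rough sketch, importing the estimator from the private module of this branch::

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model._glm.glm import GeneralizedLinearRegressor

    X, y = make_regression(n_samples=100, n_features=3, noise=0.5,
                           random_state=0)

    glm = GeneralizedLinearRegressor(alpha=0, family='normal',
                                     link='identity').fit(X, y)
    ols = LinearRegression().fit(X, y)

    # expected to be small (limited only by the lbfgs stopping tolerance)
    print(np.abs(glm.coef_ - ols.coef_).max())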
@@ -296,7 +296,7 @@ def fit(self, X, y, sample_weight=None): else: if self.fit_intercept: coef = np.zeros(n_features+1) - coef[0] = link.link(np.average(y, weights=weights)) + coef[0] = link(np.average(y, weights=weights)) else: coef = np.zeros(n_features) @@ -441,7 +441,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): Constant that multiplies the penalty terms and thus determines the regularization strength. See the notes for the exact mathematical meaning of this - parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this case, the design matrix X must have full column rank (no collinearities). @@ -560,7 +560,7 @@ class GammaRegressor(GeneralizedLinearRegressor): Constant that multiplies the penalty terms and thus determines the regularization strength. See the notes for the exact mathematical meaning of this - parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this case, the design matrix X must have full column rank (no collinearities). diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index ec9a7b7736eb1..1ef485cc85a55 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -15,7 +15,7 @@ class Link(metaclass=ABCMeta): """Abstract base class for Link functions.""" @abstractmethod - def link(self, mu): + def __call__(self, mu): """Compute the link function g(mu). The link function links the mean mu=E[Y] to the so called linear @@ -79,7 +79,7 @@ def inverse_derivative2(self, lin_pred): class IdentityLink(Link): """The identity link function g(x)=x.""" - def link(self, mu): + def __call__(self, mu): return mu def derivative(self, mu): @@ -98,7 +98,7 @@ def inverse_derivative2(self, lin_pred): class LogLink(Link): """The log link function g(x)=log(x).""" - def link(self, mu): + def __call__(self, mu): return np.log(mu) def derivative(self, mu): @@ -117,7 +117,7 @@ def inverse_derivative2(self, lin_pred): class LogitLink(Link): """The logit link function g(x)=logit(x).""" - def link(self, mu): + def __call__(self, mu): return logit(mu) def derivative(self, mu): diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py index 3a2a21c4c04e0..feafb151e14a2 100644 --- a/sklearn/linear_model/_glm/tests/test_link.py +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -25,7 +25,7 @@ def test_link_properties(link): # careful for large x, note expit(36) = 1 # limit max eta to 15 x = x / 100 * 15 - assert_allclose(link.link(link.inverse(x)), x) + assert_allclose(link(link.inverse(x)), x) # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) assert_allclose(link.derivative(link.inverse(x)), 1./link.inverse_derivative(x)) @@ -34,5 +34,5 @@ def test_link_properties(link): link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) # for LogitLink, in the following x should be between 0 and 1. - # assert_almost_equal(link.inverse_derivative(link.link(x)), + # assert_almost_equal(link.inverse_derivative(link(x)), # 1./link.derivative(x), decimal=decimal) From 2457039f82ea308e634e4853484df17029f311ac Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 12:45:35 +0200 Subject: [PATCH 139/269] Replace 1. 
-> 1 --- sklearn/linear_model/_glm/glm.py | 2 +- sklearn/linear_model/_glm/link.py | 8 ++++---- sklearn/linear_model/_glm/tests/test_link.py | 11 ++++++----- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 4d7a05095cb27..b8d87799f8a93 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -415,7 +415,7 @@ def score(self, X, y, sample_weight=None): dev = self._family_instance.deviance(y, mu, weights=weights) y_mean = np.average(y, weights=weights) dev_null = self._family_instance.deviance(y, y_mean, weights=weights) - return 1. - dev / dev_null + return 1 - dev / dev_null def _more_tags(self): return {"requires_positive_y": True} diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index 1ef485cc85a55..04b485c067cca 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -102,7 +102,7 @@ def __call__(self, mu): return np.log(mu) def derivative(self, mu): - return 1./mu + return 1 / mu def inverse(self, lin_pred): return np.exp(lin_pred) @@ -121,15 +121,15 @@ def __call__(self, mu): return logit(mu) def derivative(self, mu): - return 1. / (mu * (1 - mu)) + return 1 / (mu * (1 - mu)) def inverse(self, lin_pred): return expit(lin_pred) def inverse_derivative(self, lin_pred): ep = expit(lin_pred) - return ep * (1. - ep) + return ep * (1 - ep) def inverse_derivative2(self, lin_pred): ep = expit(lin_pred) - return ep * (1. - ep) * (1. - 2 * ep) + return ep * (1 - ep) * (1 - 2 * ep) diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py index feafb151e14a2..a631509baca79 100644 --- a/sklearn/linear_model/_glm/tests/test_link.py +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -19,20 +19,21 @@ def test_link_properties(link): """Test link inverse and derivative.""" rng = np.random.RandomState(42) - x = rng.rand(100)*100 + x = rng.rand(100) * 100 link = link() # instantiate object if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 # limit max eta to 15 x = x / 100 * 15 assert_allclose(link(link.inverse(x)), x) - # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + # if g(h(x)) = x, then g'(h(x)) = 1/h'(x) + # g = link, h = link.inverse assert_allclose(link.derivative(link.inverse(x)), - 1./link.inverse_derivative(x)) + 1 / link.inverse_derivative(x)) assert ( link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) - # for LogitLink, in the following x should be between 0 and 1. + # for LogitLink, in the following x should be between 0 and 1 # assert_almost_equal(link.inverse_derivative(link(x)), - # 1./link.derivative(x), decimal=decimal) + # 1 / link.derivative(x), decimal=decimal) From 6396d2c4495681d7d774f7c7de7054b336ba2709 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 18 Aug 2019 13:46:45 +0200 Subject: [PATCH 140/269] Fix table in TweedieRegressor --- sklearn/linear_model/_glm/glm.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index b8d87799f8a93..f538aa6121747 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -664,19 +664,24 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: v(μ) = μ^{power}. + The variance power: v(μ) = μ^power. 
For ``0 Date: Thu, 22 Aug 2019 12:02:28 +0300 Subject: [PATCH 141/269] Improve docstring in plot_tweedie_regression_insurance_claims.py --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 55a21c8d8723d..24b3afcadf120 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -85,8 +85,8 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, Parameters ---------- - df : DataFrame with at least three columns named feature, weight and - observed + df : DataFrame + input data feature: str a column name of df for the feature to be plotted weight : str From da66fd5dc0fb8fc89825e5cf75057eed70c09aed Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 12:23:40 +0300 Subject: [PATCH 142/269] Use train_test_split in tests --- sklearn/linear_model/_glm/tests/test_glm.py | 36 +++++++++++---------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index ebcab6395e5b4..d6c7c7c3f767e 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -24,6 +24,7 @@ ) from sklearn.linear_model import Ridge from sklearn.exceptions import ConvergenceWarning +from sklearn.model_selection import train_test_split GLM_SOLVERS = ['lbfgs'] @@ -207,9 +208,8 @@ def test_glm_log_regression(family, solver, tol): @pytest.mark.parametrize('fit_intercept', [True, False]) def test_warm_start(fit_intercept): - n_samples, n_features = 100, 10 - n_predict = 10 - X, y, coef = make_regression(n_samples=n_samples+n_predict, + n_samples, n_features = 110, 10 + X, y, coef = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_features-2, noise=0.5, coef=True, random_state=42) @@ -230,7 +230,7 @@ def test_warm_start(fit_intercept): assert glm1.score(X, y) > glm2.score(X, y) glm2.set_params(max_iter=1000) glm2.fit(X, y) - assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-4, atol=1e-5) + assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) # TODO: investigate why this doesn't match # assert glm1.n_iter_ == glm2.n_iter_ + 2 @@ -242,13 +242,14 @@ def test_warm_start(fit_intercept): def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): """Compare with Ridge regression for Normal distributions.""" alpha = 1.0 - n_predict = 10 - X, y, _ = make_regression(n_samples=n_samples+n_predict, - n_features=n_features, - n_informative=n_features-2, noise=0.5, - coef=True, random_state=42) - y = y[0:n_samples] - X, T = X[0:n_samples], X[n_samples:] + test_size = 10 + X, y = make_regression(n_samples=n_samples + test_size, + n_features=n_features, + n_informative=n_features-2, noise=0.5, + random_state=42) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, random_state=0 + ) if n_samples > n_features: ridge_params = {"solver": "svd"} @@ -258,17 +259,18 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, normalize=False, random_state=42, **ridge_params) - ridge.fit(X, y) + ridge.fit(X_train, y_train) 
glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', link='identity', fit_intercept=True, - max_iter=300, solver=solver, tol=1e-6, - check_input=False) - glm.fit(X, y) + solver=solver, check_input=False, + max_iter=300) + glm.fit(X_train, y_train) assert glm.coef_.shape == (X.shape[1], ) - assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6) + assert_allclose(glm.coef_, ridge.coef_, atol=5e-5) assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) - assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5) + assert_allclose(glm.predict(X_train), ridge.predict(X_train), rtol=5e-5) + assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=5e-5) @pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-7)]) From b9bc170b2ef7673d6b40103a6583cd9a56d7c517 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 14:07:32 +0300 Subject: [PATCH 143/269] Fix TODO in test_warm_start --- sklearn/linear_model/_glm/tests/test_glm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index d6c7c7c3f767e..4975d1454b922 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -232,8 +232,7 @@ def test_warm_start(fit_intercept): glm2.fit(X, y) assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) - # TODO: investigate why this doesn't match - # assert glm1.n_iter_ == glm2.n_iter_ + 2 + assert glm1.n_iter_ == glm2.n_iter_ @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) From ab6c5d82ea6149f8acc20834abcc735c630a516c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 14:31:06 +0300 Subject: [PATCH 144/269] Revert "No LaTeX in docstrings" This reverts commit 61dc13fdd5b71fe0e1d30e5a9b58934441a9dcb1. --- sklearn/linear_model/_glm/distribution.py | 85 +++++++++++++---------- sklearn/linear_model/_glm/glm.py | 66 +++++++++--------- 2 files changed, 83 insertions(+), 68 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 7ce51deb0921d..5754cb391ec61 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -26,15 +26,19 @@ def _safe_lin_pred(X, coef): class ExponentialDispersionModel(metaclass=ABCMeta): - """Base class for reproductive Exponential Dispersion Models (EDM). + r"""Base class for reproductive Exponential Dispersion Models (EDM). - The pdf of Y∼EDM(μ, φ) is given by:: + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by - p(y| θ, φ) = c1(y, φ) * exp((θy-A(θ))/φ) - = c2(y, φ) * exp(-d(y, μ)/(2φ)) + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) - with mean E[Y] = A'(θ) = μ, variance Var[Y] = φ * v(μ), - unit variance v(μ), unit deviance d(y,μ) and dispersion parameter φ. + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. Methods ------- @@ -52,7 +56,7 @@ class ExponentialDispersionModel(metaclass=ABCMeta): """ def in_y_range(self, y): - """Returns ``True`` if y is in the valid range of Y∼EDM. + """Returns ``True`` if y is in the valid range of Y~EDM. 
Parameters ---------- @@ -76,13 +80,17 @@ def in_y_range(self, y): @abstractmethod def unit_variance(self, mu): - """Compute the unit variance function. + r"""Compute the unit variance function. - The unit variance v(μ) determines the variance as a function of the - mean μ by Var[Y_i] = φ/s_i * v(μ_i). - It can also be derived from the unit deviance d(y,μ) as:: + The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as - v(μ) = 2/(∂^2 d(y,μ)/(∂ μ^2))|_{y=μ} + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. Parameters ---------- @@ -93,9 +101,9 @@ def unit_variance(self, mu): @abstractmethod def unit_variance_derivative(self, mu): - """Compute the derivative of the unit variance w.r.t. mu. + r"""Compute the derivative of the unit variance w.r.t. mu. - Return v'(μ). + Return :math:`v'(\mu)`. Parameters ---------- @@ -106,11 +114,12 @@ def unit_variance_derivative(self, mu): @abstractmethod def unit_deviance(self, y, mu, check_input=False): - """Compute the unit deviance. - - The unit_deviance d(y,μ) can be defined by the log-likelihood as:: + r"""Compute the unit deviance. - d(y,μ) = -2φ * (loglike(y,μ,φ) - loglike(y,y,φ)) + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` Parameters ---------- @@ -131,10 +140,11 @@ def unit_deviance(self, y, mu, check_input=False): pass # pragma: no cover def unit_deviance_derivative(self, y, mu): - """Compute the derivative of the unit deviance w.r.t. mu. + r"""Compute the derivative of the unit deviance w.r.t. mu. The derivative of the unit deviance is given by - ∂ d(y,μ)/(∂ μ) = -2(y-μ)/v(μ) with unit variance v(μ). + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. Parameters ---------- @@ -147,13 +157,14 @@ def unit_deviance_derivative(self, y, mu): return -2 * (y - mu) / self.unit_variance(mu) def deviance(self, y, mu, weights=1): - """Compute the deviance. + r"""Compute the deviance. The deviance is a weighted sum of the per sample unit deviances, - D = sum_i s_i * d(y_i,μ_i) - with weights s_i and unit deviance d(y,μ). - In terms of the log-likelihood it is - D = -2φ * (loglike(y,μ,φ/s) - loglike(y,y,φ/s)). + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. Parameters ---------- @@ -171,7 +182,7 @@ def deviance(self, y, mu, weights=1): def deviance_derivative(self, y, mu, weights=1): """Compute the derivative of the deviance w.r.t. mu. - It gives ∂ D(y, μ; weights)/(∂ μ). + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. Parameters ---------- @@ -200,10 +211,11 @@ def _mu_deviance_derivative(self, coef, X, y, weights, link): class TweedieDistribution(ExponentialDispersionModel): - """A class for the Tweedie distribution. + r"""A class for the Tweedie distribution. - A Tweedie distribution with mean μ=E[Y] is uniquely defined by it's - mean-variance relationship Var[Y] ∝ μ^power. 
+ A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely + defined by it's mean-variance relationship + :math:`\mathrm{Var}[Y] \propto \mu^power`. Special cases are: @@ -219,7 +231,8 @@ class TweedieDistribution(ExponentialDispersionModel): Parameters ---------- power : float (default=0) - The variance power of the unit variance v(μ) = μ^power. + The variance power of the `unit_variance` + :math:`v(\mu) = \mu^{power}`. For ``0=1.") elif 1 <= p < 2: - # Poisson and Compound poisson distribution, y >= 0, mu > 0 + # Poisson and Compount poisson distribution, y >= 0, mu > 0 if (y < 0).any() or (mu <= 0).any(): raise ValueError(message + "non-negative y and strictly " "positive mu.") diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index f538aa6121747..69e68c1eda0f8 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -31,7 +31,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as μ=h(X*w). Therefore, + fitting and predicting the mean of the target y as mu=h(X*w). Therefore, the fit minimizes the following objective function with L2 priors as regularizer:: @@ -118,16 +118,16 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and - Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of - and given by the specific EDM, see + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see :ref:`User Guide `. - The parameters w (``coef_`` and ``intercept_``) are estimated by + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. - For ``alpha > 0``, the feature matrix X should be standardized in order to + For alpha > 0, the feature matrix X should be standardized in order to penalize features equally strong. Call :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. @@ -138,7 +138,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider ȳ = (sum_i s_i y_i)(sum_i s_i), + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, in this case one might say that y has a 'scaled' Poisson distributions. The same holds for other distributions. @@ -183,9 +183,9 @@ def fit(self, X, y, sample_weight=None): optional (default=None) Individual weights w_i for each sample. Note that for an Exponential Dispersion Model (EDM), one has - Var[Y_i]=φ/w_i * v(mu). - If Y_i ~ EDM(mu, φ/w_i), then - sum(w*Y)/sum(w) ~ EDM(mu, φ/sum(w)), i.e. the mean of y is a + Var[Y_i]=phi/w_i * v(mu). + If Y_i ~ EDM(mu, phi/w_i), then + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. Returns @@ -218,7 +218,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("No default link known for the " "specified distribution family. Please " "set link manually, i.e. 
not to 'auto'; " - "got (link='auto', family={})" + "got (link='auto', family={}" .format(self.family)) elif self.link == 'identity': self._link_instance = IdentityLink() @@ -383,10 +383,11 @@ def score(self, X, y, sample_weight=None): R^2 uses squared error and D^2 deviance. Note that those two are equal for ``family='normal'``. - D^2 is defined as D^2 = 1 - D(y_true,y_pred) / D_null, - D_null is the null deviance, i.e. the deviance of a model - with intercept alone, which corresponds to y_pred = ȳ. - The mean ȳ is averaged by sample_weight. + D^2 is defined as + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). @@ -489,18 +490,17 @@ class PoissonRegressor(GeneralizedLinearRegressor): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and - Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of - and given by the specific EDM, see + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see :ref:`User Guide `. - The parameters w (``coef_`` and ``intercept_``) are estimated by + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. - For ``alpha > 0``, the feature matrix X should be standardized in order to - penalize features equally strong. Call - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. + For alpha > 0, the feature matrix X should be standardized in order to + penalize features equally strong. If the target y is a ratio, appropriate sample weights s should be provided. @@ -508,7 +508,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider ȳ = (sum_i s_i y_i)(sum_i s_i), + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, in this case one might say that y has a 'scaled' Poisson distributions. References @@ -608,12 +608,12 @@ class GammaRegressor(GeneralizedLinearRegressor): Notes ----- The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be E[Y_i]=μ_i=h((Xw)_i) and - Var[Y_i]=φ/s_i * v(μ_i). The unit variance function v(μ_i) is a property of - and given by the specific EDM, see + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see :ref:`User Guide `. - The parameters w (``coef_`` and ``intercept_``) are estimated by + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. @@ -664,7 +664,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: v(μ) = μ^power. 
+ The variance power: :math:`v(\\mu) = \\mu^{power}`. For ``0`. - The parameters w (``coef_`` and ``intercept_``) are estimated by + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by minimizing the deviance plus penalty term, which is equivalent to (penalized) maximum likelihood estimation. From b424a070f696262174959c1927c921d301fd8ba5 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 14:42:39 +0300 Subject: [PATCH 145/269] Remove n_iter_ check when warm start. --- sklearn/linear_model/_glm/glm.py | 2 +- sklearn/linear_model/_glm/tests/test_glm.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 69e68c1eda0f8..38f27eec397e8 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -218,7 +218,7 @@ def fit(self, X, y, sample_weight=None): raise ValueError("No default link known for the " "specified distribution family. Please " "set link manually, i.e. not to 'auto'; " - "got (link='auto', family={}" + "got (link='auto', family={})" .format(self.family)) elif self.link == 'identity': self._link_instance = IdentityLink() diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 4975d1454b922..879d382ce28dc 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -232,7 +232,6 @@ def test_warm_start(fit_intercept): glm2.fit(X, y) assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) - assert glm1.n_iter_ == glm2.n_iter_ @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) From 95a90580c9d034e7bcb9c488cc19d09ac692deca Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 15:02:00 +0300 Subject: [PATCH 146/269] Rename variable L2 -> coef_scaled --- sklearn/linear_model/_glm/glm.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 38f27eec397e8..a7128eb78df07 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -304,15 +304,16 @@ def fit(self, X, y, sample_weight=None): if solver == 'lbfgs': def func(coef, X, y, weights, alpha, family, link): - mu, devp = \ - family._mu_deviance_derivative(coef, X, y, weights, link) + mu, devp = family._mu_deviance_derivative( + coef, X, y, weights, link + ) dev = family.deviance(y, mu, weights) intercept = (coef.size == X.shape[1] + 1) idx = 1 if intercept else 0 # offset if coef[0] is intercept - L2 = alpha * coef[idx:] - obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) + coef_scaled = alpha * coef[idx:] + obj = 0.5 * dev + 0.5 * (coef[idx:] @ coef_scaled) objp = 0.5 * devp - objp[idx:] += L2 + objp[idx:] += coef_scaled return obj, objp args = (X, y, weights, self.alpha, family, link) From 59eceb42d874d94d0bb7fdced4f3e05fd030dc62 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 22 Aug 2019 15:59:54 +0300 Subject: [PATCH 147/269] Minor fixes --- .../plot_tweedie_regression_insurance_claims.py | 3 --- sklearn/linear_model/_glm/distribution.py | 12 ++++++------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 24b3afcadf120..24451069eeb68 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ 
b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -181,9 +181,6 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # events occuring with a constant rate in a given time interval (``Exposure``). # Here we model the frequency ``y = ClaimNb / Exposure``, # which is still a (scaled) Poisson distribution. -# -# A very important property of the Poisson distribution is its mean-variance -# relation: The variance is proportional to the mean. df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 5754cb391ec61..8779cd616cc5a 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -342,9 +342,9 @@ def unit_deviance(self, y, mu, check_input=False): if p < 0: # 'Extreme stable', y any realy number, mu > 0 - dev = 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p) * (2-p)) - - y * np.power(mu, 1-p)/(1-p) + - np.power(mu, 2-p)/(2-p)) + dev = 2 * (np.power(np.maximum(y, 0), 2-p) / ((1-p) * (2-p)) + - y * np.power(mu, 1-p) / (1-p) + + np.power(mu, 2-p) / (2-p)) elif p == 0: # Normal distribution, y and mu any real number @@ -359,9 +359,9 @@ def unit_deviance(self, y, mu, check_input=False): # Gamma distribution dev = 2 * (np.log(mu/y) + y/mu - 1) else: - dev = 2 * (np.power(y, 2-p)/((1-p) * (2-p)) - - y * np.power(mu, 1-p)/(1-p) + - np.power(mu, 2-p)/(2-p)) + dev = 2 * (np.power(y, 2-p) / ((1-p) * (2-p)) + - y * np.power(mu, 1-p) / (1-p) + + np.power(mu, 2-p) / (2-p)) return dev From 04f30f40e572d6e919fef8a4c2d4bcb27d562b6b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 10:13:27 +0300 Subject: [PATCH 148/269] Better wording in example --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 5c0b64faea255..d7f9b65660453 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -5,7 +5,9 @@ This example illustrates the use of linear Poisson regression on the French Motor Third-Party Liability Claims dataset [1] and compares -it with models learned with least squared error. +it with models learned with least squared error. The goal is to predict the +number of insurance claims (or frequency) following car accidents for a user +given historical data over a population of users. We start by defining a few helper functions for loading the data and visualizing results. 
From 3630b5277f68cd9805ec5f3af21b7fe9336cd46d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 11:25:19 +0300 Subject: [PATCH 149/269] Improvements in plot_poisson_regression_non_normal_loss.py --- ...plot_poisson_regression_non_normal_loss.py | 95 ++++++++++--------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index d7f9b65660453..6a98030817245 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -28,6 +28,7 @@ import pandas as pd from sklearn.datasets import fetch_openml +from sklearn.dummy import DummyRegressor from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, LinearRegression from sklearn.model_selection import train_test_split @@ -78,7 +79,7 @@ def load_mtpl2(n_samples=100000): # containing the number of claims (``ClaimNb``) with the freMTPL2sev table # containing the claim amount (``ClaimAmount``) for the same user ids. -df = load_mtpl2(n_samples=100000) +df = load_mtpl2(n_samples=50000) # Note: filter out claims with zero amount, as the severity model # requires a strictly positive target values. @@ -117,8 +118,6 @@ def load_mtpl2(n_samples=100000): # (``Exposure``). Here we model the frequency ``y = ClaimNb / Exposure``, # which is still a (scaled) Poisson distribution. # -# A very important property of the Poisson distribution is its mean-variance -# relation: The variance is proportional to the mean. df["Frequency"] = df.ClaimNb / df.Exposure @@ -135,49 +134,50 @@ def load_mtpl2(n_samples=100000): # To evaluate the pertinence of the used metrics, we will consider as a # baseline an estimator that returns 0 for any input. 
-df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) +dummy = DummyRegressor(strategy='constant', constant=0) +dummy.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) -eps = 1e-5 -print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, np.zeros(len(df_test)), - df_test.Exposure.values)) -print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, np.zeros(len(df_test)), - df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, eps + np.zeros(len(df_test)), - df_test.Exposure.values)) +############################################################################## +# +# The Poisson deviance cannot be computed on negative values predicted by the +# model, so we set the minimum predicted value to eps, + + +def score_estimator(estimator, df_test, eps=1e-5): + """Score an estimatr on the test set""" + + print("MSE: %.3f" % mean_squared_error( + df_test.Frequency.values, estimator.predict(X_test), + df_test.Exposure.values)) + print("MAE: %.3f" % mean_absolute_error( + df_test.Frequency.values, estimator.predict(X_test), + df_test.Exposure.values)) + print("mean Poisson deviance: %.3f" % mean_poisson_deviance( + df_test.Frequency.values, np.fmax(estimator.predict(X_test), eps), + df_test.Exposure.values)) + + +print("DummyRegressor") +score_estimator(dummy, df_test) ############################################################################## # # We start by modeling the target variable with the least squares linear # regression model, - linregr = LinearRegression() linregr.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) -print("LinearRegression") -print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, linregr.predict(X_test), - df_test.Exposure.values)) -print("MSE: %.3f" % mean_absolute_error( - df_test.Frequency.values, linregr.predict(X_test), - df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, np.fmax(linregr.predict(X_test), eps), - df_test.Exposure.values)) - -############################################################################## -# -# The Poisson deviance cannot be computed because negative values are -# predicted by the model, print('Number Negatives: %s / total: %s' % ( (linregr.predict(X_test) < 0).sum(), X_test.shape[0])) +print("LinearRegression") +score_estimator(linregr, df_test) + ############################################################################## # # Next we fit the Poisson regressor on the target variable, @@ -186,15 +186,7 @@ def load_mtpl2(n_samples=100000): glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) print("PoissonRegressor") -print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, glm_freq.predict(X_test), - df_test.Exposure.values)) -print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, glm_freq.predict(X_test), - df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, glm_freq.predict(X_test), - df_test.Exposure.values)) +score_estimator(glm_freq, df_test) ############################################################################## # @@ -202,19 +194,13 @@ def load_mtpl2(n_samples=100000): # still minimizes the least square error. 
-gbr = GradientBoostingRegressor(max_depth=3) +gbr = GradientBoostingRegressor() gbr.fit(X_train, df_train.Frequency.values, sample_weight=df_train.Exposure.values) print("GradientBoostingRegressor") -print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) -print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values)) -print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, np.fmax(gbr.predict(X_test), eps), - df_test.Exposure.values)) +score_estimator(gbr, df_test) ############################################################################## # @@ -231,6 +217,7 @@ def load_mtpl2(n_samples=100000): fig, axes = plt.subplots(1, 4, figsize=(16, 3)) +fig.subplots_adjust(bottom=0.2) df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=axes[0]) @@ -247,3 +234,17 @@ def load_mtpl2(n_samples=100000): yscale='log', xlabel="y (Frequency)" ) + +############################################################################## +# +# The experimental data presents a long tail distribution for ``y``. In all +# models we predict the mean expected value, so we will have necessairily fewer +# extreme values. Additionally normal distribution used in ``Ridge`` and +# ``GradientBoostingRegressor`` has a constant variance, while for the Poisson +# distribution used in ``PoissonRegressor``, the variance is proportional to +# the mean predicted value. +# +# Thus, among the considered estimators, +# ``PoissonRegressor`` and ``GradientBoostingRegressor`` are better suited for +# modeling the long tail distribution of the data as compared to the ``Ridge`` +# estimator. From 516eadba0cbddd9c016f66fc6dce41773f0350e7 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 12:28:06 +0300 Subject: [PATCH 150/269] Improvements in plot_tweedie_regression_insurance_claims.py --- ...plot_poisson_regression_non_normal_loss.py | 4 +-- ...lot_tweedie_regression_insurance_claims.py | 30 +++++++++++++++---- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 6a98030817245..9deca6c25032a 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -116,8 +116,8 @@ def load_mtpl2(n_samples=100000): # as a Poisson distribution. It is then assumed to be the number of discrete # events occurring with a constant rate in a given time interval # (``Exposure``). Here we model the frequency ``y = ClaimNb / Exposure``, -# which is still a (scaled) Poisson distribution. -# +# which is still a (scaled) Poisson distribution, and use ``Exposure`` as +# `sample_weight`. df["Frequency"] = df.ClaimNb / df.Exposure diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 24451069eeb68..f866518b69db8 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -80,7 +80,7 @@ def load_mtpl2(n_samples=100000): def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, - title=None, ax=None): + title=None, ax=None, fill_legend=False): """Plot observed and predicted - aggregated per feature level. 
Parameters @@ -95,6 +95,8 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, a column name of df with the observed target predicted : frame a dataframe, with the same index as df, with the predicted target + fill_legend : bool, default=False + wgether to show fill_between legend """ # aggregate observed and predicted variables by feature level df_ = df.loc[:, [feature, weight]].copy() @@ -109,13 +111,15 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax) y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8 - ax.fill_between( + p2 = ax.fill_between( df_.index, 0, y_max * df_[weight] / df_[weight].values.max(), color="g", alpha=0.1, ) + if fill_legend: + ax.legend([p2], ["{} distribution".format(feature)]) ax.set( ylabel=y_label if y_label is not None else None, title=title if title is not None else "Train: Observed vs Predicted", @@ -132,7 +136,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # containing the claim amount (``ClaimAmount``) for the same policy ids # (``IDpol``). -df = load_mtpl2(n_samples=100000) +df = load_mtpl2(n_samples=60000) # Note: filter out claims with zero amount, as the severity model # requires a strictly positive target values. @@ -180,9 +184,10 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # as a Poisson distribution. It is then assumed to be the number of discrete # events occuring with a constant rate in a given time interval (``Exposure``). # Here we model the frequency ``y = ClaimNb / Exposure``, -# which is still a (scaled) Poisson distribution. +# which is still a (scaled) Poisson distribution, and use ``Exposure`` as +# `sample_weight`. -df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2) +df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) # Some of the features are colinear, we use a weak penalization to avoid # numerical issues. @@ -200,6 +205,7 @@ def mean_deviance(estimator, y, y_pred, weights): def score_estimator( estimator, X_train, X_test, df_train, df_test, target, weights ): + """Evaluate an estimator on train and test sets with different metrics""" res = [] for subset_label, X, df in [ @@ -282,6 +288,7 @@ def score_estimator( y_label="Claim Frequency", title="test data", ax=ax[0, 1], + fill_legend=True ) plot_obs_pred( @@ -293,6 +300,7 @@ def score_estimator( y_label="Claim Frequency", title="test data", ax=ax[1, 0], + fill_legend=True ) plot_obs_pred( @@ -304,11 +312,17 @@ def score_estimator( y_label="Claim Frequency", title="test data", ax=ax[1, 1], + fill_legend=True ) ############################################################################## # +# According to the observed data, the frequency of accidents is higher for +# drivers younger than 30 years old, and it positively correlated with the +# `BonusMalus` variable. Out model is able to mostly correctly model +# this behaviour. +# # 3. Severity model - Gamma Distribution # --------------------------------------- # The mean claim amount or severity (`AvgClaimAmount`) can be empirically @@ -392,11 +406,15 @@ def score_estimator( y_label="Average Claim Severity", title="test data", ax=ax[1], + fill_legend=True ) ############################################################################## # +# Overall the drivers age (``DrivAge``) has a weak impact on the claim +# severity, both in observed and predicted data. +# # 4. 
Total Claims Amount -- Compound Poisson distribution # ------------------------------------------------------- # @@ -517,3 +535,5 @@ def score(self, X, y, sample_weight=None): ) print(pd.DataFrame(res).set_index("subset").T) + +plt.show() From 5e14928507f04893d131942c57e566c5a5789517 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 12:35:28 +0300 Subject: [PATCH 151/269] Drop unused ExponentialDispersionModel._upper_bound --- sklearn/linear_model/_glm/distribution.py | 7 +------ sklearn/linear_model/_glm/tests/test_distribution.py | 5 ----- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index 8779cd616cc5a..e3dfd1fcd3cf5 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -63,11 +63,7 @@ def in_y_range(self, y): y : array, shape (n_samples,) Target values. """ - if hasattr(self, '_upper_bound'): - # All currently supported distributions have an upper bound at - # +inf, however this may need to be implemented for other - # distributions - raise NotImplementedError + # Note that currently supported distributions have +inf upper bound if not isinstance(self._lower_bound, DistributionBoundary): raise TypeError('_lower_bound attribute must be of type ' @@ -236,7 +232,6 @@ class TweedieDistribution(ExponentialDispersionModel): For ``0 Date: Wed, 28 Aug 2019 13:10:55 +0300 Subject: [PATCH 152/269] Move notes and references from docstrings to user manual --- doc/modules/linear_model.rst | 12 ++++ sklearn/linear_model/_glm/glm.py | 117 ------------------------------- 2 files changed, 12 insertions(+), 117 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index fc92cd2002948..b127a2c345a36 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -986,6 +986,18 @@ of the unit variance function, :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together with :math:`s=\mathrm{exposure}` as sample weights. This is done in both examples linked below. + * The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. + * If the target y is a ratio, appropriate sample weights s should be + provided. + As an example, consider Poisson distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. + The weights are necessary for the right (finite sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distributions. + The same holds for other distributions. The estimator can be used as follows:: diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index a7128eb78df07..826f5924776b3 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -114,43 +114,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations used in solver. - - Notes - ----- - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see - :ref:`User Guide `. 
- - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - minimizing the deviance plus penalty term, which is equivalent to - (penalized) maximum likelihood estimation. - - For alpha > 0, the feature matrix X should be standardized in order to - penalize features equally strong. Call - :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``. - - If the target y is a ratio, appropriate sample weights s should be - provided. - As an example, consider Poisson distributed counts z (integers) and - weights s=exposure (time, money, persons years, ...). Then you fit - y = z/s, i.e. ``GeneralizedLinearModel(family='poisson').fit(X, y, - sample_weight=s)``. The weights are necessary for the right (finite - sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, - in this case one might say that y has a 'scaled' Poisson distributions. - The same holds for other distributions. - - References - ---------- - .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, - Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - - .. Jørgensen, B. (1992). The theory of exponential dispersion models - and analysis of deviance. Monografias de matemática, no. 51. See also - `Exponential dispersion model. - `_ """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', @@ -487,40 +450,6 @@ class PoissonRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. - - Notes - ----- - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see - :ref:`User Guide `. - - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - minimizing the deviance plus penalty term, which is equivalent to - (penalized) maximum likelihood estimation. - - For alpha > 0, the feature matrix X should be standardized in order to - penalize features equally strong. - - If the target y is a ratio, appropriate sample weights s should be - provided. - As an example, consider Poisson distributed counts z (integers) and - weights s=exposure (time, money, persons years, ...). Then you fit - y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``. - The weights are necessary for the right (finite sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, - in this case one might say that y has a 'scaled' Poisson distributions. - - References - ---------- - .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, - Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - - .. Jørgensen, B. (1992). The theory of exponential dispersion models - and analysis of deviance. Monografias de matemática, no. 51. See also - `Exponential dispersion model. - `_ """ def __init__(self, alpha=1.0, fit_intercept=True, link='log', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, @@ -605,29 +534,6 @@ class GammaRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. - - Notes - ----- - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see - :ref:`User Guide `. 
- - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - minimizing the deviance plus penalty term, which is equivalent to - (penalized) maximum likelihood estimation. - - - References - ---------- - .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, - Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - - .. Jørgensen, B. (1992). The theory of exponential dispersion models - and analysis of deviance. Monografias de matemática, no. 51. See also - `Exponential dispersion model. - `_ """ def __init__(self, alpha=1.0, fit_intercept=True, link='log', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, @@ -732,29 +638,6 @@ class TweedieRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. - - Notes - ----- - The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function - :math:`v(\\mu_i)` is a property of and given by the specific EDM, see - :ref:`User Guide `. - - The parameters :math:`w` (`coef_` and `intercept_`) are estimated by - minimizing the deviance plus penalty term, which is equivalent to - (penalized) maximum likelihood estimation. - - - References - ---------- - .. McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, - Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. - - .. Jørgensen, B. (1992). The theory of exponential dispersion models - and analysis of deviance. Monografias de matemática, no. 51. See also - `Exponential dispersion model. - `_ """ def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='log', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, From 752d6aa90bc034426b0348a4fd12f0fc3421027d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 14:01:04 +0300 Subject: [PATCH 153/269] More explanatory comments in the code --- sklearn/linear_model/_glm/glm.py | 22 +++++++++++++++++---- sklearn/linear_model/_glm/tests/test_glm.py | 3 +++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 826f5924776b3..54560dbae0867 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -462,6 +462,10 @@ def __init__(self, alpha=1.0, fit_intercept=True, link='log', @property def family(self): + # We use a property with a setter, since the GLM solver relies + # on self.family attribute, but we can't set it in __init__ according + # to scikit-learn API constraints. This attribute is made read-only + # to disallow changing distribution to other than Poisson. return "poisson" @family.setter @@ -546,6 +550,10 @@ def __init__(self, alpha=1.0, fit_intercept=True, link='log', @property def family(self): + # We use a property with a setter, since the GLM solver relies + # on self.family attribute, but we can't set it in __init__ according + # to scikit-learn API constraints. This attribute is made read-only + # to disallow changing distribution to other than Gamma. return "gamma" @family.setter @@ -571,7 +579,10 @@ class TweedieRegressor(GeneralizedLinearRegressor): Parameters ---------- power : float (default=0) - The variance power: :math:`v(\\mu) = \\mu^{power}`. + The power determines the underlying target distribution. By + definition it links distribution variance (:math:`v`) and + mean (:math:`\\mu`): :math:`v(\\mu) = \\mu^{power}`. 
+ For ``0 glm2.score(X, y) glm2.set_params(max_iter=1000) glm2.fit(X, y) + # The two model are not exactly identical since the lbfgs solver + # computes the approximate hessian from previous iterations, which + # will not be strictly identical in the case of a warm start. assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5) assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4) From 38a4ad4e44ed4c4b27c48306afc74b85710dd889 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 14:18:43 +0300 Subject: [PATCH 154/269] Fix requires_positive_y tag --- sklearn/linear_model/_glm/glm.py | 11 ++++++++++- sklearn/linear_model/_glm/tests/test_glm.py | 13 +++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 54560dbae0867..7c518bb3f8149 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -383,7 +383,16 @@ def score(self, X, y, sample_weight=None): return 1 - dev / dev_null def _more_tags(self): - return {"requires_positive_y": True} + # create the _family_instance if fit wasn't called yet. + if hasattr(self, '_family_instance'): + _family_instance = self._family_instance + elif isinstance(self.family, ExponentialDispersionModel): + _family_instance = self.family + elif self.family in EDM_DISTRIBUTIONS: + _family_instance = EDM_DISTRIBUTIONS[self.family]() + else: + raise ValueError + return {"requires_positive_y": not _family_instance.in_y_range(-1.0)} class PoissonRegressor(GeneralizedLinearRegressor): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 93d7ee1f08324..898d3c4edf9c0 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -340,3 +340,16 @@ def test_tweedie_regression_family(regression_data): # TODO: the following should not be allowed # est.family.power = 2 + + +@pytest.mark.parametrize( + 'estimator, value', + [ + (PoissonRegressor(), True), + (GammaRegressor(), True), + (TweedieRegressor(power=1.5), True), + (TweedieRegressor(power=0), False) + ], +) +def test_tags(estimator, value): + assert estimator._get_tags()['requires_positive_y'] is value From c15a1cc573a555cc1a2e35c0063f6795cc2f7d84 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 28 Aug 2019 14:56:06 +0300 Subject: [PATCH 155/269] Remove Link.inverse_derivative2 --- doc/modules/linear_model.rst | 4 ++-- sklearn/linear_model/_glm/link.py | 21 -------------------- sklearn/linear_model/_glm/tests/test_link.py | 7 ------- 3 files changed, 2 insertions(+), 30 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index b127a2c345a36..622ab335059ab 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -919,8 +919,8 @@ The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as .. math:: d(y, \mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - - loglike(y,y,\phi)\right) + \left( log P(y|\mu,\phi) + - log P(y|y,\phi)\right) The following table lists some specific EDM distributions—all are Tweedie distributions—and some of their properties. 
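As a worked special case of this definition (illustrative only, not part of the diff above): for the Poisson family with :math:`\phi=1` the log-likelihood is :math:`\log p(y|\mu) = y\log\mu - \mu - \log(y!)`, so

.. math:: d(y, \mu) = -2\left((y\log\mu - \mu) - (y\log y - y)\right)
                    = 2\left(y\log\frac{y}{\mu} - y + \mu\right),

which is the familiar Poisson unit deviance (the quantity averaged by :func:`mean_poisson_deviance`).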
diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index 04b485c067cca..b257036ac4727 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -64,17 +64,6 @@ def inverse_derivative(self, lin_pred): """ pass # pragma: no cover - @abstractmethod - def inverse_derivative2(self, lin_pred): - """Compute 2nd derivative of the inverse link function h''(lin_pred). - - Parameters - ---------- - lin_pred : array, shape (n_samples,) - Usually the (fitted) linear predictor. - """ - pass # pragma: no cover - class IdentityLink(Link): """The identity link function g(x)=x.""" @@ -91,9 +80,6 @@ def inverse(self, lin_pred): def inverse_derivative(self, lin_pred): return np.ones_like(lin_pred) - def inverse_derivative2(self, lin_pred): - return np.zeros_like(lin_pred) - class LogLink(Link): """The log link function g(x)=log(x).""" @@ -110,9 +96,6 @@ def inverse(self, lin_pred): def inverse_derivative(self, lin_pred): return np.exp(lin_pred) - def inverse_derivative2(self, lin_pred): - return np.exp(lin_pred) - class LogitLink(Link): """The logit link function g(x)=logit(x).""" @@ -129,7 +112,3 @@ def inverse(self, lin_pred): def inverse_derivative(self, lin_pred): ep = expit(lin_pred) return ep * (1 - ep) - - def inverse_derivative2(self, lin_pred): - ep = expit(lin_pred) - return ep * (1 - ep) * (1 - 2 * ep) diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py index a631509baca79..36219e09b58e3 100644 --- a/sklearn/linear_model/_glm/tests/test_link.py +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -30,10 +30,3 @@ def test_link_properties(link): # g = link, h = link.inverse assert_allclose(link.derivative(link.inverse(x)), 1 / link.inverse_derivative(x)) - - assert ( - link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) - - # for LogitLink, in the following x should be between 0 and 1 - # assert_almost_equal(link.inverse_derivative(link(x)), - # 1 / link.derivative(x), decimal=decimal) From 37de07b7b78fe7ced78da6ed068fde3e4a08425b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 30 Aug 2019 12:41:44 +0300 Subject: [PATCH 156/269] Rename p to power parameter in mean_tweedie_deviance --- doc/modules/model_evaluation.rst | 45 ++++++++--------- sklearn/metrics/regression.py | 25 +++++----- sklearn/metrics/tests/test_regression.py | 61 ++++++++++++------------ 3 files changed, 67 insertions(+), 64 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 2db5053e08cce..1fe8ece51df67 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -2028,14 +2028,14 @@ Mean Poisson, Gamma, and Tweedie deviances The :func:`mean_tweedie_deviance` function computes the `mean Tweedie deviance error `_ -with power parameter `p`. This is a metric that elicits predicted expectation +with a ``power`` parameter. This is a metric that elicits predicted expectation values of regression targets. Following special cases exist, -- when `p=0` it is equivalent to :func:`mean_squared_error`. -- when `p=1` it is equivalent to :func:`mean_poisson_deviance`. -- when `p=2` it is equivalent to :func:`mean_gamma_deviance`. +- when ``power=0`` it is equivalent to :func:`mean_squared_error`. +- when ``power=1`` it is equivalent to :func:`mean_poisson_deviance`. +- when ``power=2`` it is equivalent to :func:`mean_gamma_deviance`. 
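These special cases can be checked numerically; the following is a small sketch (illustrative toy values, and it assumes the renamed ``power`` keyword introduced in this patch)::

    import numpy as np
    from sklearn.metrics import (mean_tweedie_deviance, mean_squared_error,
                                 mean_poisson_deviance, mean_gamma_deviance)

    y_true = np.array([2.0, 0.5, 1.0, 4.0])
    y_pred = np.array([1.5, 0.5, 2.0, 2.0])

    # power=0 reduces to the mean squared error
    assert np.isclose(mean_tweedie_deviance(y_true, y_pred, power=0),
                      mean_squared_error(y_true, y_pred))
    # power=1 is the mean Poisson deviance
    assert np.isclose(mean_tweedie_deviance(y_true, y_pred, power=1),
                      mean_poisson_deviance(y_true, y_pred))
    # power=2 is the mean Gamma deviance
    assert np.isclose(mean_tweedie_deviance(y_true, y_pred, power=2),
                      mean_gamma_deviance(y_true, y_pred))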
If :math:`\hat{y}_i` is the predicted value of the :math:`i`-th sample, and :math:`y_i` is the corresponding true value, then the mean Tweedie @@ -2046,48 +2046,49 @@ deviance error (D) estimated over :math:`n_{\text{samples}}` is defined as \text{D}(y, \hat{y}) = \frac{1}{n_\text{samples}} \sum_{i=0}^{n_\text{samples} - 1} \begin{cases} - (y_i-\hat{y}_i)^2, & \text{for }p=0\text{ (Normal)}\\ - 2(y_i \log(y/\hat{y}_i) + \hat{y}_i - y_i), & \text{for }p=1\text{ (Poisson)}\\ - 2(\log(\hat{y}_i/y_i) + y_i/\hat{y}_i - 1), & \text{for }p=2\text{ (Gamma)}\\ + (y_i-\hat{y}_i)^2, & \text{for }\text{power}=0\text{ (Normal)}\\ + 2(y_i \log(y/\hat{y}_i) + \hat{y}_i - y_i), & \text{for power}=1\text{ (Poisson)}\\ + 2(\log(\hat{y}_i/y_i) + y_i/\hat{y}_i - 1), & \text{for power}=2\text{ (Gamma)}\\ 2\left(\frac{\max(y_i,0)^{2-p}}{(1-p)(2-p)}- \frac{y\,\hat{y}^{1-p}_i}{1-p}+\frac{\hat{y}^{2-p}_i}{2-p}\right), & \text{otherwise} \end{cases} -Tweedie deviance is a homogeneous function of degree ``2-p``. -Thus, Gamma distribution with `p=2` means that simultaneously scaling `y_true` -and `y_pred` has no effect on the deviance. For Poisson distribution `p=1` -the deviance scales linearly, and for Normal distribution (`p=0`), -quadratically. In general, the higher `p` the less weight is given to extreme -deviations between true and predicted targets. +Tweedie deviance is a homogeneous function of degree ``2-power``. +Thus, Gamma distribution with ``power=2`` means that simultaneously scaling +``y_true`` and ``y_pred`` has no effect on the deviance. For Poisson +distribution ``power=1`` the deviance scales linearly, and for Normal +distribution (``power=0``), quadratically. In general, the higher +``power`` the less weight is given to extreme deviations between true +and predicted targets. For instance, let's compare the two predictions 1.0 and 100 that are both 50% of their corresponding true value. -The mean squared error (``p=0``) is very sensitive to the +The mean squared error (``power=0``) is very sensitive to the prediction difference of the second point,:: >>> from sklearn.metrics import mean_tweedie_deviance - >>> mean_tweedie_deviance([1.0], [1.5], p=0) + >>> mean_tweedie_deviance([1.0], [1.5], power=0) 0.25 - >>> mean_tweedie_deviance([100.], [150.], p=0) + >>> mean_tweedie_deviance([100.], [150.], power=0) 2500.0 If we increase ``p`` to 1,:: - >>> mean_tweedie_deviance([1.0], [1.5], p=1) + >>> mean_tweedie_deviance([1.0], [1.5], power=1) 0.18... - >>> mean_tweedie_deviance([100.], [150.], p=1) + >>> mean_tweedie_deviance([100.], [150.], power=1) 18.9... -the difference in errors decreases. Finally, by setting, ``p=2``:: +the difference in errors decreases. Finally, by setting, ``power=2``:: - >>> mean_tweedie_deviance([1.0], [1.5], p=2) + >>> mean_tweedie_deviance([1.0], [1.5], power=2) 0.14... - >>> mean_tweedie_deviance([100.], [150.], p=2) + >>> mean_tweedie_deviance([100.], [150.], power=2) 0.14... -we would get identical errors. The deviance when `p=2` is thus only +we would get identical errors. The deviance when ``power=2`` is thus only sensitive to relative errors. .. 
_clustering_metrics: diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index a1ae25fec278c..73db0acc945e6 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -624,7 +624,7 @@ def max_error(y_true, y_pred): return np.max(np.abs(y_true - y_pred)) -def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): +def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): """Mean Tweedie deviance regression loss. Read more in the :ref:`User Guide `. @@ -640,20 +640,21 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): sample_weight : array-like, shape (n_samples,), optional Sample weights. - p : float, optional - Tweedie power parameter. Either p <= 0 or p >= 1. + power : float, default=0 + Tweedie power parameter. Either power <= 0 or power >= 1. The higher `p` the less weight is given to extreme deviations between true and predicted targets. - - p < 0: Extreme stable distribution. Requires: y_pred > 0. - - p = 0 : Normal distribution, output corresponds to + - power < 0: Extreme stable distribution. Requires: y_pred > 0. + - power = 0 : Normal distribution, output corresponds to mean_squared_error. y_true and y_pred can be any real numbers. - - p = 1 : Poisson distribution. Requires: y_true >= 0 and y_pred > 0. + - power = 1 : Poisson distribution. Requires: y_true >= 0 and + y_pred > 0. - 1 < p < 2 : Compound Poisson distribution. Requires: y_true >= 0 and y_pred > 0. - - p = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0. - - p = 3 : Inverse Gaussian distribution. Requires: y_true > 0 + - power = 2 : Gamma distribution. Requires: y_true > 0 and y_pred > 0. + - power = 3 : Inverse Gaussian distribution. Requires: y_true > 0 and y_pred > 0. - otherwise : Positive stable distribution. Requires: y_true > 0 and y_pred > 0. @@ -668,7 +669,7 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): >>> from sklearn.metrics import mean_tweedie_deviance >>> y_true = [2, 0, 1, 4] >>> y_pred = [0.5, 0.5, 2., 2.] - >>> mean_tweedie_deviance(y_true, y_pred, p=1) + >>> mean_tweedie_deviance(y_true, y_pred, power=1) 1.4260... """ from ..linear_model._glm.distribution import TweedieDistribution @@ -682,7 +683,7 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, p=0): sample_weight = column_or_1d(sample_weight) sample_weight = sample_weight[:, np.newaxis] - dist = TweedieDistribution(power=p) + dist = TweedieDistribution(power=power) dev = dist.unit_deviance(y_true, y_pred, check_input=True) return np.average(dev, weights=sample_weight) @@ -721,7 +722,7 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None): 1.4260... """ return mean_tweedie_deviance( - y_true, y_pred, sample_weight=sample_weight, p=1 + y_true, y_pred, sample_weight=sample_weight, power=1 ) @@ -759,5 +760,5 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): 1.0568... """ return mean_tweedie_deviance( - y_true, y_pred, sample_weight=sample_weight, p=2 + y_true, y_pred, sample_weight=sample_weight, power=2 ) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 6dc31676357f5..0f987a088bb84 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -36,7 +36,7 @@ def test_regression_metrics(n_samples=50): assert_almost_equal(max_error(y_true, y_pred), 1.) assert_almost_equal(r2_score(y_true, y_pred), 0.995, 2) assert_almost_equal(explained_variance_score(y_true, y_pred), 1.) 
- assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=0), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=0), mean_squared_error(y_true, y_pred)) # Tweedie deviance needs positive y_pred, except for p=0, @@ -45,15 +45,15 @@ def test_regression_metrics(n_samples=50): y_true = np.arange(1, 1 + n_samples) y_pred = 2 * y_true n = n_samples - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=-1), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=-1), 5/12 * n * (n**2 + 2 * n + 1)) - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=1), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=1), (n + 1) * (1 - np.log(2))) - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=2), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=2), 2 * np.log(2) - 1) - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=3/2), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=3/2), ((6 * np.sqrt(2) - 8) / n) * np.sqrt(y_true).sum()) - assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, p=3), + assert_almost_equal(mean_tweedie_deviance(y_true, y_pred, power=3), np.sum(1 / y_true) / (4 * n)) @@ -101,40 +101,41 @@ def test_regression_metrics_at_limits(): mean_squared_log_error, [1., -2., 3.], [1., 2., 3.]) # Tweedie deviance error - p = -1.2 - assert_allclose(mean_tweedie_deviance([0], [1.], p=p), - 2./(2.-p), rtol=1e-3) + power = -1.2 + assert_allclose(mean_tweedie_deviance([0], [1.], power=power), + 2 / (2 - power), rtol=1e-3) with pytest.raises(ValueError, match="can only be used on strictly positive mu."): - mean_tweedie_deviance([0.], [0.], p=p) - assert_almost_equal(mean_tweedie_deviance([0.], [0.], p=0), 0.00, 2) + mean_tweedie_deviance([0.], [0.], power=power) + assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2) msg = "only be used on non-negative y and strictly positive mu." with pytest.raises(ValueError, match=msg): - mean_tweedie_deviance([0.], [0.], p=1.0) + mean_tweedie_deviance([0.], [0.], power=1.0) - p = 1.5 - assert_allclose(mean_tweedie_deviance([0.], [1.], p=p), 2./(2.-p)) + power = 1.5 + assert_allclose(mean_tweedie_deviance([0.], [1.], power=power), + 2 / (2 - power)) msg = "only be used on non-negative y and strictly positive mu." with pytest.raises(ValueError, match=msg): - mean_tweedie_deviance([0.], [0.], p=p) - p = 2. - assert_allclose(mean_tweedie_deviance([1.], [1.], p=p), 0.00, + mean_tweedie_deviance([0.], [0.], power=power) + power = 2. + assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) msg = "can only be used on strictly positive y and mu." with pytest.raises(ValueError, match=msg): - mean_tweedie_deviance([0.], [0.], p=p) - p = 3. - assert_allclose(mean_tweedie_deviance([1.], [1.], p=p), + mean_tweedie_deviance([0.], [0.], power=power) + power = 3. + assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) msg = "can only be used on strictly positive y and mu." 
with pytest.raises(ValueError, match=msg): - mean_tweedie_deviance([0.], [0.], p=p) + mean_tweedie_deviance([0.], [0.], power=power) with pytest.raises(ValueError, match="is only defined for power<=0 and power>=1"): - mean_tweedie_deviance([0.], [0.], p=0.5) + mean_tweedie_deviance([0.], [0.], power=0.5) def test__check_reg_targets(): @@ -274,21 +275,21 @@ def test_tweedie_deviance_continuity(): y_true = np.random.RandomState(0).rand(n_samples) + 0.1 y_pred = np.random.RandomState(1).rand(n_samples) + 0.1 - assert_allclose(mean_tweedie_deviance(y_true, y_pred, p=0 - 1e-10), - mean_tweedie_deviance(y_true, y_pred, p=0)) + assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=0 - 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=0)) # Ws we get closer to the limit, with 1e-12 difference the absolute # tolerance to pass the below check increases. There are likely # numerical precision issues on the edges of different definition # regions. - assert_allclose(mean_tweedie_deviance(y_true, y_pred, p=1 + 1e-10), - mean_tweedie_deviance(y_true, y_pred, p=1), + assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=1 + 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=1), atol=1e-6) - assert_allclose(mean_tweedie_deviance(y_true, y_pred, p=2 - 1e-10), - mean_tweedie_deviance(y_true, y_pred, p=2), + assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=2 - 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=2), atol=1e-6) - assert_allclose(mean_tweedie_deviance(y_true, y_pred, p=2 + 1e-10), - mean_tweedie_deviance(y_true, y_pred, p=2), + assert_allclose(mean_tweedie_deviance(y_true, y_pred, power=2 + 1e-10), + mean_tweedie_deviance(y_true, y_pred, power=2), atol=1e-6) From adbf997ba53d68e53a8d5997ffecd98ce4e13863 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 30 Aug 2019 12:07:41 +0200 Subject: [PATCH 157/269] Rename predicted mean mu to y_pred --- sklearn/linear_model/_glm/distribution.py | 173 +++++++++--------- sklearn/linear_model/_glm/glm.py | 37 ++-- sklearn/linear_model/_glm/link.py | 42 ++--- .../_glm/tests/test_distribution.py | 4 +- 4 files changed, 132 insertions(+), 124 deletions(-) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index e3dfd1fcd3cf5..a5e42bcee5d1c 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -28,17 +28,17 @@ def _safe_lin_pred(X, coef): class ExponentialDispersionModel(metaclass=ABCMeta): r"""Base class for reproductive Exponential Dispersion Models (EDM). - The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + The pdf of :math:`Y\sim \mathrm{EDM}(y_\textrm{pred}, \phi)` is given by .. math:: p(y| \theta, \phi) = c(y, \phi) \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) = \tilde{c}(y, \phi) - \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + \exp\left(-\frac{d(y, y_\textrm{pred})}{2\phi}\right) - with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, - variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, - unit variance :math:`v(\mu)` and - unit deviance :math:`d(y,\mu)`. + with mean :math:`\mathrm{E}[Y] = A'(\theta) = y_\textrm{pred}`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(y_\textrm{pred})`, + unit variance :math:`v(y_\textrm{pred})` and + unit deviance :math:`d(y,y_\textrm{pred})`. Methods ------- @@ -75,58 +75,60 @@ def in_y_range(self, y): return np.greater(y, self._lower_bound.value) @abstractmethod - def unit_variance(self, mu): + def unit_variance(self, y_pred): r"""Compute the unit variance function. 
- The unit variance :math:`v(\mu)` determines the variance as - a function of the mean :math:`\mu` by - :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. - It can also be derived from the unit deviance :math:`d(y,\mu)` as + The unit variance :math:`v(y_\textrm{pred})` determines the variance as + a function of the mean :math:`y_\textrm{pred}` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(y_\textrm{pred}_i)`. + It can also be derived from the unit deviance + :math:`d(y,y_\textrm{pred})` as - .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ - \partial\mu^2}}\big|_{y=\mu} + .. math:: v(y_\textrm{pred}) = \frac{2}{ + \frac{\partial^2 d(y,y_\textrm{pred})}{ + \partialy_\textrm{pred}^2}}\big|_{y=y_\textrm{pred}} See also :func:`variance`. Parameters ---------- - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. """ pass # pragma: no cover @abstractmethod - def unit_variance_derivative(self, mu): - r"""Compute the derivative of the unit variance w.r.t. mu. + def unit_variance_derivative(self, y_pred): + r"""Compute the derivative of the unit variance w.r.t. y_pred. - Return :math:`v'(\mu)`. + Return :math:`v'(y_\textrm{pred})`. Parameters ---------- - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Target values. """ pass # pragma: no cover @abstractmethod - def unit_deviance(self, y, mu, check_input=False): + def unit_deviance(self, y, y_pred, check_input=False): r"""Compute the unit deviance. - The unit_deviance :math:`d(y,\mu)` can be defined by the + The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the log-likelihood as - :math:`d(y,\mu) = -2\phi\cdot - \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + :math:`d(y,y_\textrm{pred}) = -2\phi\cdot + \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).` Parameters ---------- y : array, shape (n_samples,) Target values. - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. check_input : bool, default=False - If True raise an exception on invalid y or mu values, otherwise + If True raise an exception on invalid y or y_pred values, otherwise they will be propagated as NaN. Returns ------- @@ -135,31 +137,33 @@ def unit_deviance(self, y, mu, check_input=False): """ pass # pragma: no cover - def unit_deviance_derivative(self, y, mu): - r"""Compute the derivative of the unit deviance w.r.t. mu. + def unit_deviance_derivative(self, y, y_pred): + r"""Compute the derivative of the unit deviance w.r.t. y_pred. The derivative of the unit deviance is given by - :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` - with unit variance :math:`v(\mu)`. + :math:`\frac{\partial}{\partialy_\textrm{pred}}d(y,y_\textrm{pred}) + = -2\frac{y-y_\textrm{pred}}{v(y_\textrm{pred})}` + with unit variance :math:`v(y_\textrm{pred})`. Parameters ---------- y : array, shape (n_samples,) Target values. - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. """ - return -2 * (y - mu) / self.unit_variance(mu) + return -2 * (y - y_pred) / self.unit_variance(y_pred) - def deviance(self, y, mu, weights=1): + def deviance(self, y, y_pred, weights=1): r"""Compute the deviance. The deviance is a weighted sum of the per sample unit deviances, - :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` - with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + :math:`D = \sum_i s_i \cdot d(y_i, y_\textrm{pred}_i)` + with weights :math:`s_i` and unit deviance + :math:`d(y,y_\textrm{pred})`. 
In terms of the log-likelihood it is :math:`D = -2\phi\cdot - \left(loglike(y,\mu,\frac{phi}{s}) + \left(loglike(y,y_\textrm{pred},\frac{phi}{s}) - loglike(y,y,\frac{phi}{s})\right)`. Parameters @@ -167,51 +171,52 @@ def deviance(self, y, mu, weights=1): y : array, shape (n_samples,) Target values. - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return np.sum(weights * self.unit_deviance(y, mu)) + return np.sum(weights * self.unit_deviance(y, y_pred)) - def deviance_derivative(self, y, mu, weights=1): - """Compute the derivative of the deviance w.r.t. mu. + def deviance_derivative(self, y, y_pred, weights=1): + r"""Compute the derivative of the deviance w.r.t. y_pred. - It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. + It gives :math:`\frac{\partial}{\partial y_\textrm{pred}} + D(y, \y_\textrm{pred}; weights)`. Parameters ---------- y : array, shape (n_samples,) Target values. - mu : array, shape (n_samples,) + y_pred : array, shape (n_samples,) Predicted mean. weights : array, shape (n_samples,) (default=1) Weights or exposure to which variance is inverse proportional. """ - return weights * self.unit_deviance_derivative(y, mu) + return weights * self.unit_deviance_derivative(y, y_pred) - def _mu_deviance_derivative(self, coef, X, y, weights, link): - """Compute mu and the derivative of the deviance w.r.t coef.""" + def _y_pred_deviance_derivative(self, coef, X, y, weights, link): + """Compute y_pred and the derivative of the deviance w.r.t coef.""" lin_pred = _safe_lin_pred(X, coef) - mu = link.inverse(lin_pred) + y_pred = link.inverse(lin_pred) d1 = link.inverse_derivative(lin_pred) - temp = d1 * self.deviance_derivative(y, mu, weights) + temp = d1 * self.deviance_derivative(y, y_pred, weights) if coef.size == X.shape[1] + 1: devp = np.concatenate(([temp.sum()], temp @ X)) else: devp = temp @ X # same as X.T @ temp - return mu, devp + return y_pred, devp class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. - A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely - defined by it's mean-variance relationship - :math:`\mathrm{Var}[Y] \propto \mu^power`. + A Tweedie distribution with mean :math:`y_\textrm{pred}=\mathrm{E}[Y]` + is uniquely defined by it's mean-variance relationship + :math:`\mathrm{Var}[Y] \propto y_\textrm{pred}^power`. Special cases are: @@ -228,7 +233,7 @@ class TweedieDistribution(ExponentialDispersionModel): ---------- power : float (default=0) The variance power of the `unit_variance` - :math:`v(\mu) = \mu^{power}`. + :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`. 
For ``0 0 - if (mu <= 0).any(): - raise ValueError(message + "strictly positive mu.") + # 'Extreme stable', y any realy number, y_pred > 0 + if (y_pred <= 0).any(): + raise ValueError(message + "strictly positive y_pred.") elif p == 0: - # Normal, y and mu can be any real number + # Normal, y and y_pred can be any real number pass elif 0 < p < 1: raise ValueError("Tweedie deviance is only defined for " "power<=0 and power>=1.") elif 1 <= p < 2: - # Poisson and Compount poisson distribution, y >= 0, mu > 0 - if (y < 0).any() or (mu <= 0).any(): + # Poisson and Compount poisson distribution, y >= 0, y_pred > 0 + if (y < 0).any() or (y_pred <= 0).any(): raise ValueError(message + "non-negative y and strictly " - "positive mu.") + "positive y_pred.") elif p >= 2: - # Gamma and Extreme stable distribution, y and mu > 0 - if (y <= 0).any() or (mu <= 0).any(): - raise ValueError(message + "strictly positive y and mu.") + # Gamma and Extreme stable distribution, y and y_pred > 0 + if (y <= 0).any() or (y_pred <= 0).any(): + raise ValueError(message + + "strictly positive y and y_pred.") else: # pragma: nocover # Unreachable statement raise ValueError if p < 0: - # 'Extreme stable', y any realy number, mu > 0 + # 'Extreme stable', y any realy number, y_pred > 0 dev = 2 * (np.power(np.maximum(y, 0), 2-p) / ((1-p) * (2-p)) - - y * np.power(mu, 1-p) / (1-p) - + np.power(mu, 2-p) / (2-p)) + - y * np.power(y_pred, 1-p) / (1-p) + + np.power(y_pred, 2-p) / (2-p)) elif p == 0: - # Normal distribution, y and mu any real number - dev = (y - mu)**2 + # Normal distribution, y and y_pred any real number + dev = (y - y_pred)**2 elif p < 1: raise ValueError("Tweedie deviance is only defined for power<=0 " "and power>=1.") elif p == 1: # Poisson distribution - dev = 2 * (xlogy(y, y/mu) - y + mu) + dev = 2 * (xlogy(y, y/y_pred) - y + y_pred) elif p == 2: # Gamma distribution - dev = 2 * (np.log(mu/y) + y/mu - 1) + dev = 2 * (np.log(y_pred/y) + y/y_pred - 1) else: dev = 2 * (np.power(y, 2-p) / ((1-p) * (2-p)) - - y * np.power(mu, 1-p) / (1-p) - + np.power(mu, 2-p) / (2-p)) + - y * np.power(y_pred, 1-p) / (1-p) + + np.power(y_pred, 2-p) / (2-p)) return dev diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 7c518bb3f8149..cab89766c0b92 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -31,8 +31,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a Generalized Linear Model (GLM) with penalties. GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). Therefore, - the fit minimizes the following objective function with L2 + fitting and predicting the mean of the target y as y_pred=h(X*w). + Therefore, the fit minimizes the following objective function with L2 priors as regularizer:: 1/(2*sum(s)) * deviance(y, h(X*w); s) @@ -66,8 +66,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): link : {'auto', 'identity', 'log'} or an instance of class Link, \ optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor - (X*coef) to expectation (mu). Option 'auto' sets the link depending on - the chosen family as follows: + (X*coef) to expectation (y_pred). Option 'auto' sets the link + depending on the chosen family as follows: - 'identity' for family 'normal' @@ -146,9 +146,9 @@ def fit(self, X, y, sample_weight=None): optional (default=None) Individual weights w_i for each sample. 
Note that for an Exponential Dispersion Model (EDM), one has - Var[Y_i]=phi/w_i * v(mu). - If Y_i ~ EDM(mu, phi/w_i), then - sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + Var[Y_i]=phi/w_i * v(y_pred). + If Y_i ~ EDM(y_pred, phi/w_i), then + sum(w*Y)/sum(w) ~ EDM(y_pred, phi/sum(w)), i.e. the mean of y is a weighted average with weights=sample_weight. Returns @@ -267,10 +267,10 @@ def fit(self, X, y, sample_weight=None): if solver == 'lbfgs': def func(coef, X, y, weights, alpha, family, link): - mu, devp = family._mu_deviance_derivative( + y_pred, devp = family._y_pred_deviance_derivative( coef, X, y, weights, link ) - dev = family.deviance(y, mu, weights) + dev = family.deviance(y, y_pred, weights) intercept = (coef.size == X.shape[1] + 1) idx = 1 if intercept else 0 # offset if coef[0] is intercept coef_scaled = alpha * coef[idx:] @@ -337,8 +337,8 @@ def predict(self, X): """ # check_array is done in _linear_predictor eta = self._linear_predictor(X) - mu = self._link_instance.inverse(eta) - return mu + y_pred = self._link_instance.inverse(eta) + return y_pred def score(self, X, y, sample_weight=None): """Compute D^2, the percentage of deviance explained. @@ -376,8 +376,8 @@ def score(self, X, y, sample_weight=None): # TODO: make D^2 a score function in module metrics (and thereby get # input validation and so on) weights = _check_sample_weight(sample_weight, X) - mu = self.predict(X) - dev = self._family_instance.deviance(y, mu, weights=weights) + y_pred = self.predict(X) + dev = self._family_instance.deviance(y, y_pred, weights=weights) y_mean = np.average(y, weights=weights) dev_null = self._family_instance.deviance(y, y_mean, weights=weights) return 1 - dev / dev_null @@ -399,7 +399,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): """Regression with the response variable y following a Poisson distribution GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). + fitting and predicting the mean of the target y as y_pred=h(X*w). The fit minimizes the following objective function with L2 regularization:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 @@ -487,7 +487,7 @@ class GammaRegressor(GeneralizedLinearRegressor): """Regression with the response variable y following a Gamma distribution GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). + fitting and predicting the mean of the target y as y_pred=h(X*w). The fit minimizes the following objective function with L2 regularization:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 @@ -572,10 +572,10 @@ def family(self, value): class TweedieRegressor(GeneralizedLinearRegressor): - """Regression with the response variable y following a Tweedie distribution + r"""Regression with the response variable y following a Tweedie distribution GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as mu=h(X*w). + fitting and predicting the mean of the target y as y_pred=h(X*w). The fit minimizes the following objective function with L2 regularization:: 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 @@ -590,7 +590,8 @@ class TweedieRegressor(GeneralizedLinearRegressor): power : float (default=0) The power determines the underlying target distribution. 
By definition it links distribution variance (:math:`v`) and - mean (:math:`\\mu`): :math:`v(\\mu) = \\mu^{power}`. + mean (:math:`\y_\textrm{pred}`): + :math:`v(\y_\textrm{pred}) = \y_\textrm{pred}^{power}`. For ``0 Date: Fri, 30 Aug 2019 12:13:43 +0200 Subject: [PATCH 158/269] Fix link parameter documentation in TweedieRegression --- sklearn/linear_model/_glm/glm.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index cab89766c0b92..d6da8b8b80949 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -460,12 +460,12 @@ class PoissonRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, link='log', + def __init__(self, alpha=1.0, fit_intercept=True, solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="poisson", link=link, + family="poisson", link='log', solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) @@ -548,12 +548,12 @@ class GammaRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, link='log', - solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + def __init__(self, alpha=1.0, fit_intercept=True, solver='lbfgs', + max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="gamma", link=link, + family="gamma", link='log', solver=solver, max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) @@ -619,6 +619,15 @@ class TweedieRegressor(GeneralizedLinearRegressor): case, the design matrix X must have full column rank (no collinearities). + link : {'auto', 'identity', 'log'}, default='auto' + The link function of the GLM, i.e. mapping from linear predictor + (X*coef) to expectation (y_pred). Option 'auto' sets the link + depending on the chosen family as follows: + + - 'identity' for Normal distribution + + - 'log' for Poisson, Gamma or Inverse Gaussian distributions + fit_intercept : boolean, optional (default=True) Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). @@ -659,9 +668,9 @@ class TweedieRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in solver. 
""" - def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='log', - solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, + link='auto', solver='lbfgs', max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, From 3b526e98ce11514090cbdad890f3c2d13664cc83 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Fri, 30 Aug 2019 12:44:05 +0200 Subject: [PATCH 159/269] EXA Use a simpler pipeline for GBDT in poisson regression example --- ...plot_poisson_regression_non_normal_loss.py | 66 ++++++++++++++----- 1 file changed, 48 insertions(+), 18 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 9deca6c25032a..7ab772fb59920 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -34,6 +34,7 @@ from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.ensemble import GradientBoostingRegressor @@ -134,10 +135,14 @@ def load_mtpl2(n_samples=100000): # To evaluate the pertinence of the used metrics, we will consider as a # baseline an estimator that returns 0 for any input. -df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) +df_train, df_test = train_test_split(df, random_state=0) -dummy = DummyRegressor(strategy='constant', constant=0) -dummy.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) +dummy = make_pipeline( + column_trans, + DummyRegressor(strategy='constant', constant=0) +) +dummy.fit(df_train, df_train.Frequency, + dummyregressor__sample_weight=df_train.Exposure) ############################################################################## # @@ -149,14 +154,14 @@ def score_estimator(estimator, df_test, eps=1e-5): """Score an estimatr on the test set""" print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, estimator.predict(X_test), + df_test.Frequency.values, estimator.predict(df_test), df_test.Exposure.values)) print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, estimator.predict(X_test), + df_test.Frequency.values, estimator.predict(df_test), df_test.Exposure.values)) print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, np.fmax(estimator.predict(X_test), eps), + df_test.Frequency.values, np.fmax(estimator.predict(df_test), eps), df_test.Exposure.values)) @@ -168,12 +173,14 @@ def score_estimator(estimator, df_test, eps=1e-5): # We start by modeling the target variable with the least squares linear # regression model, -linregr = LinearRegression() -linregr.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) +linregr = make_pipeline(column_trans, LinearRegression()) +linregr.fit(df_train, df_train.Frequency, + linearregression__sample_weight=df_train.Exposure) print('Number Negatives: %s / total: %s' % ( - (linregr.predict(X_test) < 0).sum(), X_test.shape[0])) + (linregr.predict(df_train) < 0).sum(), + df_train.shape[0])) print("LinearRegression") score_estimator(linregr, 
df_test) @@ -182,8 +189,12 @@ def score_estimator(estimator, df_test, eps=1e-5): # # Next we fit the Poisson regressor on the target variable, -glm_freq = PoissonRegressor(alpha=0, max_iter=1000) -glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) +glm_freq = make_pipeline( + column_trans, + PoissonRegressor(alpha=0, max_iter=1000) +) +glm_freq.fit(df_train, df_train.Frequency, + poissonregressor__sample_weight=df_train.Exposure) print("PoissonRegressor") score_estimator(glm_freq, df_test) @@ -191,12 +202,31 @@ def score_estimator(estimator, df_test, eps=1e-5): ############################################################################## # # Finally we will consider a non linear model with Gradient boosting that -# still minimizes the least square error. - - -gbr = GradientBoostingRegressor() -gbr.fit(X_train, df_train.Frequency.values, - sample_weight=df_train.Exposure.values) +# still minimizes the least square error. Gradient Boostring Decision Trees do +# not require for categorical data to be one hot encoded, therefore here we use +# a simpler pre-processing pipeline without ``KBinsDiscretizer`` and with +# ``OrdinalEncoder`` instead of ``OneHotEncoder``. + + +gbr = make_pipeline( + ColumnTransformer( + [ + ( + "Veh_Brand_Gas_Region", + OrdinalEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"], + ), + ("Continious", "passthrough", ["VehAge", "DrivAge", "BonusMalus"]), + ("Density_log", make_pipeline( + FunctionTransformer(np.log, validate=False), StandardScaler()), + ["Density"]), + ], + remainder="drop", + ), + GradientBoostingRegressor() +) +gbr.fit(df_train, df_train.Frequency.values, + gradientboostingregressor__sample_weight=df_train.Exposure.values) print("GradientBoostingRegressor") @@ -224,7 +254,7 @@ def score_estimator(estimator, df_test, eps=1e-5): axes[0].set_title('Experimental data') for idx, model in enumerate([linregr, glm_freq, gbr]): - y_pred = model.predict(X_train) + y_pred = model.predict(df_train) pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=axes[idx+1]) axes[idx + 1].set_title(model.__class__.__name__) From b1eb611f1d7364af25f1d8f4222ff0a9cb494efa Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Sep 2019 16:49:52 +0200 Subject: [PATCH 160/269] Minor fixes for user guide --- doc/modules/linear_model.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 622ab335059ab..29c329f6f333a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -874,7 +874,7 @@ with 'log' loss, which might be even faster but requires more tuning. It is possible to obtain the p-values and confidence intervals for coefficients in cases of regression without penalization. The `statsmodels package ` natively supports this. - Within sklearn, one could use bootstrapping instead as well. + Within sklearn, one could use bootstrapping instead as well. :class:`LogisticRegressionCV` implements Logistic Regression with built-in @@ -919,8 +919,8 @@ The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as .. math:: d(y, \mu) = -2\phi\cdot - \left( log P(y|\mu,\phi) - - log P(y|y,\phi)\right) + \left( \log p(y|\mu,\phi) + - \log p(y|y,\phi)\right). The following table lists some specific EDM distributions—all are Tweedie distributions—and some of their properties. 
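To make the usage guidance in the hunks that follow concrete, here is a minimal sketch (illustrative only, not part of the patch; it assumes the ``TweedieRegressor`` API added earlier in this series, where ``power`` selects the assumed target distribution)::

    import numpy as np
    from sklearn.linear_model import TweedieRegressor

    X = np.array([[1.0], [2.0], [3.0], [4.0]])
    y = np.array([0.5, 1.0, 2.0, 4.5])  # strictly positive target

    # power=0 ~ Normal, power=1 ~ Poisson, power=2 ~ Gamma,
    # 1 < power < 2 ~ compound Poisson-Gamma (e.g. total claim amounts).
    for power in (0, 1, 1.5, 2):
        glm = TweedieRegressor(power=power, alpha=0.1, link='log')
        glm.fit(X, y)
        print(power, glm.predict([[2.5]]))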
@@ -939,7 +939,7 @@ Usage ----- In the following use cases, a loss different from the squared loss might be -appropriate, +appropriate: * If the target values :math:`y` are counts (non-negative integer valued) or frequencies (non-negative), you might use a Poisson deviance with log-link. @@ -960,7 +960,7 @@ log-link `link='log'` with :math:`h(x^\top w)=\exp(x^\top w)`. :class:`TweedieRegressor` implements a generalized linear model for the Tweedie distribution, that allows to model any of the above mentioned distributions using the appropriate ``power`` parameter, i.e. the exponent -of the unit variance function, +of the unit variance function: - ``power = 0``: Normal distribution. Specialized solvers such as :class:`Ridge`, :class:`ElasticNet` are generally @@ -987,15 +987,15 @@ of the unit variance function, together with :math:`s=\mathrm{exposure}` as sample weights. This is done in both examples linked below. * The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. - * If the target y is a ratio, appropriate sample weights s should be + the first two moments to be :math:`E[Y_i]=\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\frac{\phi}{s_i} v(\mu_i)`. + * If the target `y` is a ratio, appropriate sample weights ``s`` should be provided. As an example, consider Poisson distributed counts z (integers) and weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + Consider :math:`\bar{y} = \frac{\\sum_i s_i y_i}{\sum_i s_i}`, in this case one might say that y has a 'scaled' Poisson distributions. The same holds for other distributions. From d964c01324a9945253549fbfb2ec1d3b3b18b79f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 1 Sep 2019 19:01:08 +0200 Subject: [PATCH 161/269] EXA Poisson: minor changes --- ...plot_poisson_regression_non_normal_loss.py | 37 +++++++++++-------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 7ab772fb59920..b71fba9236ef5 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -3,11 +3,11 @@ Poisson regression and non-normal loss ====================================== -This example illustrates the use of linear Poisson regression +This example illustrates the use of log-linear Poisson regression on the French Motor Third-Party Liability Claims dataset [1] and compares it with models learned with least squared error. The goal is to predict the -number of insurance claims (or frequency) following car accidents for a user -given historical data over a population of users. +number of insurance claims (or frequency) following car accidents for a +policyholder given historical data over a population of policyholders. We start by defining a few helper functions for loading the data and visualizing results. @@ -48,7 +48,8 @@ def load_mtpl2(n_samples=100000): Parameters ---------- n_samples: int, default=100000 - number of samples to select (for faster run time). + number of samples to select (for faster run time). Full dataset has + 678013 samples. 
""" # freMTPL2freq dataset from https://www.openml.org/d/41214 @@ -76,14 +77,15 @@ def load_mtpl2(n_samples=100000): # 1. Loading datasets and pre-processing # -------------------------------------- # -# We construct the freMTPL2 dataset by joining the freMTPL2freq table, +# We construct the freMTPL2 dataset by joining the freMTPL2freq table, # containing the number of claims (``ClaimNb``) with the freMTPL2sev table -# containing the claim amount (``ClaimAmount``) for the same user ids. +# containing the claim amount (``ClaimAmount``) for the same policy ids +# (``IDpol``). df = load_mtpl2(n_samples=50000) # Note: filter out claims with zero amount, as the severity model -# requires a strictly positive target values. +# requires strictly positive target values. df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 # correct for unreasonable observations (that might be data error) @@ -116,9 +118,9 @@ def load_mtpl2(n_samples=100000): # The number of claims (``ClaimNb``) is a positive integer that can be modeled # as a Poisson distribution. It is then assumed to be the number of discrete # events occurring with a constant rate in a given time interval -# (``Exposure``). Here we model the frequency ``y = ClaimNb / Exposure``, -# which is still a (scaled) Poisson distribution, and use ``Exposure`` as -# `sample_weight`. +# (``Exposure``, in units of years). Here we model the frequency +# ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, +# and use ``Exposure`` as `sample_weight`. df["Frequency"] = df.ClaimNb / df.Exposure @@ -126,20 +128,23 @@ def load_mtpl2(n_samples=100000): pd.cut(df.Frequency, [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() ) +print("Average Frequency = {}" + .format(np.average(df.Frequency, weights=df.Exposure))) + ############################################################################## # -# It worth noting that 96 % of users have 0 claims, and if we were to convert -# this problem into a binary classification task, it would be significantly -# imbalanced. +# It worth noting that 96 % of policyholders have zero claims, and if we were +# to convert this problem into a binary classification task, it would be +# significantly imbalanced. # # To evaluate the pertinence of the used metrics, we will consider as a -# baseline an estimator that returns 0 for any input. +# baseline an estimator that returns the mean of the training sample. 
df_train, df_test = train_test_split(df, random_state=0) dummy = make_pipeline( column_trans, - DummyRegressor(strategy='constant', constant=0) + DummyRegressor(strategy='mean') ) dummy.fit(df_train, df_train.Frequency, dummyregressor__sample_weight=df_train.Exposure) @@ -257,7 +262,7 @@ def score_estimator(estimator, df_test, eps=1e-5): y_pred = model.predict(df_train) pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=axes[idx+1]) - axes[idx + 1].set_title(model.__class__.__name__) + axes[idx + 1].set_title(model[-1].__class__.__name__) for axi in axes: axi.set( From a1844b8543a9a43fadfc0523e48cd0135e58ba37 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 2 Sep 2019 20:29:56 +0200 Subject: [PATCH 162/269] Fix mu->y_pred and p->power --- doc/whats_new/v0.22.rst | 16 ++++++++-------- .../plot_tweedie_regression_insurance_claims.py | 4 ++-- sklearn/metrics/regression.py | 4 ++-- sklearn/metrics/scorer.py | 4 ++-- sklearn/metrics/tests/test_common.py | 4 ++-- sklearn/metrics/tests/test_regression.py | 10 +++++----- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 6e3d4822b261d..d6d52732ba714 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -241,10 +241,10 @@ Changelog :user:`Mohamed Maskani `, and :user:`Thomas Fan `. - |Feature| Add :class:`metrics.mean_tweedie_deviance` measuring the - Tweedie deviance for a power parameter ``p``. Also add mean Poisson deviance - :class:`metrics.mean_poisson_deviance` and mean Gamma deviance + Tweedie deviance for a power parameter ``power``. Also add mean Poisson + deviance :class:`metrics.mean_poisson_deviance` and mean Gamma deviance :class:`metrics.mean_gamma_deviance` that are special cases of the Tweedie - deviance for `p=1` and `p=2` respectively. + deviance for `power=1` and `power=2` respectively. :pr:`13938` by :user:`Christian Lorentzen ` and `Roman Yurchak`_. @@ -306,19 +306,19 @@ Changelog - |Enhancement| SVM now throws more specific error when fit on non-square data and kernel = precomputed. :class:`svm.BaseLibSVM` :pr:`14336` by :user:`Gregory Dexter `. - + :mod:`sklearn.tree` ................... - |Feature| Adds minimal cost complexity pruning, controlled by ``ccp_alpha``, to :class:`tree.DecisionTreeClassifier`, :class:`tree.DecisionTreeRegressor`, :class:`tree.ExtraTreeClassifier`, :class:`tree.ExtraTreeRegressor`, - :class:`ensemble.RandomForestClassifier`, + :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`, - :class:`ensemble.ExtraTreesClassifier`, + :class:`ensemble.ExtraTreesClassifier`, :class:`ensemble.ExtraTreesRegressor`, - :class:`ensemble.RandomTreesEmbedding`, - :class:`ensemble.GradientBoostingClassifier`, + :class:`ensemble.RandomTreesEmbedding`, + :class:`ensemble.GradientBoostingClassifier`, and :class:`ensemble.GradientBoostingRegressor`. :pr:`12887` by `Thomas Fan`_. diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index f866518b69db8..eb9769814ade5 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -426,7 +426,7 @@ class ClaimProdEstimator: """Total claim amount estimator. Computed as the product of the frequency model by the serverity model, - denormalized by exposure. Use Tweedie deviance with `p=1.5`. + denormalized by exposure. Use Tweedie deviance with `power=1.5`. 
""" def __init__(self, est_freq, est_sev): @@ -527,7 +527,7 @@ def score(self, X, y, sample_weight=None): "predicted, frequency*severity model": np.sum( est_prod.predict(X, exposure=df.Exposure.values) ), - "predicted, tweedie, p=%.2f" + "predicted, tweedie, power=%.2f" % glm_total.best_estimator_.family.power: np.sum( glm_total.best_estimator_.predict(X) ), diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 73db0acc945e6..706c484334d21 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -693,7 +693,7 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None): """Mean Poisson deviance regression loss. Poisson deviance is equivalent to the Tweedie deviance with - the power parameter `p=1`. + the power parameter `power=1`. Read more in the :ref:`User Guide `. @@ -730,7 +730,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): """Mean Gamma deviance regression loss. Gamma deviance is equivalent to the Tweedie deviance with - the power parameter `p=2`. It is invariant to scaling of + the power parameter `power=2`. It is invariant to scaling of the target variable, and mesures relative errors. Read more in the :ref:`User Guide `. diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index bf2892bdf83a2..e2496c83b666d 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -499,11 +499,11 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, greater_is_better=False, squared=False) neg_mean_poisson_deviance_scorer = make_scorer( - mean_tweedie_deviance, p=1., greater_is_better=False + mean_tweedie_deviance, power=1., greater_is_better=False ) neg_mean_gamma_deviance_scorer = make_scorer( - mean_tweedie_deviance, p=2., greater_is_better=False + mean_tweedie_deviance, power=2., greater_is_better=False ) # Standard Classification Scores diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 6459f93c68449..a8cabe984e563 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -102,11 +102,11 @@ "median_absolute_error": median_absolute_error, "explained_variance_score": explained_variance_score, "r2_score": partial(r2_score, multioutput='variance_weighted'), - "mean_normal_deviance": partial(mean_tweedie_deviance, p=0), + "mean_normal_deviance": partial(mean_tweedie_deviance, power=0), "mean_poisson_deviance": mean_poisson_deviance, "mean_gamma_deviance": mean_gamma_deviance, "mean_compound_poisson_deviance": - partial(mean_tweedie_deviance, p=1.4), + partial(mean_tweedie_deviance, power=1.4), } CLASSIFICATION_METRICS = { diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 0f987a088bb84..c3947db5ed857 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -105,31 +105,31 @@ def test_regression_metrics_at_limits(): assert_allclose(mean_tweedie_deviance([0], [1.], power=power), 2 / (2 - power), rtol=1e-3) with pytest.raises(ValueError, - match="can only be used on strictly positive mu."): + match="can only be used on strictly positive y_pred."): mean_tweedie_deviance([0.], [0.], power=power) assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2) - msg = "only be used on non-negative y and strictly positive mu." + msg = "only be used on non-negative y and strictly positive y_pred." 
with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=1.0) power = 1.5 assert_allclose(mean_tweedie_deviance([0.], [1.], power=power), 2 / (2 - power)) - msg = "only be used on non-negative y and strictly positive mu." + msg = "only be used on non-negative y and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 2. assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y and mu." + msg = "can only be used on strictly positive y and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 3. assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00, atol=1e-8) - msg = "can only be used on strictly positive y and mu." + msg = "can only be used on strictly positive y and y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) From f5133920b47070f5b252dc4d8015745195130c44 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 3 Sep 2019 18:34:08 +0200 Subject: [PATCH 163/269] EXA Tweedie: some improvements --- ...lot_tweedie_regression_insurance_claims.py | 56 +++++++++++-------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index eb9769814ade5..22a26d880a869 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -13,8 +13,8 @@ which are: 1. Model the number of claims with a Poisson distribution, the average - claim amount as a Gamma distribution and multiply the predictions of both in - order to get the total claim amount. + claim amount per claim, also known as severity, as a Gamma distribution and + multiply the predictions of both in order to get the total claim amount. 2. Model total claim amount directly, typically with a Tweedie distribution of Tweedie power :math:`p \\in (1, 2)`. @@ -42,6 +42,7 @@ from sklearn.compose import ColumnTransformer from sklearn.linear_model import PoissonRegressor, GammaRegressor from sklearn.linear_model import TweedieRegressor +from sklearn.metrics import mean_tweedie_deviance from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder @@ -56,7 +57,8 @@ def load_mtpl2(n_samples=100000): Parameters ---------- n_samples: int, default=100000 - number of samples to select (for faster run time). + number of samples to select (for faster run time). Full dataset has + 678013 samples. """ # freMTPL2freq dataset from https://www.openml.org/d/41214 @@ -139,7 +141,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, df = load_mtpl2(n_samples=60000) # Note: filter out claims with zero amount, as the severity model -# requires a strictly positive target values. +# requires strictly positive target values. df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 # Correct for unreasonable observations (that might be data error) @@ -182,10 +184,10 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # # The number of claims (``ClaimNb``) is a positive integer that can be modeled # as a Poisson distribution. 
It is then assumed to be the number of discrete -# events occuring with a constant rate in a given time interval (``Exposure``). -# Here we model the frequency ``y = ClaimNb / Exposure``, -# which is still a (scaled) Poisson distribution, and use ``Exposure`` as -# `sample_weight`. +# events occuring with a constant rate in a given time interval +# (``Exposure``, in units of years). Here we model the frequency +# ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, +# and use ``Exposure`` as `sample_weight`. df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) @@ -197,7 +199,10 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, def mean_deviance(estimator, y, y_pred, weights): if hasattr(estimator, "_family_instance"): - return estimator._family_instance.deviance(y, y_pred, weights) / len(y) + if weights is None: + weights = np.ones_like(y) + return (estimator._family_instance.deviance(y, y_pred, weights) + / np.sum(weights)) else: return np.nan @@ -320,10 +325,10 @@ def score_estimator( # # According to the observed data, the frequency of accidents is higher for # drivers younger than 30 years old, and it positively correlated with the -# `BonusMalus` variable. Out model is able to mostly correctly model +# `BonusMalus` variable. Our model is able to mostly correctly model # this behaviour. # -# 3. Severity model - Gamma Distribution +# 3. Severity model - Gamma distribution # --------------------------------------- # The mean claim amount or severity (`AvgClaimAmount`) can be empirically # shown to follow approximately a Gamma distribution. We fit a GLM model for @@ -333,7 +338,7 @@ def score_estimator( # # - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support # on :math:`(0, \infty)`, not :math:`[0, \infty)`. -# - We use ``ClaimNb`` as sample weights. +# - We use ``ClaimNb`` as `sample_weight`. mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 @@ -360,6 +365,8 @@ def score_estimator( ############################################################################## # +# Here, the scores for the test data call for caution as they are significantly +# worse than for the training data indicating an overfit. # Note that the resulting model is the average claim amount per claim. As such, # it is conditional on having at least one claim, and cannot be used to predict # the average claim amount per policy in general. @@ -412,10 +419,10 @@ def score_estimator( ############################################################################## # -# Overall the drivers age (``DrivAge``) has a weak impact on the claim +# Overall, the drivers age (``DrivAge``) has a weak impact on the claim # severity, both in observed and predicted data. # -# 4. Total Claims Amount -- Compound Poisson distribution +# 4. Total claim amount -- Compound Poisson distribution # ------------------------------------------------------- # # As mentionned in the introduction, the total claim amount can be modeled @@ -426,12 +433,16 @@ class ClaimProdEstimator: """Total claim amount estimator. Computed as the product of the frequency model by the serverity model, - denormalized by exposure. Use Tweedie deviance with `power=1.5`. + denormalized by exposure. For scores, use Tweedie deviance with + `power=1.5`. 
""" def __init__(self, est_freq, est_sev): + from sklearn.linear_model._glm.distribution import TweedieDistribution + self.est_freq = est_freq self.est_sev = est_sev + self._family_instance = TweedieDistribution(power=1.5) def predict(self, X, exposure): """Predict the total claim amount. @@ -442,14 +453,13 @@ def predict(self, X, exposure): def score(self, X, y, sample_weight=None): """Compute D², the percentage of deviance explained.""" - # TODO: remove this private import once d2_score is available - from sklearn.linear_model._glm.distribution import TweedieDistribution - + # TODO: use d2_score directly once it is available mu = self.predict(X, exposure=sample_weight) - family = TweedieDistribution(power=1.5) - dev = family.deviance(y, mu, weights=sample_weight) - y_mean = np.average(y, weights=sample_weight) - dev_null = family.deviance(y, y_mean, weights=sample_weight) + dev = mean_tweedie_deviance( + y, mu, sample_weight=sample_weight, power=1.5) + y_mean = np.average(y, weights=sample_weight) * np.ones_like(y) + dev_null = mean_tweedie_deviance( + y, y_mean, sample_weight=sample_weight, power=1.5) return 1. - dev / dev_null @@ -475,7 +485,7 @@ def score(self, X, y, sample_weight=None): from sklearn.model_selection import GridSearchCV -# exclude upper bound as power=2 does not support null y values. +# exclude upper bound as power>=2 does not support y=0. params = {"power": np.linspace(1 + 1e-4, 2 - 1e-4, 8)} From 84229a6d5fbe6cf9964f573496c66fe8c88bd2ab Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 3 Sep 2019 21:19:44 +0200 Subject: [PATCH 164/269] Fix doc test --- doc/modules/linear_model.rst | 2 +- .../linear_model/plot_poisson_regression_non_normal_loss.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 29c329f6f333a..ee418af1d414b 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1004,7 +1004,7 @@ The estimator can be used as follows:: >>> from sklearn.linear_model import TweedieRegressor >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2]) - TweedieRegressor(alpha=0.5, power=1) + TweedieRegressor(alpha=0.5, link='log', power=1) >>> reg.coef_ array([0.2463..., 0.4337...]) >>> reg.intercept_ diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index b71fba9236ef5..d739c37d2bb60 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -206,8 +206,8 @@ def score_estimator(estimator, df_test, eps=1e-5): ############################################################################## # -# Finally we will consider a non linear model with Gradient boosting that -# still minimizes the least square error. Gradient Boostring Decision Trees do +# Finally, we will consider a non linear model with Gradient boosting that +# still minimizes the least square error. Gradient Boosting Decision Trees do # not require for categorical data to be one hot encoded, therefore here we use # a simpler pre-processing pipeline without ``KBinsDiscretizer`` and with # ``OrdinalEncoder`` instead of ``OneHotEncoder``. 
From 8c6c255cbad9cae5c82b5154f94f9a6a14cc6b3a Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 11 Sep 2019 15:20:10 +0200 Subject: [PATCH 165/269] Fix test --- sklearn/metrics/tests/test_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py index 17bf7f828948c..f29e7d2ad1c13 100644 --- a/sklearn/metrics/tests/test_regression.py +++ b/sklearn/metrics/tests/test_regression.py @@ -118,7 +118,7 @@ def test_regression_metrics_at_limits(): power = 1.5 assert_allclose(mean_tweedie_deviance([0.], [1.], power=power), 2 / (2 - power)) - msg = "only be used on non-negative y_true and strictly positive y_pred." + msg = "only be used on non-negative y and strictly positive y_pred." with pytest.raises(ValueError, match=msg): mean_tweedie_deviance([0.], [0.], power=power) power = 2. From 0a2331385daca833429b82a3493471cb45329ac4 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 12 Sep 2019 17:17:41 +0200 Subject: [PATCH 166/269] EXA Use Ridge and remove eps --- ...plot_poisson_regression_non_normal_loss.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index d739c37d2bb60..4fb16f6419209 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -30,7 +30,7 @@ from sklearn.datasets import fetch_openml from sklearn.dummy import DummyRegressor from sklearn.compose import ColumnTransformer -from sklearn.linear_model import PoissonRegressor, LinearRegression +from sklearn.linear_model import Ridge, PoissonRegressor from sklearn.model_selection import train_test_split from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder @@ -152,22 +152,28 @@ def load_mtpl2(n_samples=100000): ############################################################################## # # The Poisson deviance cannot be computed on negative values predicted by the -# model, so we set the minimum predicted value to eps, +# model, so all models need to return positive preditions if we intend to +# use this metric, -def score_estimator(estimator, df_test, eps=1e-5): +def score_estimator(estimator, df_test): """Score an estimatr on the test set""" + y_pred = estimator.predict(df_test) + print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, estimator.predict(df_test), + df_test.Frequency.values, y_pred, df_test.Exposure.values)) print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, estimator.predict(df_test), + df_test.Frequency.values, y_pred, df_test.Exposure.values)) + # ignore negative predictions + mask = y_pred > 0 + print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values, np.fmax(estimator.predict(df_test), eps), - df_test.Exposure.values)) + df_test.Frequency.values[mask], y_pred[mask], + df_test.Exposure.values[mask])) print("DummyRegressor") @@ -178,16 +184,16 @@ def score_estimator(estimator, df_test, eps=1e-5): # We start by modeling the target variable with the least squares linear # regression model, -linregr = make_pipeline(column_trans, LinearRegression()) +linregr = make_pipeline(column_trans, Ridge(alpha=1.0)) linregr.fit(df_train, df_train.Frequency, - linearregression__sample_weight=df_train.Exposure) + 
ridge__sample_weight=df_train.Exposure) print('Number Negatives: %s / total: %s' % ( (linregr.predict(df_train) < 0).sum(), df_train.shape[0])) -print("LinearRegression") +print("Ridge") score_estimator(linregr, df_test) ############################################################################## @@ -196,7 +202,7 @@ def score_estimator(estimator, df_test, eps=1e-5): glm_freq = make_pipeline( column_trans, - PoissonRegressor(alpha=0, max_iter=1000) + PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) ) glm_freq.fit(df_train, df_train.Frequency, poissonregressor__sample_weight=df_train.Exposure) From 976b436ebbeff0f6d21f9ae06352cb6bb25c174e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 16 Sep 2019 13:26:16 +0200 Subject: [PATCH 167/269] Address comments in plot_poisson_regression_non_normal_loss.py --- ...plot_poisson_regression_non_normal_loss.py | 73 +++++++++++-------- 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 4fb16f6419209..fa1bc09a76285 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -23,6 +23,8 @@ # Authors: Christian Lorentzen # Roman Yurchak # License: BSD 3 clause +import warnings + import numpy as np import matplotlib.pyplot as plt import pandas as pd @@ -36,7 +38,7 @@ from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.ensemble import GradientBoostingRegressor +from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error, mean_absolute_error from sklearn.metrics import mean_poisson_deviance @@ -149,11 +151,6 @@ def load_mtpl2(n_samples=100000): dummy.fit(df_train, df_train.Frequency, dummyregressor__sample_weight=df_train.Exposure) -############################################################################## -# -# The Poisson deviance cannot be computed on negative values predicted by the -# model, so all models need to return positive preditions if we intend to -# use this metric, def score_estimator(estimator, df_test): @@ -168,11 +165,17 @@ def score_estimator(estimator, df_test): df_test.Frequency.values, y_pred, df_test.Exposure.values)) - # ignore negative predictions + # ignore negative predictions, as they are invalid for + # the Poisson deviance mask = y_pred > 0 + if (~mask).any(): + warnings.warn("estimator yields negative predictions for {} samples " + "out of {}. 
These will be ignored while computing the " + "poisson deviance".format((~mask).sum(), mask.shape[0])) print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values[mask], y_pred[mask], + df_test.Frequency.values[mask], + y_pred[mask], df_test.Exposure.values[mask])) @@ -184,14 +187,21 @@ def score_estimator(estimator, df_test): # We start by modeling the target variable with the least squares linear # regression model, -linregr = make_pipeline(column_trans, Ridge(alpha=1.0)) +linregr = make_pipeline( + column_trans, + Ridge(alpha=1.0) +) linregr.fit(df_train, df_train.Frequency, ridge__sample_weight=df_train.Exposure) - -print('Number Negatives: %s / total: %s' % ( - (linregr.predict(df_train) < 0).sum(), - df_train.shape[0])) +############################################################################## +# +# The Poisson deviance cannot be computed on negative values predicted by the +# model. For models that do return a few negative predictions +# (e.g. :class:`linear_model.Ridge`) we ignore the corresponding samples, +# meaning that the obtained Poisson deviance is approximate. An alternative +# apporach could be to use class:`compose.TransformedTargetRegressor` +# meta-estimator to map ``y_pred`` to strictly positive domain. print("Ridge") score_estimator(linregr, df_test) @@ -212,40 +222,38 @@ def score_estimator(estimator, df_test): ############################################################################## # -# Finally, we will consider a non linear model with Gradient boosting that -# still minimizes the least square error. Gradient Boosting Decision Trees do +# Finally, we will consider a non linear model with a random forest that +# still minimizes the least square error. Random forest does # not require for categorical data to be one hot encoded, therefore here we use -# a simpler pre-processing pipeline without ``KBinsDiscretizer`` and with -# ``OrdinalEncoder`` instead of ``OneHotEncoder``. +# a simpler pre-processing pipeline with :class:`preprocessing.OrdinalEncoder`, gbr = make_pipeline( ColumnTransformer( [ ( - "Veh_Brand_Gas_Region", - OrdinalEncoder(), + "Veh_Brand_Gas_Region", OrdinalEncoder(), ["VehBrand", "VehPower", "VehGas", "Region", "Area"], ), - ("Continious", "passthrough", ["VehAge", "DrivAge", "BonusMalus"]), - ("Density_log", make_pipeline( - FunctionTransformer(np.log, validate=False), StandardScaler()), - ["Density"]), + ( + "Continious", "passthrough", + ["VehAge", "DrivAge", "BonusMalus", "Density"] + ), ], remainder="drop", ), - GradientBoostingRegressor() + RandomForestRegressor(min_weight_fraction_leaf=1e-2) ) gbr.fit(df_train, df_train.Frequency.values, - gradientboostingregressor__sample_weight=df_train.Exposure.values) + randomforestregressor__sample_weight=df_train.Exposure.values) -print("GradientBoostingRegressor") +print("RandomForestRegressor") score_estimator(gbr, df_test) ############################################################################## # -# In this example, although Gradient boosting minimizes the least square error, +# In this example, although random forest minimizes the least square error, # because of a higher predictive power it also results in a smaller Poisson # deviance than the Poisson regression model. # @@ -281,11 +289,12 @@ def score_estimator(estimator, df_test): # The experimental data presents a long tail distribution for ``y``. In all # models we predict the mean expected value, so we will have necessairily fewer # extreme values. 
Additionally normal distribution used in ``Ridge`` and -# ``GradientBoostingRegressor`` has a constant variance, while for the Poisson +# ``RandomForestRegressor`` has a constant variance, while for the Poisson # distribution used in ``PoissonRegressor``, the variance is proportional to # the mean predicted value. # -# Thus, among the considered estimators, -# ``PoissonRegressor`` and ``GradientBoostingRegressor`` are better suited for -# modeling the long tail distribution of the data as compared to the ``Ridge`` -# estimator. +# Thus, among the considered estimators, ``PoissonRegressor`` is better suited +# for modeling the long tail distribution of the data as compared to the +# ``Ridge`` and ``RandomForestRegressor`` estimators. + +plt.show() From 7c850d1a7c9a5f468ee0136ee91e2d662aefa4da Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 16 Sep 2019 13:28:06 +0200 Subject: [PATCH 168/269] Lint --- examples/linear_model/plot_poisson_regression_non_normal_loss.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index fa1bc09a76285..5098016f22913 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -152,7 +152,6 @@ def load_mtpl2(n_samples=100000): dummyregressor__sample_weight=df_train.Exposure) - def score_estimator(estimator, df_test): """Score an estimatr on the test set""" From f64dc4a4c51128c6d1017911e23760a866ca4007 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 16 Sep 2019 15:24:30 +0200 Subject: [PATCH 169/269] Simplify plot_tweedie_regression_insurance_claims.py example --- ...plot_poisson_regression_non_normal_loss.py | 2 - ...lot_tweedie_regression_insurance_claims.py | 85 ++++++------------- 2 files changed, 25 insertions(+), 62 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 5098016f22913..769f321ff1562 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -295,5 +295,3 @@ def score_estimator(estimator, df_test): # Thus, among the considered estimators, ``PoissonRegressor`` is better suited # for modeling the long tail distribution of the data as compared to the # ``Ridge`` and ``RandomForestRegressor`` estimators. 
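A small numeric illustration of the variance argument above: under the squared error implicitly assumed by ``Ridge`` and ``RandomForestRegressor``, observing ``y=10`` when predicting ``20`` costs one hundred times more than observing ``y=1`` when predicting ``2``, whereas the Poisson deviance, whose implied variance grows with the predicted mean, penalizes it only about ten times more.

from sklearn.metrics import mean_poisson_deviance, mean_squared_error

print(mean_squared_error([1.], [2.]), mean_squared_error([10.], [20.]))
# -> 1.0 and 100.0
print(mean_poisson_deviance([1.], [2.]), mean_poisson_deviance([10.], [20.]))
# -> approximately 0.61 and 6.14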
- -plt.show() diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 22a26d880a869..5a312f656d9ce 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -197,16 +197,6 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) -def mean_deviance(estimator, y, y_pred, weights): - if hasattr(estimator, "_family_instance"): - if weights is None: - weights = np.ones_like(y) - return (estimator._family_instance.deviance(y, y_pred, weights) - / np.sum(weights)) - else: - return np.nan - - def score_estimator( estimator, X_train, X_test, df_train, df_test, target, weights ): @@ -221,18 +211,25 @@ def score_estimator( for score_label, metric in [ ("D² explained", None), - ("mean deviance", partial(mean_deviance, estimator)), + ("mean deviance", mean_tweedie_deviance), ("mean abs. error", mean_absolute_error), ("mean squared error", mean_squared_error), ]: - if estimator.__class__.__name__ == "ClaimProdEstimator": - # ClaimProdEstimator is the product of frequency and severity - # models, denormalized by the exposure values. - # It does not fully follow the scikit-learn API and we - # must handle it separately. - y_pred = estimator.predict(X, exposure=df.Exposure.values) + if isinstance(estimator, tuple) and len(estimator) == 2: + # Score the model consisting of the product of frequency and + # severity models, denormalized by the exposure values. + est_freq, est_sev = estimator + y_pred = (df.Exposure.values * est_freq.predict(X) + * est_sev.predict(X)) + power = 1.5 else: y_pred = estimator.predict(X) + power = getattr(getattr(estimator, "_family_instance"), + "power") + + if score_label == "mean deviance": + metric = partial(mean_tweedie_deviance, power=power) + if metric is None: if not hasattr(estimator, "score"): continue @@ -248,7 +245,8 @@ def score_estimator( pd.DataFrame(res) .set_index(["metric", "subset"]) .score.unstack(-1) - .round(3) + .round(2) + .loc[:, ['train', 'test']] ) return res @@ -425,48 +423,16 @@ def score_estimator( # 4. Total claim amount -- Compound Poisson distribution # ------------------------------------------------------- # -# As mentionned in the introduction, the total claim amount can be modeled +# As mentioned in the introduction, the total claim amount can be modeled # either as the product of the frequency model by the severity model, +# denormalized by exposure. In the following code sample, the +# ``score_estimator`` is extended to score such a model. The mean deviance +# is computed assuming a Tweedie distribution with ``power=1.5`` to be +# comparable with the model from the following section, -class ClaimProdEstimator: - """Total claim amount estimator. - - Computed as the product of the frequency model by the serverity model, - denormalized by exposure. For scores, use Tweedie deviance with - `power=1.5`. - """ - - def __init__(self, est_freq, est_sev): - from sklearn.linear_model._glm.distribution import TweedieDistribution - - self.est_freq = est_freq - self.est_sev = est_sev - self._family_instance = TweedieDistribution(power=1.5) - - def predict(self, X, exposure): - """Predict the total claim amount. - - The predict method is not compatible with the scikit-learn API. 
- """ - return exposure * self.est_freq.predict(X) * self.est_sev.predict(X) - - def score(self, X, y, sample_weight=None): - """Compute D², the percentage of deviance explained.""" - # TODO: use d2_score directly once it is available - mu = self.predict(X, exposure=sample_weight) - dev = mean_tweedie_deviance( - y, mu, sample_weight=sample_weight, power=1.5) - y_mean = np.average(y, weights=sample_weight) * np.ones_like(y) - dev_null = mean_tweedie_deviance( - y, y_mean, sample_weight=sample_weight, power=1.5) - return 1. - dev / dev_null - - -est_prod = ClaimProdEstimator(glm_freq, glm_sev) - scores = score_estimator( - est_prod, + (glm_freq, glm_sev), X_train, X_test, df_train, @@ -479,7 +445,8 @@ def score(self, X, y, sample_weight=None): ############################################################################## # -# or as a unique Compound Poisson model, also corresponding to a Tweedie model +# Indeed, an alternative approach for modeling the total loss is with a unique +# Compound Poisson model, also corresponding to a Tweedie model # with a power :math:`p \in (1, 2)`. We determine the optimal hyperparameter # ``p`` with a grid search, @@ -535,7 +502,7 @@ def score(self, X, y, sample_weight=None): "subset": subset_label, "observed": df.ClaimAmount.values.sum(), "predicted, frequency*severity model": np.sum( - est_prod.predict(X, exposure=df.Exposure.values) + df.Exposure.values*glm_freq.predict(X)*glm_sev.predict(X) ), "predicted, tweedie, power=%.2f" % glm_total.best_estimator_.family.power: np.sum( @@ -545,5 +512,3 @@ def score(self, X, y, sample_weight=None): ) print(pd.DataFrame(res).set_index("subset").T) - -plt.show() From b1f5bde2ea12f7195d5efcf30d3a28fa9db1ef7f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 18 Sep 2019 16:57:48 +0200 Subject: [PATCH 170/269] Add "lift curve" for model validation in Poisson example --- ...plot_poisson_regression_non_normal_loss.py | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 769f321ff1562..3cdbc7cc1a789 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -39,6 +39,7 @@ from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.ensemble import RandomForestRegressor +from sklearn.utils import gen_batches from sklearn.metrics import mean_squared_error, mean_absolute_error from sklearn.metrics import mean_poisson_deviance @@ -295,3 +296,93 @@ def score_estimator(estimator, df_test): # Thus, among the considered estimators, ``PoissonRegressor`` is better suited # for modeling the long tail distribution of the data as compared to the # ``Ridge`` and ``RandomForestRegressor`` estimators. +# +# To ensure that estimators yield reasonable predictions for different +# policyholder types, we can bin test samples according to `y_pred` returned by +# each model. Then for each bin, compare the mean predicted `y_pred`, with +# the mean observed target. + + +def _lift_curve(y_true, y_pred, sample_weights=None, n_bins=100): + """Compare predictions and observations for bins + ordered by y_pred + + We order the samples by ``y_pred`` and split it in bins. + In each bin the observed mean is compared with the predicted + mean. 
+ + Parameters + ---------- + y_true: array-like of shape (n_samples,) + Ground truth (correct) target values. + y_pred: array-like of shape (n_samples,) + Estimated target values. + sample_weight : array-like of shape (n_samples,) + Sample weights. + n_bins: int + number of bins to use + + Returns + ------- + bin_centers: ndarray of shape (n_bins,) + bin centers + y_true_bin: ndarray of shape (n_bins,) + average y_pred for each bin + y_pred_bin: ndarray of shape (n_bins,) + average y_pred for each bin + """ + idx_sort = np.argsort(y_pred) + + bin_centers = np.arange(0, 1, 1/n_bins) + 0.5/n_bins + + y_pred_bin = np.zeros(n_bins) + y_true_bin = np.zeros(n_bins) + bin_size = len(y_true) // n_bins + for n, sl in enumerate(gen_batches(len(y_true), bin_size)): + weights = sample_weights[idx_sort][sl] + y_pred_bin[n] = np.average( + y_pred[idx_sort][sl], weights=weights + ) + y_true_bin[n] = np.average( + y_true[idx_sort][sl], + weights=weights + ) + return bin_centers, y_true_bin, y_pred_bin + + +fig, ax = plt.subplots(1, 3, figsize=(12, 3.2)) +plt.subplots_adjust(wspace=0.3) + + +for axi, (label, model, color) in zip(ax, [ + ('Ridge', linregr, 'b'), + ('PoissonRegressor', glm_freq, 'k'), + ('Random Forest', gbr, 'r') +]): + y_pred = model.predict(df_test) + + q, y_true_seg, y_pred_seg = _lift_curve( + df_test.Frequency.values, + y_pred, + sample_weights=df_test.Exposure.values, + n_bins=10) + + axi.plot(q, y_pred_seg, 'o'+color, label="predictions", ms=5) + axi.step(q, y_true_seg, '--'+color, label="observations", + where='mid') + axi.set_xlim(0, 1.0) + axi.set( + title=label, + xlabel='Fraction of samples sorted by y_pred', + ylabel='Mean Frequency (y_pred)' + + ) + + axi.legend() + + +############################################################################## +# +# On the above figure, ``PoissonRegressor`` is the model which presents the +# best consistency between predicted and observed targets, both for low +# and high target values. 
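The per-bin comparison implemented by this helper can also be sketched compactly with pandas; the arrays below are synthetic and only stand in for test-set predictions, observations and exposures.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
y_pred = rng.gamma(shape=1.0, scale=0.1, size=1000)   # toy predicted frequencies
y_true = rng.poisson(y_pred).astype(float)            # toy observed frequencies
exposure = np.ones_like(y_pred)                       # toy exposures

df_eval = pd.DataFrame({"y_true": y_true, "y_pred": y_pred,
                        "exposure": exposure})
# group the samples into deciles of y_pred and compare the exposure-weighted
# mean prediction with the exposure-weighted mean observation in each decile
df_eval["bin"] = pd.qcut(df_eval["y_pred"], 10, labels=False)
summary = df_eval.groupby("bin").apply(
    lambda g: pd.Series({
        "mean y_pred": np.average(g["y_pred"], weights=g["exposure"]),
        "mean y_true": np.average(g["y_true"], weights=g["exposure"]),
    })
)
print(summary)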
From a9ab4e4975d3b8b1bf27e73561bbadc921aac5bf Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 25 Sep 2019 19:24:11 +0200 Subject: [PATCH 171/269] Various improvements to the model comparison example --- ...plot_poisson_regression_non_normal_loss.py | 153 +++++++++--------- 1 file changed, 79 insertions(+), 74 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 3cdbc7cc1a789..5c044d5530bd8 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -39,7 +39,7 @@ from sklearn.preprocessing import OrdinalEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.ensemble import RandomForestRegressor -from sklearn.utils import gen_batches +from sklearn.utils import gen_even_slices from sklearn.metrics import mean_squared_error, mean_absolute_error from sklearn.metrics import mean_poisson_deviance @@ -149,8 +149,8 @@ def load_mtpl2(n_samples=100000): column_trans, DummyRegressor(strategy='mean') ) -dummy.fit(df_train, df_train.Frequency, - dummyregressor__sample_weight=df_train.Exposure) +dummy.fit(df_train, df_train["Frequency"], + dummyregressor__sample_weight=df_train["Exposure"]) def score_estimator(estimator, df_test): @@ -159,11 +159,11 @@ def score_estimator(estimator, df_test): y_pred = estimator.predict(df_test) print("MSE: %.3f" % mean_squared_error( - df_test.Frequency.values, y_pred, - df_test.Exposure.values)) + df_test["Frequency"], y_pred, + df_test["Exposure"])) print("MAE: %.3f" % mean_absolute_error( - df_test.Frequency.values, y_pred, - df_test.Exposure.values)) + df_test["Frequency"], y_pred, + df_test["Exposure"])) # ignore negative predictions, as they are invalid for # the Poisson deviance @@ -174,12 +174,12 @@ def score_estimator(estimator, df_test): "poisson deviance".format((~mask).sum(), mask.shape[0])) print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test.Frequency.values[mask], + df_test["Frequency"][mask], y_pred[mask], - df_test.Exposure.values[mask])) + df_test["Exposure"][mask])) -print("DummyRegressor") +print("Constant mean frequency evaluation:") score_estimator(dummy, df_test) ############################################################################## @@ -187,12 +187,12 @@ def score_estimator(estimator, df_test): # We start by modeling the target variable with the least squares linear # regression model, -linregr = make_pipeline( +ridge = make_pipeline( column_trans, Ridge(alpha=1.0) ) -linregr.fit(df_train, df_train.Frequency, - ridge__sample_weight=df_train.Exposure) +ridge.fit(df_train, df_train["Frequency"], + ridge__sample_weight=df_train["Exposure"]) ############################################################################## # @@ -203,32 +203,33 @@ def score_estimator(estimator, df_test): # apporach could be to use class:`compose.TransformedTargetRegressor` # meta-estimator to map ``y_pred`` to strictly positive domain. 
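As a complement to the single train / test split used in this example, a cross-validated check of the Poisson model could look as follows, assuming the fitted ``poisson`` pipeline and ``df_train`` defined in the example; exposure weights are omitted here for brevity.

from sklearn.model_selection import cross_validate

cv_result = cross_validate(poisson, df_train, df_train["Frequency"],
                           cv=3, scoring="neg_mean_poisson_deviance")
print("CV mean Poisson deviance: %.3f"
      % -cv_result["test_score"].mean())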
-print("Ridge") -score_estimator(linregr, df_test) +print("Ridge evaluation:") +score_estimator(ridge, df_test) ############################################################################## # # Next we fit the Poisson regressor on the target variable, -glm_freq = make_pipeline( +poisson = make_pipeline( column_trans, PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) ) -glm_freq.fit(df_train, df_train.Frequency, - poissonregressor__sample_weight=df_train.Exposure) +poisson.fit(df_train, df_train["Frequency"], + poissonregressor__sample_weight=df_train["Exposure"]) -print("PoissonRegressor") -score_estimator(glm_freq, df_test) +print("PoissonRegressor evaluation:") +score_estimator(poisson, df_test) ############################################################################## # -# Finally, we will consider a non linear model with a random forest that -# still minimizes the least square error. Random forest does -# not require for categorical data to be one hot encoded, therefore here we use -# a simpler pre-processing pipeline with :class:`preprocessing.OrdinalEncoder`, - - -gbr = make_pipeline( +# Finally, we will consider a non-linear model, namely a random forest. Random +# forests do not require the categorical data to be one-hot encoded, instead +# we encode each category label with an arbirtrary integer using +# :class:`preprocessing.OrdinalEncoder` to make the model faster to train (the +# same information is encoded with a small number of features than with +# one-hot encoding). + +rf = make_pipeline( ColumnTransformer( [ ( @@ -242,53 +243,55 @@ def score_estimator(estimator, df_test): ], remainder="drop", ), - RandomForestRegressor(min_weight_fraction_leaf=1e-2) + RandomForestRegressor(min_weight_fraction_leaf=0.01, n_jobs=2) ) -gbr.fit(df_train, df_train.Frequency.values, - randomforestregressor__sample_weight=df_train.Exposure.values) +rf.fit(df_train, df_train["Frequency"].values, + randomforestregressor__sample_weight=df_train["Exposure"].values) + +print("RandomForestRegressor evaluation:") +score_estimator(rf, df_test) -print("RandomForestRegressor") -score_estimator(gbr, df_test) ############################################################################## # -# In this example, although random forest minimizes the least square error, -# because of a higher predictive power it also results in a smaller Poisson -# deviance than the Poisson regression model. +# The random forest model also minimizes the conditional least square error. +# However because of a higher predictive power it also results in a smaller +# Poisson deviance than the Poisson regression model. # -# Evaluating models with a single train / test split is prone to numerical -# errors, we can verify that we would also get equivalent resuts with the -# cross-validation score. +# Not that Evaluating models with a single train / test split is prone to +# random fluctuations. We can verify that we would also get equivalent +# conclusions with cross-validated performance metrics. 
# -# The difference between these models can also be visualized by comparing the +# The qualitative difference between these models can also be visualized by comparing the # histogram of observed target values with that of predicted values, fig, axes = plt.subplots(1, 4, figsize=(16, 3)) fig.subplots_adjust(bottom=0.2) +n_bins = 20 +df_train["Frequency"].hist(bins=np.linspace(-1, 10, n_bins), ax=axes[0]) -df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=axes[0]) +axes[0].set_title("Data") +axes[0].set_xlabel("y (observed Frequency)") -axes[0].set_title('Experimental data') - -for idx, model in enumerate([linregr, glm_freq, gbr]): +for idx, model in enumerate([ridge, poisson, rf]): y_pred = model.predict(df_train) - pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=axes[idx+1]) + pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), ax=axes[idx+1]) axes[idx + 1].set_title(model[-1].__class__.__name__) for axi in axes: axi.set( yscale='log', - xlabel="y (Frequency)" + xlabel="y_pred (predicted expected Frequency)" ) ############################################################################## # # The experimental data presents a long tail distribution for ``y``. In all -# models we predict the mean expected value, so we will have necessairily fewer -# extreme values. Additionally normal distribution used in ``Ridge`` and +# models we predict the mean expected value, so we will have necessairily +# fewer extreme values. Additionally normal distribution used in ``Ridge`` and # ``RandomForestRegressor`` has a constant variance, while for the Poisson # distribution used in ``PoissonRegressor``, the variance is proportional to # the mean predicted value. @@ -298,14 +301,13 @@ def score_estimator(estimator, df_test): # ``Ridge`` and ``RandomForestRegressor`` estimators. # # To ensure that estimators yield reasonable predictions for different -# policyholder types, we can bin test samples according to `y_pred` returned by -# each model. Then for each bin, compare the mean predicted `y_pred`, with -# the mean observed target. +# policyholder types, we can bin test samples according to `y_pred` returned +# by each model. Then for each bin, compare the mean predicted `y_pred`, with +# the mean observed target: -def _lift_curve(y_true, y_pred, sample_weights=None, n_bins=100): - """Compare predictions and observations for bins - ordered by y_pred +def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100): + """Compare predictions and observations for bins ordered by y_pred We order the samples by ``y_pred`` and split it in bins. 
In each bin the observed mean is compared with the predicted @@ -332,14 +334,12 @@ def _lift_curve(y_true, y_pred, sample_weights=None, n_bins=100): average y_pred for each bin """ idx_sort = np.argsort(y_pred) - bin_centers = np.arange(0, 1, 1/n_bins) + 0.5/n_bins - y_pred_bin = np.zeros(n_bins) y_true_bin = np.zeros(n_bins) - bin_size = len(y_true) // n_bins - for n, sl in enumerate(gen_batches(len(y_true), bin_size)): - weights = sample_weights[idx_sort][sl] + + for n, sl in enumerate(gen_even_slices(len(y_true), n_bins)): + weights = sample_weight[idx_sort][sl] y_pred_bin[n] = np.average( y_pred[idx_sort][sl], weights=weights ) @@ -350,39 +350,44 @@ def _lift_curve(y_true, y_pred, sample_weights=None, n_bins=100): return bin_centers, y_true_bin, y_pred_bin -fig, ax = plt.subplots(1, 3, figsize=(12, 3.2)) +fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.2)) plt.subplots_adjust(wspace=0.3) - -for axi, (label, model, color) in zip(ax, [ - ('Ridge', linregr, 'b'), - ('PoissonRegressor', glm_freq, 'k'), - ('Random Forest', gbr, 'r') +for axi, (label, model) in zip(ax, [ + ('Ridge', ridge), + ('PoissonRegressor', poisson), + ('Random Forest', rf) ]): y_pred = model.predict(df_test) - q, y_true_seg, y_pred_seg = _lift_curve( - df_test.Frequency.values, + q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( + df_test["Frequency"].values, y_pred, - sample_weights=df_test.Exposure.values, - n_bins=10) + sample_weights=df_test["Exposure"].values, + n_bins=5) - axi.plot(q, y_pred_seg, 'o'+color, label="predictions", ms=5) - axi.step(q, y_true_seg, '--'+color, label="observations", - where='mid') + axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") + axi.plot(q, y_true_seg, marker='x', linestyle="--", label="observations") axi.set_xlim(0, 1.0) + axi.set_ylim(0, 0.3) axi.set( title=label, xlabel='Fraction of samples sorted by y_pred', ylabel='Mean Frequency (y_pred)' ) - axi.legend() ############################################################################## # # On the above figure, ``PoissonRegressor`` is the model which presents the -# best consistency between predicted and observed targets, both for low -# and high target values. +# best consistency between predicted and observed targets, both for low and +# high target values. +# +# The ridge regression model tends to predict very low expected frequencies +# that do not match the data. +# +# The random forest regression model also tends to exaggerate low predicted +# frequencies although to a lower extent than ridge. It also tends to +# exaggerate high frequencies on the other hand. From be7bb67e2859b6c74854aeb942a008c3947757e2 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 25 Sep 2019 19:31:51 +0200 Subject: [PATCH 172/269] Add cumulated claims plot --- ...plot_poisson_regression_non_normal_loss.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 5c044d5530bd8..cbfbfc45dd703 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -391,3 +391,48 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100 # The random forest regression model also tends to exaggerate low predicted # frequencies although to a lower extent than ridge. It also tends to # exaggerate high frequencies on the other hand. 
+ + + +def _cumulated_claims(y_true, y_pred, exposure): + idx_sort = np.argsort(y_pred)[::-1] + sorted_exposure = exposure[idx_sort] + sorted_frequencies = y_true[idx_sort] + cumulated_exposure = np.cumsum(sorted_exposure) + cumulated_exposure /= cumulated_exposure[-1] + cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies) + cumulated_claims /= cumulated_claims[-1] + return cumulated_exposure, cumulated_claims + + +fig, ax = plt.subplots(figsize=(8, 8)) +plt.subplots_adjust(wspace=0.3) + +for (label, model) in [ + ('Ridge', ridge), + ('PoissonRegressor', poisson), + ('Random Forest', rf) +]: + y_pred = model.predict(df_test) + cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + y_pred, + df_test["Exposure"].values) + ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) + +# Oracle model +cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + df_test["Frequency"].values, + df_test["Exposure"].values) +ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label="Oracle") + +# Random Baseline +ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline") +ax.set( + title="Cumulated claims by model", + xlabel='Fraction of cumulated exposure (from riskiest to safest)', + ylabel='Fraction of cumulated number of claims' + +) +ax.legend() From 4125c20c9a5d06a0d3ed8241cceb6aaf582b350f Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 26 Sep 2019 09:05:17 +0200 Subject: [PATCH 173/269] Improve the cumulated nb claims plot --- ...plot_poisson_regression_non_normal_loss.py | 44 +++++++++++++++---- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index cbfbfc45dd703..22a4b419cf483 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -263,8 +263,9 @@ def score_estimator(estimator, df_test): # random fluctuations. We can verify that we would also get equivalent # conclusions with cross-validated performance metrics. # -# The qualitative difference between these models can also be visualized by comparing the -# histogram of observed target values with that of predicted values, +# The qualitative difference between these models can also be visualized by +# comparing the histogram of observed target values with that of predicted +# values, fig, axes = plt.subplots(1, 4, figsize=(16, 3)) @@ -306,7 +307,8 @@ def score_estimator(estimator, df_test): # the mean observed target: -def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100): +def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, + n_bins=100): """Compare predictions and observations for bins ordered by y_pred We order the samples by ``y_pred`` and split it in bins. @@ -363,7 +365,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100 q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( df_test["Frequency"].values, y_pred, - sample_weights=df_test["Exposure"].values, + sample_weight=df_test["Exposure"].values, n_bins=5) axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") @@ -391,11 +393,21 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100 # The random forest regression model also tends to exaggerate low predicted # frequencies although to a lower extent than ridge. 
It also tends to # exaggerate high frequencies on the other hand. - +# +# However for some business applications we are not necessarily interested in +# the the ability of the model in predicting the expected frequency value but +# instead in predicting which customer profiles are the riskiest and which are +# the safest. In this case the model evaluation would cast the problem as a +# ranking problem rather than a regression problem. +# +# To compare the 3 models under this light on, one can plot the fraction +# of cumulated number of claims vs the fraction of cumulated of exposure +# for test samples ordered by the model predictions, from riskiest to safest +# according to each model: def _cumulated_claims(y_true, y_pred, exposure): - idx_sort = np.argsort(y_pred)[::-1] + idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest sorted_exposure = exposure[idx_sort] sorted_frequencies = y_true[idx_sort] cumulated_exposure = np.cumsum(sorted_exposure) @@ -420,7 +432,7 @@ def _cumulated_claims(y_true, y_pred, exposure): df_test["Exposure"].values) ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) -# Oracle model +# Oracle model: y_pred == y_test cum_exposure, cum_claims = _cumulated_claims( df_test["Frequency"].values, df_test["Frequency"].values, @@ -433,6 +445,22 @@ def _cumulated_claims(y_true, y_pred, exposure): title="Cumulated claims by model", xlabel='Fraction of cumulated exposure (from riskiest to safest)', ylabel='Fraction of cumulated number of claims' - ) ax.legend() + +############################################################################## +# +# This plot reveals that the random forest model is almost uniformly the best +# at sorting customers by risk profiles even if the absolute value of the +# predicted expected frequencies are less well calibrated than for the linear +# Poisson model. +# +# +# All three models are significantly better than chance but also very far from +# making perfect predictions. +# +# This last point is expected due to the nature of the problem: the occurence +# of accidents is mostly dominated by environmental causes that are not +# captured in the columns of the dataset. 
+ +plt.show() From 0070d527e7b3399cae9a6bd6447b4de636191481 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 26 Sep 2019 10:41:02 +0200 Subject: [PATCH 174/269] Fix wrong xlabel in histogram plot --- .../plot_poisson_regression_non_normal_loss.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 22a4b419cf483..47e08b618758c 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -274,16 +274,15 @@ def score_estimator(estimator, df_test): df_train["Frequency"].hist(bins=np.linspace(-1, 10, n_bins), ax=axes[0]) axes[0].set_title("Data") +axes[0].set_yscale('log') axes[0].set_xlabel("y (observed Frequency)") for idx, model in enumerate([ridge, poisson, rf]): y_pred = model.predict(df_train) pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), ax=axes[idx+1]) - axes[idx + 1].set_title(model[-1].__class__.__name__) - -for axi in axes: - axi.set( + axes[idx + 1].set( + title=model[-1].__class__.__name__, yscale='log', xlabel="y_pred (predicted expected Frequency)" ) From 9d6bb5258ac604bc523beb1c8b85357344a9929e Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 26 Sep 2019 12:03:51 +0200 Subject: [PATCH 175/269] More example improvements (preprocessors + plots) --- ...plot_poisson_regression_non_normal_loss.py | 130 +++++++++--------- 1 file changed, 63 insertions(+), 67 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 47e08b618758c..76f957c57b6da 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -40,6 +40,7 @@ from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.ensemble import RandomForestRegressor from sklearn.utils import gen_even_slices +from sklearn.metrics import auc from sklearn.metrics import mean_squared_error, mean_absolute_error from sklearn.metrics import mean_poisson_deviance @@ -95,26 +96,33 @@ def load_mtpl2(n_samples=100000): df["ClaimNb"] = df["ClaimNb"].clip(upper=4) df["Exposure"] = df["Exposure"].clip(upper=1) -column_trans = ColumnTransformer( +############################################################################## +# +# The remaining columns can be used to predict the frequency of claim events. +# Those columns are very heterogeneous with a mix of categorical and numeric +# variables with different scales, possibly with heavy tails. 
+# +# In order to fit linear models with those predictors it is therefore +# necessary to perform standard feature transformation as follows: + +log_scale_transformer = make_pipeline( + FunctionTransformer(np.log, validate=False), + StandardScaler() +) + +linear_model_preprocessor = ColumnTransformer( [ - ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), - ( - "Veh_Brand_Gas_Region", - OneHotEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"], - ), - ("BonusMalus", "passthrough", ["BonusMalus"]), - ( - "Density_log", - make_pipeline( - FunctionTransformer(np.log, validate=False), StandardScaler() - ), - ["Density"], - ), + ("passthrough_numeric", "passthrough", + ["BonusMalus"]), + ("binned_numeric", KBinsDiscretizer(n_bins=10), + ["VehAge", "DrivAge"]), + ("log_scaled_numeric", log_scale_transformer, + ["Density"]), + ("onehot_categorical", OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), ], remainder="drop", ) -X = column_trans.fit_transform(df) ############################################################################## # @@ -141,12 +149,13 @@ def load_mtpl2(n_samples=100000): # significantly imbalanced. # # To evaluate the pertinence of the used metrics, we will consider as a -# baseline an estimator that returns the mean of the training sample. +# baseline an estimator that constantly predicts the mean frequency of the +# training sample. df_train, df_test = train_test_split(df, random_state=0) dummy = make_pipeline( - column_trans, + linear_model_preprocessor, DummyRegressor(strategy='mean') ) dummy.fit(df_train, df_train["Frequency"], @@ -187,10 +196,7 @@ def score_estimator(estimator, df_test): # We start by modeling the target variable with the least squares linear # regression model, -ridge = make_pipeline( - column_trans, - Ridge(alpha=1.0) -) +ridge = make_pipeline(linear_model_preprocessor, Ridge(alpha=1.0)) ridge.fit(df_train, df_train["Frequency"], ridge__sample_weight=df_train["Exposure"]) @@ -211,7 +217,7 @@ def score_estimator(estimator, df_test): # Next we fit the Poisson regressor on the target variable, poisson = make_pipeline( - column_trans, + linear_model_preprocessor, PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000) ) poisson.fit(df_train, df_train["Frequency"], @@ -229,20 +235,17 @@ def score_estimator(estimator, df_test): # same information is encoded with a small number of features than with # one-hot encoding). 
+rf_preprocessor = ColumnTransformer( + [ + ("categorical", OrdinalEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + ("numeric", "passthrough", + ["VehAge", "DrivAge", "BonusMalus", "Density"]), + ], + remainder="drop", +) rf = make_pipeline( - ColumnTransformer( - [ - ( - "Veh_Brand_Gas_Region", OrdinalEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"], - ), - ( - "Continious", "passthrough", - ["VehAge", "DrivAge", "BonusMalus", "Density"] - ), - ], - remainder="drop", - ), + rf_preprocessor, RandomForestRegressor(min_weight_fraction_leaf=0.01, n_jobs=2) ) rf.fit(df_train, df_train["Frequency"].values, @@ -351,14 +354,10 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, return bin_centers, y_true_bin, y_pred_bin -fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.2)) +fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5)) plt.subplots_adjust(wspace=0.3) -for axi, (label, model) in zip(ax, [ - ('Ridge', ridge), - ('PoissonRegressor', poisson), - ('Random Forest', rf) -]): +for axi, model in zip(ax, [ridge, poisson, rf]): y_pred = model.predict(df_test) q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( @@ -372,19 +371,19 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, axi.set_xlim(0, 1.0) axi.set_ylim(0, 0.3) axi.set( - title=label, + title=model[-1].__class__.__name__, xlabel='Fraction of samples sorted by y_pred', ylabel='Mean Frequency (y_pred)' ) axi.legend() - +plt.tight_layout() ############################################################################## # # On the above figure, ``PoissonRegressor`` is the model which presents the # best consistency between predicted and observed targets, both for low and -# high target values. +# high predicted target values. # # The ridge regression model tends to predict very low expected frequencies # that do not match the data. @@ -393,16 +392,16 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # frequencies although to a lower extent than ridge. It also tends to # exaggerate high frequencies on the other hand. # -# However for some business applications we are not necessarily interested in -# the the ability of the model in predicting the expected frequency value but -# instead in predicting which customer profiles are the riskiest and which are -# the safest. In this case the model evaluation would cast the problem as a -# ranking problem rather than a regression problem. +# However, for some business applications, we are not necessarily interested +# in the the ability of the model in predicting the expected frequency value +# but instead in predicting which policyholder groups are the riskiest and +# which are the safest. In this case the model evaluation would cast the +# problem as a ranking problem rather than a regression problem. 
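Before turning to this ranking view, one quick global sanity check that could be added here (a sketch, not part of the example) is to compare the exposure-weighted mean prediction of each pipeline with the observed mean frequency on the test set; ``ridge``, ``poisson``, ``rf`` and ``df_test`` are the objects already defined above:

import numpy as np

for model in [ridge, poisson, rf]:
    mean_pred = np.average(model.predict(df_test),
                           weights=df_test["Exposure"])
    print(model[-1].__class__.__name__, "weighted mean y_pred:", mean_pred)
print("observed weighted mean Frequency:",
      np.average(df_test["Frequency"], weights=df_test["Exposure"]))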
# -# To compare the 3 models under this light on, one can plot the fraction -# of cumulated number of claims vs the fraction of cumulated of exposure -# for test samples ordered by the model predictions, from riskiest to safest -# according to each model: +# To compare the 3 models under this light on, one can plot the fraction of +# cumulated number of claims vs the fraction of cumulated of exposure for test +# samples ordered by the model predictions, from riskiest to safest according +# to each model: def _cumulated_claims(y_true, y_pred, exposure): @@ -417,18 +416,16 @@ def _cumulated_claims(y_true, y_pred, exposure): fig, ax = plt.subplots(figsize=(8, 8)) -plt.subplots_adjust(wspace=0.3) -for (label, model) in [ - ('Ridge', ridge), - ('PoissonRegressor', poisson), - ('Random Forest', rf) -]: +for model in [ridge, poisson, rf]: y_pred = model.predict(df_test) cum_exposure, cum_claims = _cumulated_claims( df_test["Frequency"].values, y_pred, df_test["Exposure"].values) + area = auc(cum_exposure, cum_claims) + label = "{} (area under curve: {:.3f})".format( + model[-1].__class__.__name__, area) ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test @@ -449,17 +446,16 @@ def _cumulated_claims(y_true, y_pred, exposure): ############################################################################## # -# This plot reveals that the random forest model is almost uniformly the best -# at sorting customers by risk profiles even if the absolute value of the -# predicted expected frequencies are less well calibrated than for the linear -# Poisson model. -# +# This plot reveals that the random forest model is slightly better at ranking +# policyholders by risk profiles even if the absolute value of the predicted +# expected frequencies are less well calibrated than for the linear Poisson +# model. # # All three models are significantly better than chance but also very far from # making perfect predictions. # -# This last point is expected due to the nature of the problem: the occurence -# of accidents is mostly dominated by environmental causes that are not +# This last point is expected due to the nature of the problem: the occurrence +# of accidents is mostly dominated by circumstantial causes that are not # captured in the columns of the dataset. plt.show() From b353b2dcdb25bd2e55dffaacda1b80856f4b5b78 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 26 Sep 2019 13:14:09 +0200 Subject: [PATCH 176/269] Simplify dataset + use more data --- ...plot_poisson_regression_non_normal_loss.py | 99 ++++++++----------- 1 file changed, 39 insertions(+), 60 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 76f957c57b6da..da2a4bd2dccf8 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -9,10 +9,6 @@ number of insurance claims (or frequency) following car accidents for a policyholder given historical data over a population of policyholders. -We start by defining a few helper functions for loading the data and -visualizing results. - - .. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor Third-Party Liability Claims (November 8, 2018). 
`doi:10.2139/ssrn.3164764 `_ @@ -46,7 +42,7 @@ from sklearn.metrics import mean_poisson_deviance -def load_mtpl2(n_samples=100000): +def load_mtpl2(n_samples=None): """Fetcher for French Motor Third-Party Liability Claims dataset Parameters @@ -57,43 +53,27 @@ def load_mtpl2(n_samples=100000): """ # freMTPL2freq dataset from https://www.openml.org/d/41214 - df_freq = fetch_openml(data_id=41214, as_frame=True)['data'] - df_freq['IDpol'] = df_freq['IDpol'].astype(np.int) - df_freq.set_index('IDpol', inplace=True) - - # freMTPL2sev dataset from https://www.openml.org/d/41215 - df_sev = fetch_openml(data_id=41215, as_frame=True)['data'] - - # sum ClaimAmount over identical IDs - df_sev = df_sev.groupby('IDpol').sum() - - df = df_freq.join(df_sev, how="left") - df["ClaimAmount"].fillna(0, inplace=True) + df = fetch_openml(data_id=41214, as_frame=True)['data'] # unquote string fields for column_name in df.columns[df.dtypes.values == np.object]: df[column_name] = df[column_name].str.strip("'") - return df.iloc[:n_samples] + if n_samples is not None: + return df.iloc[:n_samples] + return df ############################################################################## # -# 1. Loading datasets and pre-processing -# -------------------------------------- +# Let's load the motor claim dataset. We ignore the severity data for this +# study for the sake of simplicitly. # -# We construct the freMTPL2 dataset by joining the freMTPL2freq table, -# containing the number of claims (``ClaimNb``) with the freMTPL2sev table -# containing the claim amount (``ClaimAmount``) for the same policy ids -# (``IDpol``). - -df = load_mtpl2(n_samples=50000) +# We also subsample the data for the sake of computational cost and running +# time. Using the full dataset would lead to similar conclusions. -# Note: filter out claims with zero amount, as the severity model -# requires strictly positive target values. -df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 +df = load_mtpl2(n_samples=300000) -# correct for unreasonable observations (that might be data error) -df["ClaimNb"] = df["ClaimNb"].clip(upper=4) +# Correct for unreasonable observations (that might be data error) df["Exposure"] = df["Exposure"].clip(upper=1) ############################################################################## @@ -133,14 +113,14 @@ def load_mtpl2(n_samples=100000): # ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, # and use ``Exposure`` as `sample_weight`. -df["Frequency"] = df.ClaimNb / df.Exposure +df["Frequency"] = df["ClaimNb"] / df["Exposure"] print( - pd.cut(df.Frequency, [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() + pd.cut(df["Frequency"], [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts() ) print("Average Frequency = {}" - .format(np.average(df.Frequency, weights=df.Exposure))) + .format(np.average(df["Frequency"], weights=df["Exposure"]))) ############################################################################## # @@ -262,13 +242,13 @@ def score_estimator(estimator, df_test): # However because of a higher predictive power it also results in a smaller # Poisson deviance than the Poisson regression model. # -# Not that Evaluating models with a single train / test split is prone to -# random fluctuations. We can verify that we would also get equivalent -# conclusions with cross-validated performance metrics. +# Evaluating models with a single train / test split is prone to random +# fluctuations. 
If computation resources allow, it should be verified that +# cross-validated performance metrics would lead to similar conclusions. # # The qualitative difference between these models can also be visualized by # comparing the histogram of observed target values with that of predicted -# values, +# values: fig, axes = plt.subplots(1, 4, figsize=(16, 3)) @@ -293,8 +273,8 @@ def score_estimator(estimator, df_test): ############################################################################## # # The experimental data presents a long tail distribution for ``y``. In all -# models we predict the mean expected value, so we will have necessairily -# fewer extreme values. Additionally normal distribution used in ``Ridge`` and +# models we predict the mean expected value, so we will have necessarily fewer +# extreme values. Additionally normal distribution used in ``Ridge`` and # ``RandomForestRegressor`` has a constant variance, while for the Poisson # distribution used in ``PoissonRegressor``, the variance is proportional to # the mean predicted value. @@ -364,12 +344,12 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, df_test["Frequency"].values, y_pred, sample_weight=df_test["Exposure"].values, - n_bins=5) + n_bins=10) axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions") axi.plot(q, y_true_seg, marker='x', linestyle="--", label="observations") axi.set_xlim(0, 1.0) - axi.set_ylim(0, 0.3) + axi.set_ylim(0, 0.6) axi.set( title=model[-1].__class__.__name__, xlabel='Fraction of samples sorted by y_pred', @@ -381,16 +361,13 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, ############################################################################## # -# On the above figure, ``PoissonRegressor`` is the model which presents the -# best consistency between predicted and observed targets, both for low and -# high predicted target values. -# -# The ridge regression model tends to predict very low expected frequencies -# that do not match the data. +# The ``Ridge`` regression model can predict very low expected frequencies +# that do not match the data. It can therefore severly under-estimate the risk +# for some policyholders. # -# The random forest regression model also tends to exaggerate low predicted -# frequencies although to a lower extent than ridge. It also tends to -# exaggerate high frequencies on the other hand. +# ``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency +# between predicted and observed targets, especially for low predicted target +# values. # # However, for some business applications, we are not necessarily interested # in the the ability of the model in predicting the expected frequency value @@ -399,9 +376,8 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # problem as a ranking problem rather than a regression problem. 
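A crude way to quantify this ranking ability directly (not used in the example, and of limited value here because most policies have zero observed claims, which produces many ties) would be a rank correlation between predictions and observed frequencies; the cumulative plot introduced next gives a much richer picture:

from scipy.stats import spearmanr

for model in [ridge, poisson, rf]:
    rho, _ = spearmanr(model.predict(df_test), df_test["Frequency"])
    print("%s: Spearman rho = %.4f" % (model[-1].__class__.__name__, rho))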
# # To compare the 3 models under this light on, one can plot the fraction of -# cumulated number of claims vs the fraction of cumulated of exposure for test -# samples ordered by the model predictions, from riskiest to safest according -# to each model: +# the number of claims vs the fraction of exposure for test samples ordered by +# the model predictions, from riskiest to safest according to each model: def _cumulated_claims(y_true, y_pred, exposure): @@ -433,16 +409,19 @@ def _cumulated_claims(y_true, y_pred, exposure): df_test["Frequency"].values, df_test["Frequency"].values, df_test["Exposure"].values) -ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label="Oracle") +area = auc(cum_exposure, cum_claims) +label = "Oracle (area under curve: {:.3f})".format(area) +ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) # Random Baseline -ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline") +ax.plot([0, 1], [0, 1], linestyle="--", color="black", + label="Random baseline") ax.set( - title="Cumulated claims by model", - xlabel='Fraction of cumulated exposure (from riskiest to safest)', - ylabel='Fraction of cumulated number of claims' + title="Cumulated number of claims by model", + xlabel='Fraction of exposure (from riskiest to safest)', + ylabel='Fraction of number of claims' ) -ax.legend() +ax.legend(loc="lower right") ############################################################################## # From 88757fdb99cc516be230fe08ec1ebfb7bea0b694 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 26 Sep 2019 15:59:36 +0200 Subject: [PATCH 177/269] Remove solver parameter from {Poisson,Gamma,Tweedie}Regression --- sklearn/linear_model/_glm/glm.py | 54 +++++++++++--------------------- sklearn/neighbors/base.py | 8 ++--- 2 files changed, 22 insertions(+), 40 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index d6da8b8b80949..f7985c0f3bae3 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -80,7 +80,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Calls scipy's L-BFGS-B optimizer. max_iter : int, optional (default=100) - The maximal number of iterations for solver algorithms. + The maximal number of iterations for the solver. tol : float, optional (default=1e-4) Stopping criterion. For the lbfgs solver, @@ -113,7 +113,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Intercept (a.k.a. bias) added to linear predictor. n_iter_ : int - Actual number of iterations used in solver. + Actual number of iterations used in the solver. """ def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', @@ -423,14 +423,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - solver : {'lbfgs'}, optional (default='lbfgs') - Algorithm to use in the optimization problem: - - 'lbfgs' - Calls scipy's L-BFGS-B optimizer. - max_iter : int, optional (default=100) - The maximal number of iterations for solver algorithms. + The maximal number of iterations for the solver. tol : float, optional (default=1e-4) Stopping criterion. For the lbfgs solver, @@ -458,16 +452,16 @@ class PoissonRegressor(GeneralizedLinearRegressor): Intercept (a.k.a. bias) added to linear predictor. n_iter_ : int - Actual number of iterations used in solver. + Actual number of iterations used in the solver. 
""" def __init__(self, alpha=1.0, fit_intercept=True, - solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, + max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="poisson", link='log', - solver=solver, max_iter=max_iter, tol=tol, - warm_start=warm_start, copy_X=copy_X, verbose=verbose) + family="poisson", link='log', max_iter=max_iter, + tol=tol, warm_start=warm_start, copy_X=copy_X, + verbose=verbose) @property def family(self): @@ -511,14 +505,8 @@ class GammaRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - solver : {'lbfgs'}, optional (default='lbfgs') - Algorithm to use in the optimization problem: - - 'lbfgs' - Calls scipy's L-BFGS-B optimizer. - max_iter : int, optional (default=100) - The maximal number of iterations for solver algorithms. + The maximal number of iterations for the solver. tol : float, optional (default=1e-4) Stopping criterion. For the lbfgs solver, @@ -546,16 +534,16 @@ class GammaRegressor(GeneralizedLinearRegressor): Intercept (a.k.a. bias) added to linear predictor. n_iter_ : int - Actual number of iterations used in solver. + Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, solver='lbfgs', + def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, - family="gamma", link='log', - solver=solver, max_iter=max_iter, tol=tol, - warm_start=warm_start, copy_X=copy_X, verbose=verbose) + family="gamma", link='log', max_iter=max_iter, + tol=tol, warm_start=warm_start, copy_X=copy_X, + verbose=verbose) @property def family(self): @@ -632,14 +620,8 @@ class TweedieRegressor(GeneralizedLinearRegressor): Specifies if a constant (a.k.a. bias or intercept) should be added to the linear predictor (X*coef+intercept). - solver : {'lbfgs'}, optional (default='lbfgs') - Algorithm to use in the optimization problem: - - 'lbfgs' - Calls scipy's L-BFGS-B optimizer. - max_iter : int, optional (default=100) - The maximal number of iterations for solver algorithms. + The maximal number of iterations for the solver. tol : float, optional (default=1e-4) Stopping criterion. For the lbfgs solver, @@ -666,15 +648,15 @@ class TweedieRegressor(GeneralizedLinearRegressor): Intercept (a.k.a. bias) added to linear predictor. n_iter_ : int - Actual number of iterations used in solver. + Actual number of iterations used in the solver. 
""" def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, - link='auto', solver='lbfgs', max_iter=100, tol=1e-4, + link='auto', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, check_input=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, - solver=solver, max_iter=max_iter, tol=tol, + max_iter=max_iter, tol=tol, warm_start=warm_start, copy_X=copy_X, verbose=verbose) @property diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 9548a619b0b14..d178d607d3636 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -289,13 +289,13 @@ def _pairwise(self): return self.metric == 'precomputed' -def _tree_query_parallel_helper(tree, data, n_neighbors, return_distance): +def _tree_query_parallel_helper(tree, data, n_neighbors, return_distance, **kwargs): """Helper for the Parallel calls in KNeighborsMixin.kneighbors The Cython method tree.query is not directly picklable by cloudpickle under PyPy. """ - return tree.query(data, n_neighbors, return_distance) + return tree.query(data, n_neighbors, return_distance, **kwargs) class KNeighborsMixin: @@ -336,7 +336,7 @@ def _kneighbors_reduce_func(self, dist, start, result = neigh_ind return result - def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, **kwargs): """Finds the K-neighbors of a point. Returns indices of and distances to the neighbors of each point. @@ -458,7 +458,7 @@ class from an array representing our data set and ask who's parallel_kwargs = {"prefer": "threads"} result = Parallel(n_jobs, **parallel_kwargs)( delayed_query( - self._tree, X[s], n_neighbors, return_distance) + self._tree, X[s], n_neighbors, return_distance, **kwargs) for s in gen_even_slices(X.shape[0], n_jobs) ) else: From 6d119d43577e46f1a0dde29980df20434a920739 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 26 Sep 2019 16:02:15 +0200 Subject: [PATCH 178/269] Revert some accidental changes from 88757fdb99cc516be230fe08ec1ebfb7bea0b694. --- sklearn/neighbors/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index d178d607d3636..9548a619b0b14 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -289,13 +289,13 @@ def _pairwise(self): return self.metric == 'precomputed' -def _tree_query_parallel_helper(tree, data, n_neighbors, return_distance, **kwargs): +def _tree_query_parallel_helper(tree, data, n_neighbors, return_distance): """Helper for the Parallel calls in KNeighborsMixin.kneighbors The Cython method tree.query is not directly picklable by cloudpickle under PyPy. """ - return tree.query(data, n_neighbors, return_distance, **kwargs) + return tree.query(data, n_neighbors, return_distance) class KNeighborsMixin: @@ -336,7 +336,7 @@ def _kneighbors_reduce_func(self, dist, start, result = neigh_ind return result - def kneighbors(self, X=None, n_neighbors=None, return_distance=True, **kwargs): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): """Finds the K-neighbors of a point. Returns indices of and distances to the neighbors of each point. 
@@ -458,7 +458,7 @@ class from an array representing our data set and ask who's parallel_kwargs = {"prefer": "threads"} result = Parallel(n_jobs, **parallel_kwargs)( delayed_query( - self._tree, X[s], n_neighbors, return_distance, **kwargs) + self._tree, X[s], n_neighbors, return_distance) for s in gen_even_slices(X.shape[0], n_jobs) ) else: From b735eb786ca9ef03a9bbfc25851e62a4fbb71f3b Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 26 Sep 2019 16:23:12 +0200 Subject: [PATCH 179/269] Additional comment about the use of properties with setters --- sklearn/linear_model/_glm/distribution.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/linear_model/_glm/distribution.py index a5e42bcee5d1c..5f9e9ed06847c 100644 --- a/sklearn/linear_model/_glm/distribution.py +++ b/sklearn/linear_model/_glm/distribution.py @@ -245,6 +245,9 @@ def power(self): @power.setter def power(self, power): + # We use a property with a setter, to update lower and + # upper bound when the power parameter is updated e.g. in grid + # search. if not isinstance(power, numbers.Real): raise TypeError('power must be a real number, input was {0}' .format(power)) From 2d911143067a9ab0fb206cbbb0b13fa228955969 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 26 Sep 2019 16:43:13 +0200 Subject: [PATCH 180/269] Add additional tests for link derivatives --- sklearn/linear_model/_glm/tests/test_link.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py index 36219e09b58e3..27ec4ed19bdc2 100644 --- a/sklearn/linear_model/_glm/tests/test_link.py +++ b/sklearn/linear_model/_glm/tests/test_link.py @@ -4,6 +4,7 @@ import numpy as np from numpy.testing import assert_allclose import pytest +from scipy.optimize import check_grad from sklearn.linear_model._glm.link import ( IdentityLink, @@ -15,12 +16,12 @@ LINK_FUNCTIONS = [IdentityLink, LogLink, LogitLink] -@pytest.mark.parametrize('link', LINK_FUNCTIONS) -def test_link_properties(link): +@pytest.mark.parametrize('Link', LINK_FUNCTIONS) +def test_link_properties(Link): """Test link inverse and derivative.""" rng = np.random.RandomState(42) x = rng.rand(100) * 100 - link = link() # instantiate object + link = Link() if isinstance(link, LogitLink): # careful for large x, note expit(36) = 1 # limit max eta to 15 @@ -30,3 +31,15 @@ def test_link_properties(link): # g = link, h = link.inverse assert_allclose(link.derivative(link.inverse(x)), 1 / link.inverse_derivative(x)) + + +@pytest.mark.parametrize('Link', LINK_FUNCTIONS) +def test_link_derivative(Link): + link = Link() + x = np.random.RandomState(0).rand(1) + err = check_grad(link, link.derivative, x) / link.derivative(x) + assert abs(err) < 1e-6 + + err = (check_grad(link.inverse, link.inverse_derivative, x) + / link.derivative(x)) + assert abs(err) < 1e-6 From 89103bc417646864a1aa85c616a3148cb26ac2ed Mon Sep 17 00:00:00 2001 From: Alexandre Gramfort Date: Sun, 29 Sep 2019 23:06:18 +0200 Subject: [PATCH 181/269] cosmits + typos --- doc/modules/linear_model.rst | 4 +-- ...plot_poisson_regression_non_normal_loss.py | 36 +++++++++---------- ...lot_tweedie_regression_insurance_claims.py | 6 ++-- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index a9b2e66599537..e53f309076b3b 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -996,8 +996,8 @@ 
of the unit variance function: weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Consider :math:`\bar{y} = \frac{\\sum_i s_i y_i}{\sum_i s_i}`, - in this case one might say that y has a 'scaled' Poisson distributions. + Considering :math:`\bar{y} = \frac{\\sum_i s_i y_i}{\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distribution. The same holds for other distributions. The estimator can be used as follows:: diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index da2a4bd2dccf8..d99654cf04080 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -129,8 +129,8 @@ def load_mtpl2(n_samples=None): # significantly imbalanced. # # To evaluate the pertinence of the used metrics, we will consider as a -# baseline an estimator that constantly predicts the mean frequency of the -# training sample. +# baseline a "dummy" estimator that constantly predicts the mean frequency of +# the training sample. df_train, df_test = train_test_split(df, random_state=0) @@ -143,16 +143,16 @@ def load_mtpl2(n_samples=None): def score_estimator(estimator, df_test): - """Score an estimatr on the test set""" + """Score an estimator on the test set""" y_pred = estimator.predict(df_test) - print("MSE: %.3f" % mean_squared_error( - df_test["Frequency"], y_pred, - df_test["Exposure"])) - print("MAE: %.3f" % mean_absolute_error( - df_test["Frequency"], y_pred, - df_test["Exposure"])) + print("MSE: %.3f" % + mean_squared_error(df_test["Frequency"], y_pred, + df_test["Exposure"])) + print("MAE: %.3f" % + mean_absolute_error(df_test["Frequency"], y_pred, + df_test["Exposure"])) # ignore negative predictions, as they are invalid for # the Poisson deviance @@ -160,12 +160,12 @@ def score_estimator(estimator, df_test): if (~mask).any(): warnings.warn("estimator yields negative predictions for {} samples " "out of {}. These will be ignored while computing the " - "poisson deviance".format((~mask).sum(), mask.shape[0])) + "Poisson deviance".format((~mask).sum(), mask.shape[0])) - print("mean Poisson deviance: %.3f" % mean_poisson_deviance( - df_test["Frequency"][mask], - y_pred[mask], - df_test["Exposure"][mask])) + print("mean Poisson deviance: %.3f" % + mean_poisson_deviance(df_test["Frequency"][mask], + y_pred[mask], + df_test["Exposure"][mask])) print("Constant mean frequency evaluation:") @@ -285,8 +285,8 @@ def score_estimator(estimator, df_test): # # To ensure that estimators yield reasonable predictions for different # policyholder types, we can bin test samples according to `y_pred` returned -# by each model. Then for each bin, compare the mean predicted `y_pred`, with -# the mean observed target: +# by each model. 
Then for each bin, we compare the mean predicted `y_pred`, +# with the mean observed target: def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, @@ -325,7 +325,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, for n, sl in enumerate(gen_even_slices(len(y_true), n_bins)): weights = sample_weight[idx_sort][sl] y_pred_bin[n] = np.average( - y_pred[idx_sort][sl], weights=weights + y_pred[idx_sort][sl], weights=weights ) y_true_bin[n] = np.average( y_true[idx_sort][sl], @@ -337,7 +337,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5)) plt.subplots_adjust(wspace=0.3) -for axi, model in zip(ax, [ridge, poisson, rf]): +for axi, model in zip(ax, [ridge, poisson, rf]): y_pred = model.predict(df_test) q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group( diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 5a312f656d9ce..1c50541fcd85c 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -98,7 +98,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, predicted : frame a dataframe, with the same index as df, with the predicted target fill_legend : bool, default=False - wgether to show fill_between legend + whether to show fill_between legend """ # aggregate observed and predicted variables by feature level df_ = df.loc[:, [feature, weight]].copy() @@ -219,8 +219,8 @@ def score_estimator( # Score the model consisting of the product of frequency and # severity models, denormalized by the exposure values. est_freq, est_sev = estimator - y_pred = (df.Exposure.values * est_freq.predict(X) - * est_sev.predict(X)) + y_pred = (df.Exposure.values * est_freq.predict(X) * + est_sev.predict(X)) power = 1.5 else: y_pred = estimator.predict(X) From 4f28a44bff0a7d84de13f66e0d2b5264897f6c95 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 30 Sep 2019 11:20:25 +0200 Subject: [PATCH 182/269] Address some of Alex's comments - Link -> BaseLink - Removed reference to none existing notes - Use X.dtype for dtype of y - remove check_input --- sklearn/linear_model/_glm/glm.py | 51 ++++++++------------- sklearn/linear_model/_glm/link.py | 8 ++-- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- 3 files changed, 25 insertions(+), 36 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index f7985c0f3bae3..819e36e13addf 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -21,7 +21,7 @@ EDM_DISTRIBUTIONS ) from .link import ( - Link, + BaseLink, IdentityLink, LogLink, ) @@ -47,10 +47,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): ---------- alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the - regularization strength. - See the notes for the exact mathematical meaning of this - parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this - case, the design matrix X must have full column rank + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix X must have full column rank (no collinearities). 
fit_intercept : boolean, optional (default=True) @@ -63,7 +61,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The distributional assumption of the GLM, i.e. which distribution from the EDM, specifies the loss function to be minimized. - link : {'auto', 'identity', 'log'} or an instance of class Link, \ + link : {'auto', 'identity', 'log'} or an instance of class BaseLink, \ optional (default='auto') The link function of the GLM, i.e. mapping from linear predictor (X*coef) to expectation (y_pred). Option 'auto' sets the link @@ -167,8 +165,8 @@ def fit(self, X, y, sample_weight=None): "; got (family={0})".format(self.family)) # Guarantee that self._link_instance is set to an instance of - # class Link - if isinstance(self.link, Link): + # class BaseLink + if isinstance(self.link, BaseLink): self._link_instance = self.link else: if self.link == 'auto': @@ -227,7 +225,7 @@ def fit(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], dtype=[np.float64, np.float32], y_numeric=True, multi_output=False, copy=self.copy_X) - y = np.asarray(y, dtype=np.float64) + y = np.asarray(y, dtype=X.dtype) weights = _check_sample_weight(sample_weight, X) @@ -247,8 +245,7 @@ def fit(self, X, y, sample_weight=None): # deviance = sum(sample_weight * unit_deviance), # we rescale weights such that sum(weights) = 1 and this becomes # 1/2*deviance + L2 with deviance=sum(weights * unit_deviance) - weights_sum = np.sum(weights) - weights = weights / weights_sum + weights = weights / weights.sum() if self.warm_start and hasattr(self, 'coef_'): if self.fit_intercept: @@ -318,7 +315,7 @@ def _linear_predictor(self, X): """ check_is_fitted(self) X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], - dtype='numeric', ensure_2d=True, + dtype=[np.float64, np.float32], ensure_2d=True, allow_nd=False) return X @ self.coef_ + self.intercept_ @@ -413,10 +410,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): ---------- alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the - regularization strength. - See the notes for the exact mathematical meaning of this - parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this - case, the design matrix X must have full column rank + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix X must have full column rank (no collinearities). fit_intercept : boolean, optional (default=True) @@ -454,9 +449,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, - max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link='log', max_iter=max_iter, @@ -495,10 +489,8 @@ class GammaRegressor(GeneralizedLinearRegressor): ---------- alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the - regularization strength. - See the notes for the exact mathematical meaning of this - parameter. ``alpha = 0`` is equivalent to unpenalized GLMs. In this - case, the design matrix X must have full column rank + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. 
In this case, the design matrix X must have full column rank (no collinearities). fit_intercept : boolean, optional (default=True) @@ -536,9 +528,8 @@ class GammaRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, - max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, + warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="gamma", link='log', max_iter=max_iter, @@ -601,10 +592,8 @@ class TweedieRegressor(GeneralizedLinearRegressor): alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the - regularization strength. - See the notes for the exact mathematical meaning of this - parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this - case, the design matrix X must have full column rank + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix X must have full column rank (no collinearities). link : {'auto', 'identity', 'log'}, default='auto' @@ -652,7 +641,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): """ def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, link='auto', max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, check_input=True, verbose=0): + warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index cfdc6f181a832..7c404798b2c18 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -11,7 +11,7 @@ from scipy.special import expit, logit -class Link(metaclass=ABCMeta): +class BaseLink(metaclass=ABCMeta): """Abstract base class for Link functions.""" @abstractmethod @@ -65,7 +65,7 @@ def inverse_derivative(self, lin_pred): pass # pragma: no cover -class IdentityLink(Link): +class IdentityLink(BaseLink): """The identity link function g(x)=x.""" def __call__(self, y_pred): @@ -81,7 +81,7 @@ def inverse_derivative(self, lin_pred): return np.ones_like(lin_pred) -class LogLink(Link): +class LogLink(BaseLink): """The log link function g(x)=log(x).""" def __call__(self, y_pred): @@ -97,7 +97,7 @@ def inverse_derivative(self, lin_pred): return np.exp(lin_pred) -class LogitLink(Link): +class LogitLink(BaseLink): """The logit link function g(x)=logit(x).""" def __call__(self, y_pred): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 898d3c4edf9c0..030cdf8a9b141 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -338,7 +338,7 @@ def test_tweedie_regression_family(regression_data): with pytest.raises(TypeError, match=msg): est.family = None - # TODO: the following should not be allowed + # XXX: following is currently allowed, but maybe it shouldn't be # est.family.power = 2 From d4dfd0b13c9bd3ce7fae0d7ed2b0ad40411f46cc Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 30 Sep 2019 12:10:01 +0200 Subject: [PATCH 183/269] Removing unnecessary comments / asarray call --- sklearn/linear_model/_glm/glm.py | 1 - sklearn/linear_model/_glm/tests/test_glm.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py 
b/sklearn/linear_model/_glm/glm.py index 819e36e13addf..65de7f9532717 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -225,7 +225,6 @@ def fit(self, X, y, sample_weight=None): X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], dtype=[np.float64, np.float32], y_numeric=True, multi_output=False, copy=self.copy_X) - y = np.asarray(y, dtype=X.dtype) weights = _check_sample_weight(sample_weight, X) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 030cdf8a9b141..ef49fabdee4b1 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -338,9 +338,6 @@ def test_tweedie_regression_family(regression_data): with pytest.raises(TypeError, match=msg): est.family = None - # XXX: following is currently allowed, but maybe it shouldn't be - # est.family.power = 2 - @pytest.mark.parametrize( 'estimator, value', From 64d6fbd0dd99d66789c3043558f80883a974e99e Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 10:57:18 +0200 Subject: [PATCH 184/269] Update doc/modules/linear_model.rst Co-Authored-By: Nicolas Hug --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index e53f309076b3b..8945235279d9d 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -923,7 +923,7 @@ likelihood as \left( \log p(y|\mu,\phi) - \log p(y|y,\phi)\right). -The following table lists some specific EDM distributions—all are Tweedie +The following table lists some specific EDM distributions—all are instances of Tweedie distributions—and some of their properties. ================= =============================== ====================================== ============================================ From 82ace9f399c83d7f023f9dafcdada88540c0fd25 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 11:14:34 +0200 Subject: [PATCH 185/269] Remove unused solver parameter in tests --- sklearn/linear_model/_glm/tests/test_glm.py | 30 +++++++-------------- 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index ef49fabdee4b1..a5df69b50c967 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -26,8 +26,6 @@ from sklearn.exceptions import ConvergenceWarning from sklearn.model_selection import train_test_split -GLM_SOLVERS = ['lbfgs'] - @pytest.fixture(scope="module") def regression_data(): @@ -176,14 +174,13 @@ def test_glm_check_input_argument(check_input): glm.fit(X, y) -@pytest.mark.parametrize('solver', GLM_SOLVERS) -def test_glm_identity_regression(solver): +def test_glm_identity_regression(): """Test GLM regression with identity link on a simple dataset.""" coef = [1., 2.] 
X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', - fit_intercept=False, solver=solver) + fit_intercept=False) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=1e-6) @@ -193,15 +190,14 @@ def test_glm_identity_regression(solver): [NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]) -@pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-6)]) -def test_glm_log_regression(family, solver, tol): +def test_glm_log_regression(family): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( alpha=0, family=family, link='log', fit_intercept=False, - solver=solver, tol=tol) + tol=1e-6) res = glm.fit(X, y) assert_allclose(res.coef_, coef, rtol=5e-6) @@ -239,8 +235,7 @@ def test_warm_start(fit_intercept): @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) @pytest.mark.parametrize('fit_intercept', [True, False]) -@pytest.mark.parametrize('solver', GLM_SOLVERS) -def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): +def test_normal_ridge_comparison(n_samples, n_features, fit_intercept): """Compare with Ridge regression for Normal distributions.""" alpha = 1.0 test_size = 10 @@ -264,8 +259,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', link='identity', fit_intercept=True, - solver=solver, check_input=False, - max_iter=300) + check_input=False, max_iter=300) glm.fit(X_train, y_train) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, atol=5e-5) @@ -274,8 +268,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver): assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=5e-5) -@pytest.mark.parametrize('solver, tol', [('lbfgs', 1e-7)]) -def test_poisson_glmnet(solver, tol): +def test_poisson_glmnet(): """Compare Poisson regression with L2 regularization and LogLink to glmnet """ # library("glmnet") @@ -294,19 +287,16 @@ def test_poisson_glmnet(solver, tol): glm = GeneralizedLinearRegressor(alpha=1, fit_intercept=True, family='poisson', link='log', tol=1e-7, - solver=solver, max_iter=300, - ) + max_iter=300) glm.fit(X, y) assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5) assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5) -@pytest.mark.parametrize("solver", GLM_SOLVERS) -def test_convergence_warning(solver, regression_data): +def test_convergence_warning(regression_data): X, y = regression_data - est = GeneralizedLinearRegressor(solver=solver, - max_iter=1, tol=1e-20) + est = GeneralizedLinearRegressor(max_iter=1, tol=1e-20) with pytest.warns(ConvergenceWarning): est.fit(X, y) From 5288a0ff156c254df7e465971127685f5532fbf3 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 12:13:35 +0200 Subject: [PATCH 186/269] Add test for sample_weight consistency --- sklearn/linear_model/_glm/tests/test_glm.py | 36 +++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index a5df69b50c967..2a54b759011e7 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ 
b/sklearn/linear_model/_glm/tests/test_glm.py @@ -181,8 +181,40 @@ def test_glm_identity_regression(): y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', fit_intercept=False) - res = glm.fit(X, y) - assert_allclose(res.coef_, coef, rtol=1e-6) + glm.fit(X, y) + assert_allclose(glm.coef_, coef, rtol=1e-6) + + +def test_glm_sample_weight_consistentcy(): + """Test that the impact of sample_weight is consistent""" + rng = np.random.RandomState(0) + n_samples, n_features = 10, 5 + + X = rng.rand(n_samples, n_features) + y = rng.rand(n_samples) + glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', + fit_intercept=False) + glm.fit(X, y) + coef = glm.coef_.copy() + + # sample_weight=np.ones(..) should be equivalent to sample_weight=None + sample_weight = np.ones(y.shape) + glm.fit(X, y, sample_weight=sample_weight) + assert_allclose(glm.coef_, coef, rtol=1e-6) + + # sample_weight are normalized to 1 so, scaling them has no effect + sample_weight = 2*np.ones(y.shape) + glm.fit(X, y, sample_weight=sample_weight) + assert_allclose(glm.coef_, coef, rtol=1e-6) + + # setting one element of sample_weight to 0 is equivalent to removing + # the correspoding sample + sample_weight = np.ones(y.shape) + sample_weight[-1] = 0 + glm.fit(X, y, sample_weight=sample_weight) + coef1 = glm.coef_.copy() + glm.fit(X[:-1], y[:-1]) + assert_allclose(glm.coef_, coef1, rtol=1e-6) @pytest.mark.parametrize( From 499e8d244b5f6bf3ad7856dace4338b1cf5d31e1 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 14:41:47 +0200 Subject: [PATCH 187/269] Move GLM losses under sklearn._loss.glm_distribution --- sklearn/_loss/__init__.py | 0 .../_glm/distribution.py => _loss/glm_distribution.py} | 0 sklearn/_loss/tests/__init__.py | 0 .../tests/test_glm_distribution.py} | 2 +- sklearn/linear_model/_glm/glm.py | 2 +- sklearn/linear_model/_glm/tests/test_glm.py | 2 +- sklearn/metrics/regression.py | 2 +- 7 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 sklearn/_loss/__init__.py rename sklearn/{linear_model/_glm/distribution.py => _loss/glm_distribution.py} (100%) create mode 100644 sklearn/_loss/tests/__init__.py rename sklearn/{linear_model/_glm/tests/test_distribution.py => _loss/tests/test_glm_distribution.py} (98%) diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/linear_model/_glm/distribution.py b/sklearn/_loss/glm_distribution.py similarity index 100% rename from sklearn/linear_model/_glm/distribution.py rename to sklearn/_loss/glm_distribution.py diff --git a/sklearn/_loss/tests/__init__.py b/sklearn/_loss/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/linear_model/_glm/tests/test_distribution.py b/sklearn/_loss/tests/test_glm_distribution.py similarity index 98% rename from sklearn/linear_model/_glm/tests/test_distribution.py rename to sklearn/_loss/tests/test_glm_distribution.py index 97c3a485ef4bb..cb4c5ae07e4d1 100644 --- a/sklearn/linear_model/_glm/tests/test_distribution.py +++ b/sklearn/_loss/tests/test_glm_distribution.py @@ -9,7 +9,7 @@ from scipy.optimize import check_grad import pytest -from sklearn.linear_model._glm.distribution import ( +from sklearn._loss.glm_distribution import ( TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 
65de7f9532717..360db6e4f741e 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -15,7 +15,7 @@ from ...utils import check_array, check_X_y from ...utils.optimize import _check_optimize_result from ...utils.validation import check_is_fitted, _check_sample_weight -from .distribution import ( +from ..._loss.glm_distribution import ( ExponentialDispersionModel, TweedieDistribution, EDM_DISTRIBUTIONS diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 2a54b759011e7..9e21ae7775cf4 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -17,7 +17,7 @@ IdentityLink, LogLink, ) -from sklearn.linear_model._glm.distribution import ( +from sklearn._loss.glm_distribution import ( TweedieDistribution, NormalDistribution, PoissonDistribution, GammaDistribution, InverseGaussianDistribution, diff --git a/sklearn/metrics/regression.py b/sklearn/metrics/regression.py index 706c484334d21..f7ef99794727b 100644 --- a/sklearn/metrics/regression.py +++ b/sklearn/metrics/regression.py @@ -25,6 +25,7 @@ import numpy as np import warnings +from .._loss.glm_distribution import TweedieDistribution from ..utils.validation import (check_array, check_consistent_length, _num_samples) from ..utils.validation import column_or_1d @@ -672,7 +673,6 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0): >>> mean_tweedie_deviance(y_true, y_pred, power=1) 1.4260... """ - from ..linear_model._glm.distribution import TweedieDistribution y_type, y_true, y_pred, _ = _check_reg_targets( y_true, y_pred, None, dtype=[np.float64, np.float32]) if y_type == 'continuous-multioutput': From f4aa839da1f0226ff8ed98adb22d6e90446d7120 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 15:21:42 +0200 Subject: [PATCH 188/269] Update sklearn/linear_model/_glm/glm.py Co-Authored-By: Nicolas Hug --- sklearn/linear_model/_glm/glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 360db6e4f741e..dd6f847895434 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -571,7 +571,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): mean (:math:`\y_\textrm{pred}`): :math:`v(\y_\textrm{pred}) = \y_\textrm{pred}^{power}`. 
- For ``0 Date: Thu, 3 Oct 2019 14:55:48 +0200 Subject: [PATCH 189/269] Add missing config.add_subpackage in setup.py --- sklearn/linear_model/setup.py | 1 + sklearn/setup.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/sklearn/linear_model/setup.py b/sklearn/linear_model/setup.py index 5cf7040d4c9d4..e50a30eca73da 100644 --- a/sklearn/linear_model/setup.py +++ b/sklearn/linear_model/setup.py @@ -43,6 +43,7 @@ def configuration(parent_package='', top_path=None): # add other directories config.add_subpackage('tests') config.add_subpackage('_glm') + config.add_subpackage('_glm/tests') return config diff --git a/sklearn/setup.py b/sklearn/setup.py index 53f6d3f6eb30c..3913965a375cd 100644 --- a/sklearn/setup.py +++ b/sklearn/setup.py @@ -47,6 +47,8 @@ def configuration(parent_package='', top_path=None): config.add_subpackage('experimental/tests') config.add_subpackage('ensemble/_hist_gradient_boosting') config.add_subpackage('ensemble/_hist_gradient_boosting/tests') + config.add_subpackage('_loss/') + config.add_subpackage('_loss/tests') # submodules which have their own setup.py config.add_subpackage('cluster') From d71fb9f8fb54608124f4947931ecf1cbaaba425f Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 15:28:50 +0200 Subject: [PATCH 190/269] Address Nicolas comments in the documentation (partial) --- doc/modules/linear_model.rst | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 8945235279d9d..6667057dc5073 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -907,13 +907,13 @@ Generalized Linear Models (GLM) extend linear models in two ways combination of the input variables :math:`X` via an inverse link function :math:`h` as -.. math:: \hat{y}(w, x) = h(x^\top w) = h(w_0 + w_1 x_1 + ... + w_p x_p). +.. math:: \hat{y}(w, X) = h(x^\top w) = h(w_0 + w_1 X_1 + ... + w_p X_p). Secondly, the squared loss function is replaced by the unit deviance :math:`d` of a reproductive exponential dispersion model (EDM) [11]_. The minimization problem becomes -.. math:: \min_{w} \frac{1}{2 \sum_i s_i} \sum_i s_i \cdot d(y_i, \hat{y}(w, x_i)) + \frac{\alpha}{2} ||w||_2 +.. math:: \min_{w} \frac{1}{2 \sum_i s_i} \sum_i s_i \cdot d(y_i, \hat{y}(w, X_i)) + \frac{\alpha}{2} ||w||_2 with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` @@ -939,8 +939,8 @@ Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` Usage ----- -In the following use cases, a loss different from the squared loss might be -appropriate: +A GLM loss different from the classical squared loss might be appropriate in +the following cases: * If the target values :math:`y` are counts (non-negative integer valued) or frequencies (non-negative), you might use a Poisson deviance with log-link. @@ -985,13 +985,8 @@ of the unit variance function: * If you want to model a relative frequency, i.e. counts per exposure (time, volume, ...) you can do so by a Poisson distribution and passing :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values - together with :math:`s=\mathrm{exposure}` as sample weights. This is done - in both examples linked below. - * The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\frac{\phi}{s_i} v(\mu_i)`. 
- * If the target `y` is a ratio, appropriate sample weights ``s`` should be - provided. + together with :math:`s=\mathrm{exposure}` as sample weights. + As an example, consider Poisson distributed counts z (integers) and weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. @@ -1000,6 +995,10 @@ of the unit variance function: in this case one might say that y has a 'scaled' Poisson distribution. The same holds for other distributions. + * The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\frac{\phi}{s_i} v(\mu_i)`. + The estimator can be used as follows:: >>> from sklearn.linear_model import TweedieRegressor From fa90272e4d1925b4caa967febf417c4ea9e96457 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 3 Oct 2019 16:10:40 +0200 Subject: [PATCH 191/269] More cleanups in the plot_tweedie_regression_insurance_claims.py example --- ...lot_tweedie_regression_insurance_claims.py | 57 ++++++++----------- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 1c50541fcd85c..dfd5555c83af4 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -142,7 +142,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, # Note: filter out claims with zero amount, as the severity model # requires strictly positive target values. -df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0 +df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0 # Correct for unreasonable observations (that might be data error) # and a few exceptionally large claim amounts @@ -150,30 +150,26 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, df["Exposure"] = df["Exposure"].clip(upper=1) df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000) +log_scale_transformer = make_pipeline( + FunctionTransformer(np.log, validate=False), + StandardScaler() +) + column_trans = ColumnTransformer( [ - ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), - ( - "Veh_Brand_Gas_Region", - OneHotEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"], - ), - ("BonusMalus", "passthrough", ["BonusMalus"]), - ( - "Density_log", - make_pipeline( - FunctionTransformer(np.log, validate=False), StandardScaler() - ), - ["Density"], - ), + ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ("onehot_categorical", OneHotEncoder(), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + ("passthrough_numeric", "passthrough", ["BonusMalus"]), + ("log_scaled_numeric", log_scale_transformer, ["Density"]), ], remainder="drop", ) X = column_trans.fit_transform(df) -df["Frequency"] = df.ClaimNb / df.Exposure -df["AvgClaimAmount"] = df.ClaimAmount / np.fmax(df.ClaimNb, 1) +df["Frequency"] = df["ClaimNb"] / df["Exposure"] +df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1) print(df[df.ClaimAmount > 0].head()) @@ -268,7 +264,7 @@ def score_estimator( # the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance # bonus/malus (``BonusMalus``). 
-fig, ax = plt.subplots(2, 2, figsize=(16, 8)) +fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(16, 8)) fig.subplots_adjust(hspace=0.3, wspace=0.2) plot_obs_pred( @@ -369,18 +365,12 @@ def score_estimator( # it is conditional on having at least one claim, and cannot be used to predict # the average claim amount per policy in general. -print( - "Mean AvgClaim Amount per policy: %.2f " - % df_train.AvgClaimAmount.mean() -) -print( - "Mean AvgClaim Amount | NbClaim > 0: %.2f" - % df_train.AvgClaimAmount[df_train.AvgClaimAmount > 0].mean() -) -print( - "Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" - % glm_sev.predict(X_train).mean() -) +print("Mean AvgClaim Amount per policy: %.2f " + % df_train["AvgClaimAmount"].mean()) +print("Mean AvgClaim Amount | NbClaim > 0: %.2f" + % df_train["AvgClaimAmount"][df_train["AvgClaimAmount"] > 0].mean()) +print("Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f" + % glm_sev.predict(X_train).mean()) ############################################################################## @@ -388,7 +378,7 @@ def score_estimator( # We can visually compare observed and predicted values, aggregated for # the drivers age (``DrivAge``). -fig, ax = plt.subplots(1, 2, figsize=(16, 4)) +fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 4)) # plot DivAge plot_obs_pred( @@ -500,9 +490,9 @@ def score_estimator( res.append( { "subset": subset_label, - "observed": df.ClaimAmount.values.sum(), + "observed": df["ClaimAmount"].values.sum(), "predicted, frequency*severity model": np.sum( - df.Exposure.values*glm_freq.predict(X)*glm_sev.predict(X) + df["Exposure"].values*glm_freq.predict(X)*glm_sev.predict(X) ), "predicted, tweedie, power=%.2f" % glm_total.best_estimator_.family.power: np.sum( @@ -512,3 +502,4 @@ def score_estimator( ) print(pd.DataFrame(res).set_index("subset").T) +plt.plot() From 4d16f318ca07cf8a00200201042bba177d4d27ba Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 6 Oct 2019 16:13:21 +0200 Subject: [PATCH 192/269] Typos and text improvement in poisson example --- doc/modules/linear_model.rst | 2 +- ...plot_poisson_regression_non_normal_loss.py | 65 ++++++++++--------- 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 6667057dc5073..4a5aeab305b8a 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -991,7 +991,7 @@ of the unit variance function: weights s=exposure (time, money, persons years, ...). Then you fit y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. The weights are necessary for the right (finite sample) mean. - Considering :math:`\bar{y} = \frac{\\sum_i s_i y_i}{\sum_i s_i}`, + Considering :math:`\bar{y} = \frac{\sum_i s_i y_i}{\sum_i s_i}`, in this case one might say that y has a 'scaled' Poisson distribution. The same holds for other distributions. diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index d99654cf04080..3ecb02108de4f 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -6,8 +6,8 @@ This example illustrates the use of log-linear Poisson regression on the French Motor Third-Party Liability Claims dataset [1] and compares it with models learned with least squared error. 
The goal is to predict the -number of insurance claims (or frequency) following car accidents for a -policyholder given historical data over a population of policyholders. +expected number of insurance claims (or frequency) following car accidents for +a policyholder given historical data over a population of policyholders. .. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor Third-Party Liability Claims (November 8, 2018). @@ -42,8 +42,8 @@ from sklearn.metrics import mean_poisson_deviance -def load_mtpl2(n_samples=None): - """Fetcher for French Motor Third-Party Liability Claims dataset +def load_mtpl2(n_samples=100000): + """Fetch the French Motor Third-Party Liability Claims dataset. Parameters ---------- @@ -122,9 +122,13 @@ def load_mtpl2(n_samples=None): print("Average Frequency = {}" .format(np.average(df["Frequency"], weights=df["Exposure"]))) +print("Percentage of zero claims = {0:%}" + .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / + df["Exposure"].sum())) + ############################################################################## # -# It worth noting that 96 % of policyholders have zero claims, and if we were +# It worth noting that 92 % of policyholders have zero claims, and if we were # to convert this problem into a binary classification task, it would be # significantly imbalanced. # @@ -143,7 +147,7 @@ def load_mtpl2(n_samples=None): def score_estimator(estimator, df_test): - """Score an estimator on the test set""" + """Score an estimator on the test set.""" y_pred = estimator.predict(df_test) @@ -154,13 +158,14 @@ def score_estimator(estimator, df_test): mean_absolute_error(df_test["Frequency"], y_pred, df_test["Exposure"])) - # ignore negative predictions, as they are invalid for + # ignore non-positive predictions, as they are invalid for # the Poisson deviance mask = y_pred > 0 if (~mask).any(): - warnings.warn("estimator yields negative predictions for {} samples " - "out of {}. These will be ignored while computing the " - "Poisson deviance".format((~mask).sum(), mask.shape[0])) + warnings.warn("Estimator yields non-positive predictions for {} " + "samples out of {}. These will be ignored while " + "computing the Poisson deviance" + .format((~mask).sum(), mask.shape[0])) print("mean Poisson deviance: %.3f" % mean_poisson_deviance(df_test["Frequency"][mask], @@ -182,12 +187,12 @@ def score_estimator(estimator, df_test): ############################################################################## # -# The Poisson deviance cannot be computed on negative values predicted by the -# model. For models that do return a few negative predictions +# The Poisson deviance cannot be computed on non-positive values predicted by +# the model. For models that do return a few non-positive predictions # (e.g. :class:`linear_model.Ridge`) we ignore the corresponding samples, # meaning that the obtained Poisson deviance is approximate. An alternative -# apporach could be to use class:`compose.TransformedTargetRegressor` -# meta-estimator to map ``y_pred`` to strictly positive domain. +# approach could be to use :class:`compose.TransformedTargetRegressor` +# meta-estimator to map ``y_pred`` to a strictly positive domain. print("Ridge evaluation:") score_estimator(ridge, df_test) @@ -210,9 +215,9 @@ def score_estimator(estimator, df_test): # # Finally, we will consider a non-linear model, namely a random forest. 
Random # forests do not require the categorical data to be one-hot encoded, instead -# we encode each category label with an arbirtrary integer using +# we encode each category label with an arbitrary integer using # :class:`preprocessing.OrdinalEncoder` to make the model faster to train (the -# same information is encoded with a small number of features than with +# same information is encoded with a smaller number of features than with # one-hot encoding). rf_preprocessor = ColumnTransformer( @@ -238,12 +243,13 @@ def score_estimator(estimator, df_test): ############################################################################## # -# The random forest model also minimizes the conditional least square error. -# However because of a higher predictive power it also results in a smaller -# Poisson deviance than the Poisson regression model. +# Like the Ridge regression above, the random forest model minimizes the +# conditional squared error, too. However, because of a higher predictive +# power, it also results in a smaller Poisson deviance than the Poisson +# regression model. # # Evaluating models with a single train / test split is prone to random -# fluctuations. If computation resources allow, it should be verified that +# fluctuations. If computing resources allow, it should be verified that # cross-validated performance metrics would lead to similar conclusions. # # The qualitative difference between these models can also be visualized by @@ -274,7 +280,7 @@ def score_estimator(estimator, df_test): # # The experimental data presents a long tail distribution for ``y``. In all # models we predict the mean expected value, so we will have necessarily fewer -# extreme values. Additionally normal distribution used in ``Ridge`` and +# extreme values. Additionally, normal distribution used in ``Ridge`` and # ``RandomForestRegressor`` has a constant variance, while for the Poisson # distribution used in ``PoissonRegressor``, the variance is proportional to # the mean predicted value. @@ -291,11 +297,10 @@ def score_estimator(estimator, df_test): def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, n_bins=100): - """Compare predictions and observations for bins ordered by y_pred + """Compare predictions and observations for bins ordered by y_pred. We order the samples by ``y_pred`` and split it in bins. - In each bin the observed mean is compared with the predicted - mean. + In each bin the observed mean is compared with the predicted mean. Parameters ---------- @@ -306,7 +311,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, sample_weight : array-like of shape (n_samples,) Sample weights. n_bins: int - number of bins to use + Number of bins to use. Returns ------- @@ -370,10 +375,10 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # values. # # However, for some business applications, we are not necessarily interested -# in the the ability of the model in predicting the expected frequency value -# but instead in predicting which policyholder groups are the riskiest and -# which are the safest. In this case the model evaluation would cast the -# problem as a ranking problem rather than a regression problem. +# in the ability of the model to predict the expected frequency value, but +# instead to predict which policyholder groups are the riskiest and which are +# the safest. In this case, the model evaluation would cast the problem as a +# ranking problem rather than a regression problem. 
# # To compare the 3 models under this light on, one can plot the fraction of # the number of claims vs the fraction of exposure for test samples ordered by @@ -435,6 +440,6 @@ def _cumulated_claims(y_true, y_pred, exposure): # # This last point is expected due to the nature of the problem: the occurrence # of accidents is mostly dominated by circumstantial causes that are not -# captured in the columns of the dataset. +# captured in the columns of the dataset or that are indeed random. plt.show() From 15eb1d39c6c2bbbdceeb64996d0f297ef4f80ebf Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 6 Oct 2019 16:20:33 +0200 Subject: [PATCH 193/269] EXA sharey for histograms --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 3ecb02108de4f..769c072c624fb 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -257,7 +257,7 @@ def score_estimator(estimator, df_test): # values: -fig, axes = plt.subplots(1, 4, figsize=(16, 3)) +fig, axes = plt.subplots(1, 4, figsize=(16, 3), sharey=True) fig.subplots_adjust(bottom=0.2) n_bins = 20 df_train["Frequency"].hist(bins=np.linspace(-1, 10, n_bins), ax=axes[0]) @@ -265,6 +265,7 @@ def score_estimator(estimator, df_test): axes[0].set_title("Data") axes[0].set_yscale('log') axes[0].set_xlabel("y (observed Frequency)") +axes[0].set_ylim([1E2, 5E5]) for idx, model in enumerate([ridge, poisson, rf]): y_pred = model.predict(df_train) From 3d097c686dfb016a4561afca37f28ad0d40dc0f3 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 8 Oct 2019 15:07:56 +0200 Subject: [PATCH 194/269] Plot y_pred histograms on the test set --- ...plot_poisson_regression_non_normal_loss.py | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 769c072c624fb..0e948873da570 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -256,26 +256,32 @@ def score_estimator(estimator, df_test): # comparing the histogram of observed target values with that of predicted # values: - -fig, axes = plt.subplots(1, 4, figsize=(16, 3), sharey=True) +fig, axes = plt.subplots(2, 4, figsize=(16, 6), sharey=True) fig.subplots_adjust(bottom=0.2) n_bins = 20 -df_train["Frequency"].hist(bins=np.linspace(-1, 10, n_bins), ax=axes[0]) - -axes[0].set_title("Data") -axes[0].set_yscale('log') -axes[0].set_xlabel("y (observed Frequency)") -axes[0].set_ylim([1E2, 5E5]) - -for idx, model in enumerate([ridge, poisson, rf]): - y_pred = model.predict(df_train) - - pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), ax=axes[idx+1]) - axes[idx + 1].set( - title=model[-1].__class__.__name__, - yscale='log', - xlabel="y_pred (predicted expected Frequency)" - ) +for row_idx, label, df in zip(range(2), + ["train", "test"], + [df_train, df_test]): + df["Frequency"].hist(bins=np.linspace(-1, 30, n_bins), + ax=axes[row_idx, 0]) + + axes[row_idx, 0].set_title("Data") + axes[row_idx, 0].set_yscale('log') + axes[row_idx, 0].set_xlabel("y (observed Frequency)") + axes[row_idx, 0].set_ylim([1e1, 5e5]) + axes[row_idx, 0].set_ylabel(label + " 
samples") + + for idx, model in enumerate([ridge, poisson, rf]): + y_pred = model.predict(df) + + pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins), + ax=axes[row_idx, idx+1]) + axes[row_idx, idx + 1].set( + title=model[-1].__class__.__name__, + yscale='log', + xlabel="y_pred (predicted expected Frequency)" + ) +plt.tight_layout() ############################################################################## # From 31f5b3d6a7a5e01055bc8bd914b51d3d92bcafb5 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 9 Oct 2019 16:30:37 +0200 Subject: [PATCH 195/269] Compound Poisson => Compound Poisson Gamma --- ...lot_tweedie_regression_insurance_claims.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index dfd5555c83af4..3bdb7d93f0130 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -410,15 +410,15 @@ def score_estimator( # Overall, the drivers age (``DrivAge``) has a weak impact on the claim # severity, both in observed and predicted data. # -# 4. Total claim amount -- Compound Poisson distribution -# ------------------------------------------------------- +# 4. Total claim amount -- Compound Poisson Gamma distribution +# ------------------------------------------------------------ # # As mentioned in the introduction, the total claim amount can be modeled # either as the product of the frequency model by the severity model, # denormalized by exposure. In the following code sample, the -# ``score_estimator`` is extended to score such a model. The mean deviance -# is computed assuming a Tweedie distribution with ``power=1.5`` to be -# comparable with the model from the following section, +# ``score_estimator`` is extended to score such a model. The mean deviance is +# computed assuming a Tweedie distribution with ``power=1.5`` to be comparable +# with the model from the following section, scores = score_estimator( @@ -436,9 +436,9 @@ def score_estimator( ############################################################################## # # Indeed, an alternative approach for modeling the total loss is with a unique -# Compound Poisson model, also corresponding to a Tweedie model -# with a power :math:`p \in (1, 2)`. We determine the optimal hyperparameter -# ``p`` with a grid search, +# Compound Poisson Gamma model, also corresponding to a Tweedie model with a +# power :math:`p \in (1, 2)`. We determine the optimal hyperparameter ``p`` +# with a grid search: from sklearn.model_selection import GridSearchCV @@ -475,12 +475,13 @@ def score_estimator( ############################################################################## # # In this example, the mean absolute error is lower for the Compound Poisson -# model than when using separate models for frequency and severity. +# Gamma model than when using the product of the predictions of separate +# models for frequency and severity. # -# We can additionally validate these models by comparing observed and predicted -# total claim amount over the test and train subsets. We see that in our case -# the frequency-severity model underestimates the total claim amount, whereas -# the Tweedie model overestimates. +# We can additionally validate these models by comparing observed and +# predicted total claim amount over the test and train subsets. 
We see that, +# on average, the frequency-severity model underestimates the total claim +# amount, whereas the Tweedie model overestimates. res = [] for subset_label, X, df in [ From a498ff546da062fb4f6c33bef9d18e45e18249c8 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 9 Oct 2019 17:07:02 +0200 Subject: [PATCH 196/269] Compound Poisson => Compound Poisson Gamma --- sklearn/linear_model/_glm/glm.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index dd6f847895434..aae438733d424 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -28,7 +28,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): - """Regression via a Generalized Linear Model (GLM) with penalties. + """Regression via a penalized Generalized Linear Model (GLM). GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at fitting and predicting the mean of the target y as y_pred=h(X*w). @@ -575,19 +575,19 @@ class TweedieRegressor(GeneralizedLinearRegressor): Special cases are: - +-------+------------------+ - | Power | Distribution | - +=======+==================+ - | 0 | Normal | - +-------+------------------+ - | 1 | Poisson | - +-------+------------------+ - | (1,2) | Compound Poisson | - +-------+------------------+ - | 2 | Gamma | - +-------+------------------+ - | 3 | Inverse Gaussian | - +-------+------------------+ + +-------+------------------------+ + | Power | Distribution | + +=======+========================+ + | 0 | Normal | + +-------+------------------------+ + | 1 | Poisson | + +-------+------------------------+ + | (1,2) | Compound Poisson Gamma | + +-------+------------------------+ + | 2 | Gamma | + +-------+------------------------+ + | 3 | Inverse Gaussian | + +-------+------------------------+ alpha : float, optional (default=1) Constant that multiplies the penalty terms and thus determines the From 3fae28a06968ab8ce0d95b5e4a53b2e7a7d66205 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Wed, 9 Oct 2019 19:23:19 +0200 Subject: [PATCH 197/269] Various improvement in Tweedie regression example --- ...lot_tweedie_regression_insurance_claims.py | 156 +++++++++++++----- 1 file changed, 118 insertions(+), 38 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 3bdb7d93f0130..4b450fe34bb1e 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -48,7 +48,7 @@ from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer -from sklearn.metrics import mean_absolute_error, mean_squared_error +from sklearn.metrics import mean_absolute_error, mean_squared_error, auc def load_mtpl2(n_samples=100000): @@ -157,11 +157,14 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, column_trans = ColumnTransformer( [ - ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), + ("binned_numeric", KBinsDiscretizer(n_bins=10), + ["VehAge", "DrivAge"]), ("onehot_categorical", OneHotEncoder(), - ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), - ("passthrough_numeric", "passthrough", ["BonusMalus"]), - ("log_scaled_numeric", log_scale_transformer, ["Density"]), + ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), + 
("passthrough_numeric", "passthrough", + ["BonusMalus"]), + ("log_scaled_numeric", log_scale_transformer, + ["Density"]), ], remainder="drop", ) @@ -194,7 +197,8 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, def score_estimator( - estimator, X_train, X_test, df_train, df_test, target, weights + estimator, X_train, X_test, df_train, df_test, target, weights, + power=None, ): """Evaluate an estimator on train and test sets with different metrics""" res = [] @@ -217,13 +221,15 @@ def score_estimator( est_freq, est_sev = estimator y_pred = (df.Exposure.values * est_freq.predict(X) * est_sev.predict(X)) - power = 1.5 else: y_pred = estimator.predict(X) - power = getattr(getattr(estimator, "_family_instance"), - "power") + if power is None: + power = getattr(getattr(estimator, "_family_instance"), + "power") if score_label == "mean deviance": + if power is None: + continue metric = partial(mean_tweedie_deviance, power=power) if metric is None: @@ -378,7 +384,7 @@ def score_estimator( # We can visually compare observed and predicted values, aggregated for # the drivers age (``DrivAge``). -fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 4)) +fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 6)) # plot DivAge plot_obs_pred( @@ -403,7 +409,7 @@ def score_estimator( ax=ax[1], fill_legend=True ) - +plt.tight_layout() ############################################################################## # @@ -411,16 +417,16 @@ def score_estimator( # severity, both in observed and predicted data. # # 4. Total claim amount -- Compound Poisson Gamma distribution -# ------------------------------------------------------------ +# ------------------------------------------------------------ # # As mentioned in the introduction, the total claim amount can be modeled # either as the product of the frequency model by the severity model, # denormalized by exposure. In the following code sample, the # ``score_estimator`` is extended to score such a model. The mean deviance is -# computed assuming a Tweedie distribution with ``power=1.5`` to be comparable -# with the model from the following section, - +# computed assuming a Tweedie distribution with ``power=2`` to be comparable +# with the model from the following section: +eps = 1e-4 scores = score_estimator( (glm_freq, glm_sev), X_train, @@ -429,40 +435,54 @@ def score_estimator( df_test, target="ClaimAmount", weights="Exposure", + power=2-eps, ) print(scores) ############################################################################## # -# Indeed, an alternative approach for modeling the total loss is with a unique -# Compound Poisson Gamma model, also corresponding to a Tweedie model with a -# power :math:`p \in (1, 2)`. We determine the optimal hyperparameter ``p`` -# with a grid search: +# Instead of taking the product of two independently fit models for frequency +# and severity one can directly model the total loss is with a unique Compound +# Poisson Gamma generalized linear model (with a log link function). This +# model is a special case of the Tweedie model with a power parameter :math:`p +# \in (1, 2)`. +# +# We determine the optimal hyperparameter ``p`` with a grid search so as to +# minimize the deviance: from sklearn.model_selection import GridSearchCV -# exclude upper bound as power>=2 does not support y=0. 
-params = {"power": np.linspace(1 + 1e-4, 2 - 1e-4, 8)} - - -# this takes a while -glm_total = GridSearchCV( - TweedieRegressor(tol=1e-3, max_iter=500), cv=3, - param_grid=params, n_jobs=-1 +# exclude upper bound as power>=2 as p=2 would lead to an undefined unit +# deviance on data points with y=0. +params = {"power": np.linspace(1 + eps, 2 - eps, 5)} + +X_train_small, _, df_train_small, _ = train_test_split( + X_train, df_train, train_size=5000) + +# This can takes a while on the full training set, therefore we do the +# hyper-parameter search on a random subset, hoping that the best value of +# power does not depend too much on the dataset size. We use a bit +# penalization to avoid numerical issues with colinear features and speed-up +# convergence. +glm_total = TweedieRegressor(max_iter=10000, alpha=1e-2) +search = GridSearchCV( + glm_total, cv=3, + param_grid=params, n_jobs=-1, verbose=10, + refit=False, ) -glm_total.fit( - X_train, df_train["ClaimAmount"], sample_weight=df_train["Exposure"] +search.fit( + X_train_small, df_train_small["ClaimAmount"], + sample_weight=df_train_small["Exposure"] ) +print("Best hyper-parameters: %s" % search.best_params_) - -print( - "Best hyperparameters: power=%.2f\n" - % glm_total.best_estimator_.family.power -) +glm_total.set_params(**search.best_params_) +glm_total.fit(X_train, df_train["ClaimAmount"], + sample_weight=df_train["Exposure"]) scores = score_estimator( - glm_total.best_estimator_, + glm_total, X_train, X_test, df_train, @@ -496,11 +516,71 @@ def score_estimator( df["Exposure"].values*glm_freq.predict(X)*glm_sev.predict(X) ), "predicted, tweedie, power=%.2f" - % glm_total.best_estimator_.family.power: np.sum( - glm_total.best_estimator_.predict(X) - ), + % glm_total.power: np.sum(glm_total.predict(X)), } ) print(pd.DataFrame(res).set_index("subset").T) + +############################################################################## +# +# Finally, we can compare the two models using a plot of cumulated claims: for +# each model, the policyholders are ranked from riskiest to safest and the +# actual cumulated claims are plotted against the cumulated exposure. +# +# The area under the curve can be used as a model selection metric to quantify +# the ability of the model to rank policyholders. Note that this metric does +# not reflect the ability of the models to make accurate predictions in terms +# of absolute value of total claim amounts but only in terms of relative +# amounts as a ranking metric. +# +# Both models are able to rank policyholders by risky-ness significantly +# better than chance although they are also both far from perfect due to the +# natural difficulty of the prediction problem from few features. 
+ + +def _cumulated_claims(y_true, y_pred, exposure): + idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest + sorted_exposure = exposure[idx_sort] + sorted_frequencies = y_true[idx_sort] + cumulated_exposure = np.cumsum(sorted_exposure) + cumulated_exposure /= cumulated_exposure[-1] + cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies) + cumulated_claims /= cumulated_claims[-1] + return cumulated_exposure, cumulated_claims + + +fig, ax = plt.subplots(figsize=(8, 8)) + +y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test) +y_pred_total = glm_total.predict(X_test) + +for label, y_pred in [("Frequency * Severity model", y_pred_product), + ("Compound Poisson Gamma", y_pred_total)]: + cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + y_pred, + df_test["Exposure"].values) + area = auc(cum_exposure, cum_claims) + label += " (area under curve: {:.3f})".format(area) + ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) + +# Oracle model: y_pred == y_test +cum_exposure, cum_claims = _cumulated_claims( + df_test["Frequency"].values, + df_test["Frequency"].values, + df_test["Exposure"].values) +area = auc(cum_exposure, cum_claims) +label = "Oracle (area under curve: {:.3f})".format(area) +ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) + +# Random Baseline +ax.plot([0, 1], [0, 1], linestyle="--", color="black", + label="Random baseline") +ax.set( + title="Cumulated claim amount by model", + xlabel='Fraction of exposure (from riskiest to safest)', + ylabel='Fraction of total claim amount' +) +ax.legend(loc="lower right") plt.plot() From a47798afe6e30e75b66f274a2323838c6a1401ea Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 11:07:09 +0200 Subject: [PATCH 198/269] Update doc/modules/linear_model.rst Co-Authored-By: Thomas J Fan --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d9e60a3517f8e..3119b9b0db94b 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -915,7 +915,7 @@ problem becomes .. math:: \min_{w} \frac{1}{2 \sum_i s_i} \sum_i s_i \cdot d(y_i, \hat{y}(w, X_i)) + \frac{\alpha}{2} ||w||_2 -with sample weights :math:`s`, and L2 regularization penalty :math:`\alpha`. +with sample weights :math:`s_i`, and L2 regularization penalty :math:`\alpha`. The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` likelihood as From 83391dd56bac107b21eea4cb258f3831a56d02ff Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 11:49:53 +0200 Subject: [PATCH 199/269] Use latest docstring conventions everywhere --- sklearn/_loss/glm_distribution.py | 36 +++++------ sklearn/linear_model/_glm/glm.py | 100 +++++++++++++++--------------- sklearn/linear_model/_glm/link.py | 8 +-- sklearn/metrics/regression.py | 6 +- 4 files changed, 74 insertions(+), 76 deletions(-) diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py index 5f9e9ed06847c..4020f74427c44 100644 --- a/sklearn/_loss/glm_distribution.py +++ b/sklearn/_loss/glm_distribution.py @@ -60,7 +60,7 @@ def in_y_range(self, y): Parameters ---------- - y : array, shape (n_samples,) + y : array of shape (n_samples,) Target values. 
""" # Note that currently supported distributions have +inf upper bound @@ -92,7 +92,7 @@ def unit_variance(self, y_pred): Parameters ---------- - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Predicted mean. """ pass # pragma: no cover @@ -105,7 +105,7 @@ def unit_variance_derivative(self, y_pred): Parameters ---------- - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Target values. """ pass # pragma: no cover @@ -121,10 +121,10 @@ def unit_deviance(self, y, y_pred, check_input=False): Parameters ---------- - y : array, shape (n_samples,) + y : array of shape (n_samples,) Target values. - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Predicted mean. check_input : bool, default=False @@ -132,7 +132,7 @@ def unit_deviance(self, y, y_pred, check_input=False): they will be propagated as NaN. Returns ------- - deviance: array, shape (n_samples,) + deviance: array of shape (n_samples,) Computed deviance """ pass # pragma: no cover @@ -147,10 +147,10 @@ def unit_deviance_derivative(self, y, y_pred): Parameters ---------- - y : array, shape (n_samples,) + y : array of shape (n_samples,) Target values. - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Predicted mean. """ return -2 * (y - y_pred) / self.unit_variance(y_pred) @@ -168,13 +168,13 @@ def deviance(self, y, y_pred, weights=1): Parameters ---------- - y : array, shape (n_samples,) + y : array of shape (n_samples,) Target values. - y_pred : array, shape (n_samples,) + y_pred : array of shape (n_samples,) Predicted mean. - weights : array, shape (n_samples,) (default=1) + weights : {int, array of shape (n_samples,)}, default=1 Weights or exposure to which variance is inverse proportional. """ return np.sum(weights * self.unit_deviance(y, y_pred)) @@ -193,7 +193,7 @@ def deviance_derivative(self, y, y_pred, weights=1): y_pred : array, shape (n_samples,) Predicted mean. - weights : array, shape (n_samples,) (default=1) + weights : {int, array of shape (n_samples,)}, default=1 Weights or exposure to which variance is inverse proportional. """ return weights * self.unit_deviance_derivative(y, y_pred) @@ -231,7 +231,7 @@ class TweedieDistribution(ExponentialDispersionModel): Parameters ---------- - power : float (default=0) + power : float, default=0 The variance power of the `unit_variance` :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`. For ``0 0. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Sample weights. Returns @@ -743,7 +743,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): y_pred : array-like of shape (n_samples,) Estimated target values. Requires y_pred > 0. - sample_weight : array-like, shape (n_samples,), optional + sample_weight : array-like of shape (n_samples,), default=None Sample weights. 
Returns From 3bfb54e9e172b6f2cae3e48a7ece4d011937e786 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 11:53:42 +0200 Subject: [PATCH 200/269] Drop check_input parameter --- sklearn/linear_model/_glm/glm.py | 22 ++++++--------------- sklearn/linear_model/_glm/tests/test_glm.py | 12 +---------- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 2012f3cbb32a9..990de8114a717 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -92,11 +92,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): copy_X : bool, default=True If ``True``, X will be copied; else, it may be overwritten. - check_input : bool, default=True - Allow to bypass several checks on input: y values in range of family, - sample_weight non-negative. - Don't use this parameter unless you know what you do. - verbose : int, default=0 For the lbfgs solver set verbose to any positive number for verbosity. @@ -115,7 +110,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, alpha=1.0, fit_intercept=True, family='normal', link='auto', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, check_input=True, verbose=0): + copy_X=True, verbose=0): self.alpha = alpha self.fit_intercept = fit_intercept self.family = family @@ -125,7 +120,6 @@ def __init__(self, alpha=1.0, self.tol = tol self.warm_start = warm_start self.copy_X = copy_X - self.check_input = check_input self.verbose = verbose def fit(self, X, y, sample_weight=None): @@ -213,9 +207,6 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.copy_X, bool): raise ValueError("The argument copy_X must be bool;" " got {0}".format(self.copy_X)) - if not isinstance(self.check_input, bool): - raise ValueError("The argument check_input must be bool; got " - "(check_input={0})".format(self.check_input)) family = self._family_instance link = self._link_instance @@ -228,12 +219,11 @@ def fit(self, X, y, sample_weight=None): _, n_features = X.shape - if self.check_input: - if not np.all(family.in_y_range(y)): - raise ValueError("Some value(s) of y are out of the valid " - "range for family {0}" - .format(family.__class__.__name__)) - # TODO: if alpha=0 check that X is not rank deficient + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # TODO: if alpha=0 check that X is not rank deficient # rescaling of sample_weight # diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 9e21ae7775cf4..c0ff6508db9c9 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -164,16 +164,6 @@ def test_glm_copy_X_argument(copy_X): glm.fit(X, y) -@pytest.mark.parametrize('check_input', ['not bool', 1, 0, [True]]) -def test_glm_check_input_argument(check_input): - """Test GLM for invalid check_input argument.""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(check_input=check_input) - with pytest.raises(ValueError, match="check_input must be bool"): - glm.fit(X, y) - - def test_glm_identity_regression(): """Test GLM regression with identity link on a simple dataset.""" coef = [1., 2.] 
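# Editorial sketch (not part of this patch): with ``check_input`` removed, the
# y-range validation always runs. A hypothetical test for it, reusing the
# imports already present in this module, could look like:
def test_glm_y_out_of_valid_range():
    # The Poisson family only accepts non-negative targets, so a negative y
    # must be rejected during fit.
    X = np.array([[1.], [2.]])
    y = np.array([-1., 2.])
    glm = GeneralizedLinearRegressor(family='poisson')
    with pytest.raises(ValueError, match="out of the valid range"):
        glm.fit(X, y)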
@@ -291,7 +281,7 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept): glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', link='identity', fit_intercept=True, - check_input=False, max_iter=300) + max_iter=300) glm.fit(X_train, y_train) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, atol=5e-5) From d325fe23348f8dabfcc55dd4fbd8fa82fd60ff8d Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 11:57:10 +0200 Subject: [PATCH 201/269] Use keyword only arguments SLEP009 --- sklearn/linear_model/_glm/glm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 990de8114a717..8ef912f9596b6 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -107,7 +107,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, + def __init__(self, *, alpha=1.0, fit_intercept=True, family='normal', link='auto', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, verbose=0): @@ -435,8 +435,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, verbose=0): + def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, + tol=1e-4, warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link='log', max_iter=max_iter, @@ -514,8 +514,8 @@ class GammaRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. """ - def __init__(self, alpha=1.0, fit_intercept=True, max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, verbose=0): + def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, + tol=1e-4, warm_start=False, copy_X=True, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="gamma", link='log', max_iter=max_iter, @@ -626,7 +626,7 @@ class TweedieRegressor(GeneralizedLinearRegressor): n_iter_ : int Actual number of iterations used in the solver. 
""" - def __init__(self, power=0.0, alpha=1.0, fit_intercept=True, + def __init__(self, *, power=0.0, alpha=1.0, fit_intercept=True, link='auto', max_iter=100, tol=1e-4, warm_start=False, copy_X=True, verbose=0): From 661cf56e04bc23e53a7173bc43d998187892d9ec Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 10 Oct 2019 12:04:37 +0200 Subject: [PATCH 202/269] Move _y_pred_deviance_derivative from losses as a private function --- sklearn/_loss/glm_distribution.py | 19 ------------------- sklearn/linear_model/_glm/glm.py | 25 +++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py index 4020f74427c44..55365d382c03b 100644 --- a/sklearn/_loss/glm_distribution.py +++ b/sklearn/_loss/glm_distribution.py @@ -13,14 +13,6 @@ from scipy.special import xlogy -def _safe_lin_pred(X, coef): - """Compute the linear predictor taking care if intercept is present.""" - if coef.size == X.shape[1] + 1: - return X @ coef[1:] + coef[0] - else: - return X @ coef - - DistributionBoundary = namedtuple("DistributionBoundary", ("value", "inclusive")) @@ -198,17 +190,6 @@ def deviance_derivative(self, y, y_pred, weights=1): """ return weights * self.unit_deviance_derivative(y, y_pred) - def _y_pred_deviance_derivative(self, coef, X, y, weights, link): - """Compute y_pred and the derivative of the deviance w.r.t coef.""" - lin_pred = _safe_lin_pred(X, coef) - y_pred = link.inverse(lin_pred) - d1 = link.inverse_derivative(lin_pred) - temp = d1 * self.deviance_derivative(y, y_pred, weights) - if coef.size == X.shape[1] + 1: - devp = np.concatenate(([temp.sum()], temp @ X)) - else: - devp = temp @ X # same as X.T @ temp - return y_pred, devp class TweedieDistribution(ExponentialDispersionModel): diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 8ef912f9596b6..b29dcd89a35a6 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -27,6 +27,27 @@ ) +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _y_pred_deviance_derivative(coef, X, y, weights, family, link): + """Compute y_pred and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) + y_pred = link.inverse(lin_pred) + d1 = link.inverse_derivative(lin_pred) + temp = d1 * family.deviance_derivative(y, y_pred, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # same as X.T @ temp + return y_pred, devp + + class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): """Regression via a penalized Generalized Linear Model (GLM). 
@@ -251,8 +272,8 @@ def fit(self, X, y, sample_weight=None): if solver == 'lbfgs': def func(coef, X, y, weights, alpha, family, link): - y_pred, devp = family._y_pred_deviance_derivative( - coef, X, y, weights, link + y_pred, devp = _y_pred_deviance_derivative( + coef, X, y, weights, family, link ) dev = family.deviance(y, y_pred, weights) intercept = (coef.size == X.shape[1] + 1) From 560c180fee5c1217353af106beeca04a0b9bd732 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 10 Oct 2019 15:28:21 +0200 Subject: [PATCH 203/269] Fix cumulated claim amount curve in Tweedie regression example --- ...lot_tweedie_regression_insurance_claims.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 4b450fe34bb1e..7e4a8599dec0e 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -539,15 +539,15 @@ def score_estimator( # natural difficulty of the prediction problem from few features. -def _cumulated_claims(y_true, y_pred, exposure): - idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest - sorted_exposure = exposure[idx_sort] - sorted_frequencies = y_true[idx_sort] - cumulated_exposure = np.cumsum(sorted_exposure) +def _cumulated_claim_amount(y_true, y_pred, exposure): + ranking = np.argsort(y_pred)[::-1] # from riskiest to safest + ranked_exposure = exposure[ranking] + ranked_claim_amount = y_true[ranking] + cumulated_exposure = np.cumsum(ranked_exposure) cumulated_exposure /= cumulated_exposure[-1] - cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies) - cumulated_claims /= cumulated_claims[-1] - return cumulated_exposure, cumulated_claims + cumulated_claim_amount = np.cumsum(ranked_claim_amount) + cumulated_claim_amount /= cumulated_claim_amount[-1] + return cumulated_exposure, cumulated_claim_amount fig, ax = plt.subplots(figsize=(8, 8)) @@ -557,8 +557,8 @@ def _cumulated_claims(y_true, y_pred, exposure): for label, y_pred in [("Frequency * Severity model", y_pred_product), ("Compound Poisson Gamma", y_pred_total)]: - cum_exposure, cum_claims = _cumulated_claims( - df_test["Frequency"].values, + cum_exposure, cum_claims = _cumulated_claim_amount( + df_test["ClaimAmount"].values, y_pred, df_test["Exposure"].values) area = auc(cum_exposure, cum_claims) @@ -566,9 +566,9 @@ def _cumulated_claims(y_true, y_pred, exposure): ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test -cum_exposure, cum_claims = _cumulated_claims( - df_test["Frequency"].values, - df_test["Frequency"].values, +cum_exposure, cum_claims = _cumulated_claim_amount( + df_test["ClaimAmount"].values, + df_test["ClaimAmount"].values, df_test["Exposure"].values) area = auc(cum_exposure, cum_claims) label = "Oracle (area under curve: {:.3f})".format(area) From 0ea2dce29a52829fb2c0ace0c992f68e1f052f0a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Thu, 10 Oct 2019 16:15:25 +0200 Subject: [PATCH 204/269] PEP8 --- sklearn/_loss/glm_distribution.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py index 55365d382c03b..dbfac6af673ae 100644 --- a/sklearn/_loss/glm_distribution.py +++ b/sklearn/_loss/glm_distribution.py @@ -191,7 +191,6 @@ def deviance_derivative(self, y, y_pred, weights=1): return weights * 
self.unit_deviance_derivative(y, y_pred) - class TweedieDistribution(ExponentialDispersionModel): r"""A class for the Tweedie distribution. From 4ca2e95535fd3ba55980fbf437a88df0175ab819 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Mon, 14 Oct 2019 14:50:48 +0200 Subject: [PATCH 205/269] MNT remove function body in abstract methods --- sklearn/_loss/glm_distribution.py | 3 --- sklearn/linear_model/_glm/link.py | 4 ---- 2 files changed, 7 deletions(-) diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py index dbfac6af673ae..920218ea7f674 100644 --- a/sklearn/_loss/glm_distribution.py +++ b/sklearn/_loss/glm_distribution.py @@ -87,7 +87,6 @@ def unit_variance(self, y_pred): y_pred : array of shape (n_samples,) Predicted mean. """ - pass # pragma: no cover @abstractmethod def unit_variance_derivative(self, y_pred): @@ -100,7 +99,6 @@ def unit_variance_derivative(self, y_pred): y_pred : array of shape (n_samples,) Target values. """ - pass # pragma: no cover @abstractmethod def unit_deviance(self, y, y_pred, check_input=False): @@ -127,7 +125,6 @@ def unit_deviance(self, y, y_pred, check_input=False): deviance: array of shape (n_samples,) Computed deviance """ - pass # pragma: no cover def unit_deviance_derivative(self, y, y_pred): r"""Compute the derivative of the unit deviance w.r.t. y_pred. diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py index e8d3c792d3efe..878d8e835bc42 100644 --- a/sklearn/linear_model/_glm/link.py +++ b/sklearn/linear_model/_glm/link.py @@ -26,7 +26,6 @@ def __call__(self, y_pred): y_pred : array of shape (n_samples,) Usually the (predicted) mean. """ - pass # pragma: no cover @abstractmethod def derivative(self, y_pred): @@ -37,7 +36,6 @@ def derivative(self, y_pred): y_pred : array of shape (n_samples,) Usually the (predicted) mean. """ - pass # pragma: no cover @abstractmethod def inverse(self, lin_pred): @@ -51,7 +49,6 @@ def inverse(self, lin_pred): lin_pred : array of shape (n_samples,) Usually the (fitted) linear predictor. """ - pass # pragma: no cover @abstractmethod def inverse_derivative(self, lin_pred): @@ -62,7 +59,6 @@ def inverse_derivative(self, lin_pred): lin_pred : array of shape (n_samples,) Usually the (fitted) linear predictor. """ - pass # pragma: no cover class IdentityLink(BaseLink): From 89b429d921bb65f2f8dcd431dee2720152a4abbe Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 14 Oct 2019 16:05:59 +0200 Subject: [PATCH 206/269] Improvements to Pure Premium example --- ...lot_tweedie_regression_insurance_claims.py | 258 +++++++++--------- 1 file changed, 132 insertions(+), 126 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 7e4a8599dec0e..0dd0ed25f4a02 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -130,8 +130,8 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, ############################################################################## # -# 1. 
Loading datasets and pre-processing -# -------------------------------------- +# Loading datasets, basic feature extraction and target definitions +# ----------------------------------------------------------------- # # We construct the freMTPL2 dataset by joining the freMTPL2freq table, # containing the number of claims (``ClaimNb``), with the freMTPL2sev table, @@ -170,7 +170,13 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, ) X = column_trans.fit_transform(df) +# Insurances companies are interested in modeling the Pure Premium, that is +# the expected total claim amount per unit of exposure for each policyholder +# in their portfolio: +df["PurePremium"] = df["ClaimAmount"] / df["Exposure"] +# This can be inderectly approximated by a 2-step modeling the product of the +# Frequency times the average claim amount per claim: df["Frequency"] = df["ClaimNb"] / df["Exposure"] df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1) @@ -178,8 +184,8 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, ############################################################################## # -# 2. Frequency model -- Poisson distribution -# ------------------------------------------- +# Frequency model -- Poisson distribution +# --------------------------------------- # # The number of claims (``ClaimNb``) is a positive integer that can be modeled # as a Poisson distribution. It is then assumed to be the number of discrete @@ -190,47 +196,50 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) -# Some of the features are colinear, we use a weak penalization to avoid -# numerical issues. -glm_freq = PoissonRegressor(alpha=1e-2) -glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) +# The parameters of the model are estimated by minimizing the Poisson deviance +# on the training set via a quasi-Newton solver: l-BFGS. Some of the features +# are colinear, we use a weak penalization to avoid numerical issues. +glm_freq = PoissonRegressor(alpha=1e-3) +glm_freq.fit(X_train, df_train["Frequency"], + sample_weight=df_train["Exposure"]) def score_estimator( estimator, X_train, X_test, df_train, df_test, target, weights, - power=None, + tweedie_powers=None, ): """Evaluate an estimator on train and test sets with different metrics""" - res = [] + if isinstance(estimator, tuple): + model_name = " * ".join(e.__class__.__name__ for e in estimator) + else: + model_name = estimator.__class__.__name__ + print("\nEvaluation of {} of target {} ".format(model_name, target)) + + metrics = [ + ("D² explained", None), + ("mean abs. error", mean_absolute_error), + ("mean squared error", mean_squared_error), + ] + if tweedie_powers: + metrics += [( + "mean Tweedie deviance (p={:.4f})".format(power), + partial(mean_tweedie_deviance, power=power) + ) for power in tweedie_powers] + res = [] for subset_label, X, df in [ ("train", X_train, df_train), ("test", X_test, df_test), ]: y, _weights = df[target], df[weights] - - for score_label, metric in [ - ("D² explained", None), - ("mean deviance", mean_tweedie_deviance), - ("mean abs. error", mean_absolute_error), - ("mean squared error", mean_squared_error), - ]: + for score_label, metric in metrics: if isinstance(estimator, tuple) and len(estimator) == 2: # Score the model consisting of the product of frequency and - # severity models, denormalized by the exposure values. + # severity models. 
est_freq, est_sev = estimator - y_pred = (df.Exposure.values * est_freq.predict(X) * - est_sev.predict(X)) + y_pred = est_freq.predict(X) * est_sev.predict(X) else: y_pred = estimator.predict(X) - if power is None: - power = getattr(getattr(estimator, "_family_instance"), - "power") - - if score_label == "mean deviance": - if power is None: - continue - metric = partial(mean_tweedie_deviance, power=power) if metric is None: if not hasattr(estimator, "score"): @@ -266,8 +275,8 @@ def score_estimator( ############################################################################## # -# We can visually compare observed and predicted values, aggregated by -# the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance +# We can visually compare observed and predicted values, aggregated by the +# drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance # bonus/malus (``BonusMalus``). fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(16, 8)) @@ -325,11 +334,11 @@ def score_estimator( # # According to the observed data, the frequency of accidents is higher for # drivers younger than 30 years old, and it positively correlated with the -# `BonusMalus` variable. Our model is able to mostly correctly model -# this behaviour. +# `BonusMalus` variable. Our model is able to mostly correctly model this +# behaviour. # -# 3. Severity model - Gamma distribution -# --------------------------------------- +# Severity Model - Gamma distribution +# ------------------------------------ # The mean claim amount or severity (`AvgClaimAmount`) can be empirically # shown to follow approximately a Gamma distribution. We fit a GLM model for # the severity with the same features as the frequency model. @@ -343,7 +352,7 @@ def score_estimator( mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 -glm_sev = GammaRegressor() +glm_sev = GammaRegressor(alpha=10., max_iter=10000) glm_sev.fit( X_train[mask_train.values], @@ -351,7 +360,6 @@ def score_estimator( sample_weight=df_train.loc[mask_train, "ClaimNb"], ) - scores = score_estimator( glm_sev, X_train[mask_train.values], @@ -365,11 +373,13 @@ def score_estimator( ############################################################################## # -# Here, the scores for the test data call for caution as they are significantly -# worse than for the training data indicating an overfit. -# Note that the resulting model is the average claim amount per claim. As such, -# it is conditional on having at least one claim, and cannot be used to predict -# the average claim amount per policy in general. +# Here, the scores for the test data call for caution as they are +# significantly worse than for the training data indicating an overfit despite +# the strong regularization. +# +# Note that the resulting model is the average claim amount per claim. As +# such, it is conditional on having at least one claim, and cannot be used to +# predict the average claim amount per policy in general. print("Mean AvgClaim Amount per policy: %.2f " % df_train["AvgClaimAmount"].mean()) @@ -386,7 +396,6 @@ def score_estimator( fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 6)) -# plot DivAge plot_obs_pred( df=df_train.loc[mask_train], feature="DrivAge", @@ -416,79 +425,63 @@ def score_estimator( # Overall, the drivers age (``DrivAge``) has a weak impact on the claim # severity, both in observed and predicted data. # -# 4. 
Total claim amount -- Compound Poisson Gamma distribution -# ------------------------------------------------------------ +# Pure Premium Modeling via a Product of Frequency and Severity Models +# -------------------------------------------------------------------- +# As mentioned in the introduction, the total claim amount per unit of +# exposure can be modeled either as the product of the frequency model by the +# severity model. +# +# To quantify the aggregate performance of this product model, one can compute +# the deviance of Tweedie distribution which is equivalent to a com. +# In the following code sample, the ``score_estimator`` is extended to score +# such a model. # -# As mentioned in the introduction, the total claim amount can be modeled -# either as the product of the frequency model by the severity model, -# denormalized by exposure. In the following code sample, the -# ``score_estimator`` is extended to score such a model. The mean deviance is -# computed assuming a Tweedie distribution with ``power=2`` to be comparable -# with the model from the following section: - -eps = 1e-4 +# The mean deviance is computed assuming a Tweedie distribution with a fixed +# grid of values for the power parameter to be comparable with the model from +# the following section: + +tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999] scores = score_estimator( (glm_freq, glm_sev), X_train, X_test, df_train, df_test, - target="ClaimAmount", + target="PurePremium", weights="Exposure", - power=2-eps, + tweedie_powers=tweedie_powers, ) print(scores) ############################################################################## # +# Pure Premium Modeling Using a Single Compound Poisson Gamma Model +# ----------------------------------------------------------------- # Instead of taking the product of two independently fit models for frequency # and severity one can directly model the total loss is with a unique Compound # Poisson Gamma generalized linear model (with a log link function). This -# model is a special case of the Tweedie model with a power parameter :math:`p +# model is a special case of the Tweedie GLM with a "power" parameter :math:`p # \in (1, 2)`. # -# We determine the optimal hyperparameter ``p`` with a grid search so as to -# minimize the deviance: - -from sklearn.model_selection import GridSearchCV - -# exclude upper bound as power>=2 as p=2 would lead to an undefined unit -# deviance on data points with y=0. -params = {"power": np.linspace(1 + eps, 2 - eps, 5)} - -X_train_small, _, df_train_small, _ = train_test_split( - X_train, df_train, train_size=5000) - -# This can takes a while on the full training set, therefore we do the -# hyper-parameter search on a random subset, hoping that the best value of -# power does not depend too much on the dataset size. We use a bit -# penalization to avoid numerical issues with colinear features and speed-up -# convergence. -glm_total = TweedieRegressor(max_iter=10000, alpha=1e-2) -search = GridSearchCV( - glm_total, cv=3, - param_grid=params, n_jobs=-1, verbose=10, - refit=False, -) -search.fit( - X_train_small, df_train_small["ClaimAmount"], - sample_weight=df_train_small["Exposure"] -) -print("Best hyper-parameters: %s" % search.best_params_) +# Here we fix apriori the "power" parameter of the Tweedie model to some +# arbitrary value in the valid range. 
Ideally one would select this value via +# grid-search by minimizing the negative log-likelihood of the Tweedie model +# but unfortunately the current implementation does not allow for this (yet). -glm_total.set_params(**search.best_params_) -glm_total.fit(X_train, df_train["ClaimAmount"], - sample_weight=df_train["Exposure"]) +glm_pure_premium = TweedieRegressor(power=1.999, alpha=.1, max_iter=10000) +glm_pure_premium.fit(X_train, df_train["PurePremium"], + sample_weight=df_train["Exposure"]) scores = score_estimator( - glm_total, + glm_pure_premium, X_train, X_test, df_train, df_test, - target="ClaimAmount", + target="PurePremium", weights="Exposure", + tweedie_powers=tweedie_powers ) print(scores) @@ -500,23 +493,25 @@ def score_estimator( # # We can additionally validate these models by comparing observed and # predicted total claim amount over the test and train subsets. We see that, -# on average, the frequency-severity model underestimates the total claim -# amount, whereas the Tweedie model overestimates. +# on average, both model tend to underestimate the total claim (but this +# behavior depends on the amount of regularization). res = [] for subset_label, X, df in [ ("train", X_train, df_train), ("test", X_test, df_test), ]: + exposure = df["Exposure"].values res.append( { "subset": subset_label, "observed": df["ClaimAmount"].values.sum(), "predicted, frequency*severity model": np.sum( - df["Exposure"].values*glm_freq.predict(X)*glm_sev.predict(X) + exposure * glm_freq.predict(X) * glm_sev.predict(X) ), "predicted, tweedie, power=%.2f" - % glm_total.power: np.sum(glm_total.predict(X)), + % glm_pure_premium.power: np.sum( + exposure * glm_pure_premium.predict(X)), } ) @@ -525,62 +520,73 @@ def score_estimator( ############################################################################## # # Finally, we can compare the two models using a plot of cumulated claims: for -# each model, the policyholders are ranked from riskiest to safest and the -# actual cumulated claims are plotted against the cumulated exposure. +# each model, the policyholders are ranked from safest to riskiest and the +# fraction of observed total cumulated claims is plotted on the y axis. This +# plot is often called the ordered Lorenz curve of the model. # -# The area under the curve can be used as a model selection metric to quantify -# the ability of the model to rank policyholders. Note that this metric does -# not reflect the ability of the models to make accurate predictions in terms -# of absolute value of total claim amounts but only in terms of relative -# amounts as a ranking metric. +# The Gini coefficient (based on the area under the curve) can be used as a +# model selection metric to quantify the ability of the model to rank +# policyholders. Note that this metric does not reflect the ability of the +# models to make accurate predictions in terms of absolute value of total +# claim amounts but only in terms of relative amounts as a ranking metric. # # Both models are able to rank policyholders by risky-ness significantly # better than chance although they are also both far from perfect due to the # natural difficulty of the prediction problem from few features. +# +# Note that the Gini index only characterize the ranking performance of the +# model but not its calibration: any monotonic transformation of the +# predictions leaves the Gini index of the model unchanged. 
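#
# The following self-contained sketch on made-up toy data illustrates that
# last point: a strictly monotonic transformation of the predictions does not
# change their ranking, so the Lorenz curve and the resulting Gini index are
# unchanged. The helper below only mirrors the lorenz_curve/auc logic used in
# this example; the data are fabricated purely for illustration.

import numpy as np
from sklearn.metrics import auc


def _gini_from_ranking(y_true, y_pred, exposure):
    # rank policies by increasing predicted risk and accumulate the observed
    # claim amounts, exactly as in the Lorenz curve computation
    ranking = np.argsort(y_pred)
    cum_claims = np.cumsum(y_true[ranking] * exposure[ranking])
    cum_claims /= cum_claims[-1]
    cum_samples = np.linspace(0, 1, len(cum_claims))
    return 1 - 2 * auc(cum_samples, cum_claims)


rng = np.random.RandomState(0)
toy_true = rng.gamma(shape=1., scale=100., size=1000)   # toy pure premiums
toy_pred = toy_true * rng.uniform(.5, 1.5, size=1000)   # noisy predictions
toy_exposure = np.ones_like(toy_true)

gini = _gini_from_ranking(toy_true, toy_pred, toy_exposure)
gini_monotone = _gini_from_ranking(toy_true, np.log1p(2 * toy_pred),
                                   toy_exposure)
assert np.isclose(gini, gini_monotone)   # same ranking => same Gini index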
+# +# Finally on should highlight that the Compound Poisson Gamma model that +# is directly fit on the pure premium is operationally simpler to develop and +# maintain as it consists in a single scikit-learn estimator instead of a +# pair of models. + +def ordered_lorenz_curve(y_true, y_pred, exposure): + y_true, y_pred = np.asarray(y_true), np.asarray(y_pred) + exposure = np.asarray(exposure) -def _cumulated_claim_amount(y_true, y_pred, exposure): - ranking = np.argsort(y_pred)[::-1] # from riskiest to safest + # order samples by increasing predicted risk: + ranking = np.argsort(y_pred) ranked_exposure = exposure[ranking] - ranked_claim_amount = y_true[ranking] - cumulated_exposure = np.cumsum(ranked_exposure) - cumulated_exposure /= cumulated_exposure[-1] - cumulated_claim_amount = np.cumsum(ranked_claim_amount) + ranked_pure_premium = y_true[ranking] + cumulated_claim_amount = np.cumsum(ranked_pure_premium * ranked_exposure) cumulated_claim_amount /= cumulated_claim_amount[-1] - return cumulated_exposure, cumulated_claim_amount + cumulated_samples = np.linspace(0, 1, len(cumulated_claim_amount)) + return cumulated_samples, cumulated_claim_amount fig, ax = plt.subplots(figsize=(8, 8)) y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test) -y_pred_total = glm_total.predict(X_test) +y_pred_total = glm_pure_premium.predict(X_test) for label, y_pred in [("Frequency * Severity model", y_pred_product), ("Compound Poisson Gamma", y_pred_total)]: - cum_exposure, cum_claims = _cumulated_claim_amount( - df_test["ClaimAmount"].values, - y_pred, - df_test["Exposure"].values) - area = auc(cum_exposure, cum_claims) - label += " (area under curve: {:.3f})".format(area) - ax.plot(cum_exposure, cum_claims, linestyle="-", label=label) + ordered_samples, cum_claims = ordered_lorenz_curve( + df_test["PurePremium"], y_pred, df_test["Exposure"]) + gini = 1 - 2 * auc(ordered_samples, cum_claims) + label += " (Gini index: {:.3f})".format(gini) + ax.plot(ordered_samples, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test -cum_exposure, cum_claims = _cumulated_claim_amount( - df_test["ClaimAmount"].values, - df_test["ClaimAmount"].values, - df_test["Exposure"].values) -area = auc(cum_exposure, cum_claims) -label = "Oracle (area under curve: {:.3f})".format(area) -ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label) - -# Random Baseline +ordered_samples, cum_claims = ordered_lorenz_curve( + df_test["PurePremium"], df_test["PurePremium"], df_test["Exposure"]) +gini = 1 - 2 * auc(ordered_samples, cum_claims) +label = "Oracle (Gini index: {:.3f})".format(gini) +ax.plot(ordered_samples, cum_claims, linestyle="-.", color="gray", + label=label) + +# Random baseline ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline") ax.set( - title="Cumulated claim amount by model", - xlabel='Fraction of exposure (from riskiest to safest)', + title="Ordered Lorenz Curves", + xlabel=('Fraction of policyholds\n' + '(ordered by model from safest to riskiest)'), ylabel='Fraction of total claim amount' ) -ax.legend(loc="lower right") +ax.legend(loc="upper left") plt.plot() From 2d0b195bae7566d4a0ebd36211d869d1a86d703a Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 14 Oct 2019 18:38:49 +0200 Subject: [PATCH 207/269] s/ordered Lorenz/Lorenz/ --- .../plot_tweedie_regression_insurance_claims.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py 
b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 0dd0ed25f4a02..27e0449d84ce8 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -538,13 +538,13 @@ def score_estimator( # model but not its calibration: any monotonic transformation of the # predictions leaves the Gini index of the model unchanged. # -# Finally on should highlight that the Compound Poisson Gamma model that +# Finally one should highlight that the Compound Poisson Gamma model that # is directly fit on the pure premium is operationally simpler to develop and # maintain as it consists in a single scikit-learn estimator instead of a -# pair of models. +# pair of models, each with its own set of hyperparameters. -def ordered_lorenz_curve(y_true, y_pred, exposure): +def lorenz_curve(y_true, y_pred, exposure): y_true, y_pred = np.asarray(y_true), np.asarray(y_pred) exposure = np.asarray(exposure) @@ -565,14 +565,14 @@ def ordered_lorenz_curve(y_true, y_pred, exposure): for label, y_pred in [("Frequency * Severity model", y_pred_product), ("Compound Poisson Gamma", y_pred_total)]: - ordered_samples, cum_claims = ordered_lorenz_curve( + ordered_samples, cum_claims = lorenz_curve( df_test["PurePremium"], y_pred, df_test["Exposure"]) gini = 1 - 2 * auc(ordered_samples, cum_claims) label += " (Gini index: {:.3f})".format(gini) ax.plot(ordered_samples, cum_claims, linestyle="-", label=label) # Oracle model: y_pred == y_test -ordered_samples, cum_claims = ordered_lorenz_curve( +ordered_samples, cum_claims = lorenz_curve( df_test["PurePremium"], df_test["PurePremium"], df_test["Exposure"]) gini = 1 - 2 * auc(ordered_samples, cum_claims) label = "Oracle (Gini index: {:.3f})".format(gini) @@ -583,8 +583,8 @@ def ordered_lorenz_curve(y_true, y_pred, exposure): ax.plot([0, 1], [0, 1], linestyle="--", color="black", label="Random baseline") ax.set( - title="Ordered Lorenz Curves", - xlabel=('Fraction of policyholds\n' + title="Lorenz Curves", + xlabel=('Fraction of policyholders\n' '(ordered by model from safest to riskiest)'), ylabel='Fraction of total claim amount' ) From ea6a3e8bc57c7775cab5f89bd923fee062d2d94d Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 14 Oct 2019 19:02:46 +0200 Subject: [PATCH 208/269] More doc improvements to the pure premium example --- ...lot_tweedie_regression_insurance_claims.py | 165 +++++++++--------- 1 file changed, 83 insertions(+), 82 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 27e0449d84ce8..a95b8f0301663 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -3,17 +3,19 @@ Tweedie regression on insurance claims ====================================== -This example illustrates the use of Poisson, Gamma and Tweedie regression -on the French Motor Third-Party Liability Claims dataset, and is inspired -by an R tutorial [1]. +This example illustrates the use of Poisson, Gamma and Tweedie regression on +the French Motor Third-Party Liability Claims dataset, and is inspired by an R +tutorial [1]. Insurance claims data consist of the number of claims and the total claim amount. Often, the final goal is to predict the expected value, i.e. the mean, -of the total claim amount. 
There are several possibilities to do that, two of -which are: +of the total claim amount per exposure unit also referred to as the pure +premium. -1. Model the number of claims with a Poisson distribution, the average - claim amount per claim, also known as severity, as a Gamma distribution and +There are several possibilities to do that, two of which are: + +1. Model the number of claims with a Poisson distribution, the average claim + amount per claim, also known as severity, as a Gamma distribution and multiply the predictions of both in order to get the total claim amount. 2. Model total claim amount directly, typically with a Tweedie distribution of Tweedie power :math:`p \\in (1, 2)`. @@ -21,16 +23,16 @@ In this example we will illustrate both approaches. We start by defining a few helper functions for loading the data and visualizing results. - .. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor - Third-Party Liability Claims (November 8, 2018). - `doi:10.2139/ssrn.3164764 `_ + Third-Party Liability Claims (November 8, 2018). `doi:10.2139/ssrn.3164764 + `_ """ print(__doc__) # Authors: Christian Lorentzen # Roman Yurchak +# Olivier Grisel # License: BSD 3 clause from functools import partial @@ -128,6 +130,64 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, ) +def score_estimator( + estimator, X_train, X_test, df_train, df_test, target, weights, + tweedie_powers=None, +): + """Evaluate an estimator on train and test sets with different metrics""" + if isinstance(estimator, tuple): + model_name = " * ".join(e.__class__.__name__ for e in estimator) + else: + model_name = estimator.__class__.__name__ + print("\nEvaluation of {} of target {} ".format(model_name, target)) + + metrics = [ + ("D² explained", None), + ("mean abs. error", mean_absolute_error), + ("mean squared error", mean_squared_error), + ] + if tweedie_powers: + metrics += [( + "mean Tweedie deviance (p={:.4f})".format(power), + partial(mean_tweedie_deviance, power=power) + ) for power in tweedie_powers] + + res = [] + for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), + ]: + y, _weights = df[target], df[weights] + for score_label, metric in metrics: + if isinstance(estimator, tuple) and len(estimator) == 2: + # Score the model consisting of the product of frequency and + # severity models. 
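                # (Multiplying the two predictions implicitly assumes that,
                # given the features, claim frequency and claim severity are
                # independent, so that the expected total claim amount
                # factorizes into expected frequency times expected severity.)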
+ est_freq, est_sev = estimator + y_pred = est_freq.predict(X) * est_sev.predict(X) + else: + y_pred = estimator.predict(X) + + if metric is None: + if not hasattr(estimator, "score"): + continue + score = estimator.score(X, y, _weights) + else: + score = metric(y, y_pred, _weights) + + res.append( + {"subset": subset_label, "metric": score_label, "score": score} + ) + + res = ( + pd.DataFrame(res) + .set_index(["metric", "subset"]) + .score.unstack(-1) + .round(2) + .loc[:, ['train', 'test']] + ) + return res + + ############################################################################## # # Loading datasets, basic feature extraction and target definitions @@ -203,65 +263,6 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, glm_freq.fit(X_train, df_train["Frequency"], sample_weight=df_train["Exposure"]) - -def score_estimator( - estimator, X_train, X_test, df_train, df_test, target, weights, - tweedie_powers=None, -): - """Evaluate an estimator on train and test sets with different metrics""" - if isinstance(estimator, tuple): - model_name = " * ".join(e.__class__.__name__ for e in estimator) - else: - model_name = estimator.__class__.__name__ - print("\nEvaluation of {} of target {} ".format(model_name, target)) - - metrics = [ - ("D² explained", None), - ("mean abs. error", mean_absolute_error), - ("mean squared error", mean_squared_error), - ] - if tweedie_powers: - metrics += [( - "mean Tweedie deviance (p={:.4f})".format(power), - partial(mean_tweedie_deviance, power=power) - ) for power in tweedie_powers] - - res = [] - for subset_label, X, df in [ - ("train", X_train, df_train), - ("test", X_test, df_test), - ]: - y, _weights = df[target], df[weights] - for score_label, metric in metrics: - if isinstance(estimator, tuple) and len(estimator) == 2: - # Score the model consisting of the product of frequency and - # severity models. - est_freq, est_sev = estimator - y_pred = est_freq.predict(X) * est_sev.predict(X) - else: - y_pred = estimator.predict(X) - - if metric is None: - if not hasattr(estimator, "score"): - continue - score = estimator.score(X, y, _weights) - else: - score = metric(y, y_pred, _weights) - - res.append( - {"subset": subset_label, "metric": score_label, "score": score} - ) - - res = ( - pd.DataFrame(res) - .set_index(["metric", "subset"]) - .score.unstack(-1) - .round(2) - .loc[:, ['train', 'test']] - ) - return res - - scores = score_estimator( glm_freq, X_train, @@ -425,20 +426,21 @@ def score_estimator( # Overall, the drivers age (``DrivAge``) has a weak impact on the claim # severity, both in observed and predicted data. # -# Pure Premium Modeling via a Product of Frequency and Severity Models -# -------------------------------------------------------------------- +# Pure Premium Modeling via a Product of Frequency and Severity +# ------------------------------------------------------------- # As mentioned in the introduction, the total claim amount per unit of -# exposure can be modeled either as the product of the frequency model by the -# severity model. +# exposure can be modeled either as the product of the prediction of the +# frequency model by the prediction of the severity model. # # To quantify the aggregate performance of this product model, one can compute -# the deviance of Tweedie distribution which is equivalent to a com. -# In the following code sample, the ``score_estimator`` is extended to score -# such a model. 
+# the mean deviance of the train and test data assuming a Compound +# Poisson-Gamma distribution of the total claim amount. This is equivalent to +# a Tweedie distribution with "power" parameter between 1 and 2. # -# The mean deviance is computed assuming a Tweedie distribution with a fixed -# grid of values for the power parameter to be comparable with the model from -# the following section: +# As we do not know the true value of the "power" parameter, we compute the +# mean deviances for a grid of possible values of the "power" parameter, +# hoping that a good model for one value of "power" will stay a good model for +# another: tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999] scores = score_estimator( @@ -487,9 +489,8 @@ def score_estimator( ############################################################################## # -# In this example, the mean absolute error is lower for the Compound Poisson -# Gamma model than when using the product of the predictions of separate -# models for frequency and severity. +# In this example, both modeling approaches yield comparable performance +# metrics. # # We can additionally validate these models by comparing observed and # predicted total claim amount over the test and train subsets. We see that, From ddae396ddc636c65f9ff53be8ce61373e477f4b5 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 15 Oct 2019 16:20:47 -0400 Subject: [PATCH 209/269] doc update, simplification and fixes --- doc/modules/classes.rst | 5 +- doc/modules/linear_model.rst | 179 +++++++++++-------------------- sklearn/linear_model/_glm/glm.py | 96 ++++++----------- 3 files changed, 97 insertions(+), 183 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index bb62b47945e6e..adbd960dfbb6a 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -857,9 +857,8 @@ Any estimator using the Huber loss would also be robust to outliers, e.g. Generalized linear models (GLM) for regression ---------------------------------------------- -A generalization of linear models that allows for response variables to -have error distribution other than a normal distribution is implemented -in the following models, +These models allow for response variables to have error distribution other +than a normal distribution: .. autosummary:: :toctree: generated/ diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 3119b9b0db94b..b930a0d2a8106 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -907,99 +907,74 @@ Generalized Linear Models (GLM) extend linear models in two ways combination of the input variables :math:`X` via an inverse link function :math:`h` as -.. math:: \hat{y}(w, X) = h(x^\top w) = h(w_0 + w_1 X_1 + ... + w_p X_p). +.. math:: \hat{y}(w, X) = h(Xw). -Secondly, the squared loss function is replaced by the unit deviance :math:`d` -of a reproductive exponential dispersion model (EDM) [11]_. The minimization -problem becomes +Secondly, the squared loss function is replaced by the unit deviance +:math:`d` of a distribution in the exponential family (or more precisely, a +reproductive exponential dispersion model (EDM) [11]_). -.. math:: \min_{w} \frac{1}{2 \sum_i s_i} \sum_i s_i \cdot d(y_i, \hat{y}(w, X_i)) + \frac{\alpha}{2} ||w||_2 +The minimization problem becomes: -with sample weights :math:`s_i`, and L2 regularization penalty :math:`\alpha`. -The unit deviance is defined by the log of the :math:`\mathrm{EDM}(\mu, \phi)` -likelihood as +.. 
math:: \min_{w} \frac{1}{2 \cdot n\text{_samples}} \sum_i d(y_i, \hat{y}_i) + \frac{\alpha}{2} ||w||_2, -.. math:: d(y, \mu) = -2\phi\cdot - \left( \log p(y|\mu,\phi) - - \log p(y|y,\phi)\right). +where :math:`\alpha` is the L2 regularization penalty. When sample weights are +provided, the average becomes a weighted average. -The following table lists some specific EDM distributions—all are instances of Tweedie -distributions—and some of their properties. +The following table lists some specific EDMs and their unit deviance (all of +these are instances of the Tweedie family): -================= =============================== ====================================== ============================================ -Distribution Target Domain Unit Variance Function :math:`v(\mu)` Unit Deviance :math:`d(y, \mu)` -================= =============================== ====================================== ============================================ -Normal :math:`y \in (-\infty, \infty)` :math:`1` :math:`(y-\mu)^2` -Poisson :math:`y \in [0, \infty)` :math:`\mu` :math:`2(y\log\frac{y}{\mu}-y+\mu)` -Gamma :math:`y \in (0, \infty)` :math:`\mu^2` :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)` -Inverse Gaussian :math:`y \in (0, \infty)` :math:`\mu^3` :math:`\frac{(y-\mu)^2}{y\mu^2}` -================= =============================== ====================================== ============================================ +================= =============================== ============================================ +Distribution Target Domain Unit Deviance :math:`d(y, \hat{y})` +================= =============================== ============================================ +Normal :math:`y \in (-\infty, \infty)` :math:`(y-\hat{y})^2` +Poisson :math:`y \in [0, \infty)` :math:`2(y\log\frac{y}{\hat{y}}-y+\hat{y})` +Gamma :math:`y \in (0, \infty)` :math:`2(\log\frac{\hat{y}}{y}+\frac{y}{\hat{y}}-1)` +Inverse Gaussian :math:`y \in (0, \infty)` :math:`\frac{(y-\hat{y})^2}{y\hat{y}^2}` +================= =============================== ============================================ +The choice of the distribution depends on the problem at hand: -Usage ------ - -A GLM loss different from the classical squared loss might be appropriate in -the following cases: - - * If the target values :math:`y` are counts (non-negative integer valued) or - frequencies (non-negative), you might use a Poisson deviance with log-link. - - * If the target values are positive valued and skewed, you might try a - Gamma deviance with log-link. - - * If the target values seem to be heavier tailed than a Gamma distribution, - you might try an Inverse Gaussian deviance (or even higher variance powers - of the Tweedie family). - -Since the linear predictor :math:`x^\top w` can be negative and -Poisson, Gamma and Inverse Gaussian distributions don't support negative values, -it is convenient to apply a link function different from the identity link -:math:`h(x^\top w)=x^\top w` that guarantees the non-negativeness, e.g. the -log-link `link='log'` with :math:`h(x^\top w)=\exp(x^\top w)`. - -:class:`TweedieRegressor` implements a generalized linear model -for the Tweedie distribution, that allows to model any of the above mentioned -distributions using the appropriate ``power`` parameter, i.e. the exponent -of the unit variance function: - - - ``power = 0``: Normal distribution. Specialized solvers such as - :class:`Ridge`, :class:`ElasticNet` are generally - more appropriate in this case. 
+* If the target values :math:`y` are counts (non-negative integer valued) or + relative frequencies (non-negative), you might use a Poisson deviance + with log-link. +* If the target values are positive valued and skewed, you might try a + Gamma deviance with log-link. +* If the target values seem to be heavier tailed than a Gamma distribution, + you might try an Inverse Gaussian deviance (or even higher variance powers + of the Tweedie family). - - ``power = 1``: Poisson distribution. :class:`PoissonRegressor` is exposed for - convenience. However, it is strictly equivalent to - `TweedieRegressor(power=1)`. - - ``power = 2``: Gamma distribution. :class:`GammaRegressor` is exposed for - convenience. However, it is strictly equivalent to - `TweedieRegressor(power=2)`. +.. topic:: References: - - ``power = 3``: Inverse Gamma distribution. + .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, + Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. + .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models + and analysis of deviance. Monografias de matemática, no. 51. See also + `Exponential dispersion model. + `_ -.. note:: +Usage +----- - * The feature matrix `X` should be standardized before fitting. This - ensures that the penalty treats features equally. - * If you want to model a relative frequency, i.e. counts per exposure (time, - volume, ...) you can do so by a Poisson distribution and passing - :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values - together with :math:`s=\mathrm{exposure}` as sample weights. +:class:`TweedieRegressor` implements a generalized linear model for the +Tweedie distribution, that allows to model any of the above mentioned +distributions using the appropriate ``power`` parameter. In particular: - As an example, consider Poisson distributed counts z (integers) and - weights s=exposure (time, money, persons years, ...). Then you fit - y = z/s, i.e. ``PoissonRegressor.fit(X, y, sample_weight=s)``. - The weights are necessary for the right (finite sample) mean. - Considering :math:`\bar{y} = \frac{\sum_i s_i y_i}{\sum_i s_i}`, - in this case one might say that y has a 'scaled' Poisson distribution. - The same holds for other distributions. +- ``power = 0``: Normal distribution. Specific estimators such as + :class:`Ridge`, :class:`ElasticNet` are generally more appropriate in + this case. +- ``power = 1``: Poisson distribution. :class:`PoissonRegressor` is exposed + for convenience. However, it is strictly equivalent to + `TweedieRegressor(power=1, link='log')`. +- ``power = 2``: Gamma distribution. :class:`GammaRegressor` is exposed for + convenience. However, it is strictly equivalent to + `TweedieRegressor(power=2, link='log')`. +- ``power = 3``: Inverse Gaussian distribution. - * The fit itself does not need Y to be from an EDM, but only assumes - the first two moments to be :math:`E[Y_i]=\mu_i=h((Xw)_i)` and - :math:`Var[Y_i]=\frac{\phi}{s_i} v(\mu_i)`. +The link function is determined by the `link` parameter. 
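The equivalence between :class:`PoissonRegressor` (or :class:`GammaRegressor`)
and the corresponding ``TweedieRegressor`` can be checked empirically, for
instance with the following sketch on synthetic, strictly positive targets
(the data below are made up purely for illustration)::

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.linear_model import PoissonRegressor, TweedieRegressor

    X, y = make_regression(n_samples=50, n_features=3, random_state=0)
    y = np.exp((y - y.mean()) / y.std())   # make the targets positive

    poisson = PoissonRegressor(alpha=1., max_iter=1000).fit(X, y)
    tweedie = TweedieRegressor(power=1, link='log', alpha=1.,
                               max_iter=1000).fit(X, y)
    assert np.allclose(poisson.coef_, tweedie.coef_, rtol=1e-4)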
-The estimator can be used as follows:: +Usage example:: >>> from sklearn.linear_model import TweedieRegressor >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log') @@ -1016,49 +991,25 @@ The estimator can be used as follows:: * :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py` * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_regression_non_normal_loss.py` -Mathematical formulation +Practical considerations ------------------------ -In the unpenalized case, the assumptions are the following: - - * The target values :math:`y_i` are realizations of random variables - :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})` - with expectation :math:`\mu_i=\mathrm{E}[Y]`, dispersion parameter - :math:`\phi` and sample weights :math:`s_i`. - * The aim is to predict the expectation :math:`\mu_i` with - :math:`\hat{y}_i = h(\eta_i)`, linear predictor - :math:`\eta_i=(Xw)_i` and inverse link function :math:`h`. - -Note that the first assumption implies -:math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance -function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the -same as specifying a unit variance function (they are one-to-one). - -A few remarks: - -* The deviance is independent of :math:`\phi`. Therefore, also the estimation - of the coefficients :math:`w` is independent of the dispersion parameter of - the EDM. -* The minimization is equivalent to (penalized) maximum likelihood estimation. -* The deviances for at least Normal, Poisson and Gamma distributions are - strictly consistent scoring functions for the mean :math:`\mu`, see Eq. - (19)-(20) in [12]_. This means that, given an appropriate feature matrix `X`, - you get good (asymptotic) estimators for the expectation when using these - deviances. - +The feature matrix `X` should be standardized before fitting. This ensures +that the penalty treats features equally. -.. topic:: References: - - .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, - Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5. +Since the linear predictor :math:`Xw` can be negative and Poisson, +Gamma and Inverse Gaussian distributions don't support negative values, it +is necessary to apply an inverse link function that guarantees the +non-negativeness. For example with `link='log'`, the inverse link function +becomes :math:`h(Xw)=\exp(Xw)`. - .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models - and analysis of deviance. Monografias de matemática, no. 51. See also - `Exponential dispersion model. - `_ +If you want to model a relative frequency, i.e. counts per exposure (time, +volume, ...) you can do so by using a Poisson distribution and passing +:math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values +together with :math:`s=\mathrm{exposure}` as sample weights. For a concrete +example see e.g. +:ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py`. - .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts. 
- `_ Stochastic Gradient Descent - SGD ================================= diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index b29dcd89a35a6..557184fdd5c85 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -67,9 +67,9 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Parameters ---------- alpha : float, default=1 - Constant that multiplies the penalty terms and thus determines the - regularization strength. ``alpha = 0`` is equivalent to unpenalized - GLMs. In this case, the design matrix X must have full column rank + Constant that multiplies the penalty term and thus determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank (no collinearities). fit_intercept : bool, default=True @@ -81,15 +81,13 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The distributional assumption of the GLM, i.e. which distribution from the EDM, specifies the loss function to be minimized. - link : {'auto', 'identity', 'log'} or an instance of class BaseLink, \ - default='auto' + link : {'auto', 'identity', 'log'}, default='auto' The link function of the GLM, i.e. mapping from linear predictor - (X*coef) to expectation (y_pred). Option 'auto' sets the link - depending on the chosen family as follows: - - - 'identity' for family 'normal' + `Xw` to prediction `y_pred`. Option 'auto' sets the link depending + on the chosen family as follows: - - 'log' for families 'poisson', 'gamma', 'inverse-gaussian' + - 'identity' for Normal distribution + - 'log' for Poisson, Gamma and Inverse Gaussian distributions solver : 'lbfgs', default='lbfgs' Algorithm to use in the optimization problem: @@ -155,12 +153,7 @@ def fit(self, X, y, sample_weight=None): Target values. sample_weight : array-like of shape (n_samples,), default=None - Individual weights w_i for each sample. Note that for an - Exponential Dispersion Model (EDM), one has - Var[Y_i]=phi/w_i * v(y_pred). - If Y_i ~ EDM(y_pred, phi/w_i), then - sum(w*Y)/sum(w) ~ EDM(y_pred, phi/sum(w)), i.e. the mean of y is a - weighted average with weights=sample_weight. + Sample weights. Returns ------- @@ -400,25 +393,16 @@ def _more_tags(self): class PoissonRegressor(GeneralizedLinearRegressor): - """Regression with the response variable y following a Poisson distribution - - GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as y_pred=h(X*w). - The fit minimizes the following objective function with L2 regularization:: - - 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 - - with inverse link function h and s=sample_weight. Note that for - ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + """Generalized Linear Model with a Poisson distribution. Read more in the :ref:`User Guide `. Parameters ---------- alpha : float, default=1 - Constant that multiplies the penalty terms and thus determines the - regularization strength. ``alpha = 0`` is equivalent to unpenalized - GLMs. In this case, the design matrix X must have full column rank + Constant that multiplies the penalty term and thus determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank (no collinearities). 
fit_intercept : bool, default=True @@ -479,25 +463,16 @@ def family(self, value): class GammaRegressor(GeneralizedLinearRegressor): - """Regression with the response variable y following a Gamma distribution - - GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as y_pred=h(X*w). - The fit minimizes the following objective function with L2 regularization:: - - 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 - - with inverse link function h and s=sample_weight. Note that for - ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + """Generalized Linear Model with a Gamma distribution. Read more in the :ref:`User Guide `. Parameters ---------- alpha : float, default=1 - Constant that multiplies the penalty terms and thus determines the - regularization strength. ``alpha = 0`` is equivalent to unpenalized - GLMs. In this case, the design matrix X must have full column rank + Constant that multiplies the penalty term and thus determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank (no collinearities). fit_intercept : bool, default=True @@ -558,30 +533,18 @@ def family(self, value): class TweedieRegressor(GeneralizedLinearRegressor): - r"""Regression with the response variable y following a Tweedie distribution - - GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at - fitting and predicting the mean of the target y as y_pred=h(X*w). - The fit minimizes the following objective function with L2 regularization:: - - 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 + """Generalized Linear Model with a Tweedie distribution. - with inverse link function h and s=sample_weight. Note that for - ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + This estimator can be used to model different GLMs depending on the + ``power`` parameter, which determines the underlying distribution. Read more in the :ref:`User Guide `. Parameters ---------- power : float, default=0 - The power determines the underlying target distribution. By - definition it links distribution variance (:math:`v`) and - mean (:math:`\y_\textrm{pred}`): - :math:`v(\y_\textrm{pred}) = \y_\textrm{pred}^{power}`. - - For ``0 < power < 1``, no distribution exists. - - Special cases are: + The power determines the underlying target distribution according + to the following table: +-------+------------------------+ | Power | Distribution | @@ -597,20 +560,21 @@ class TweedieRegressor(GeneralizedLinearRegressor): | 3 | Inverse Gaussian | +-------+------------------------+ + For ``0 < power < 1``, no distribution exists. + alpha : float, default=1 - Constant that multiplies the penalty terms and thus determines the - regularization strength. ``alpha = 0`` is equivalent to unpenalized - GLMs. In this case, the design matrix X must have full column rank + Constant that multiplies the penalty term and thus determines the + regularization strength. ``alpha = 0`` is equivalent to unpenalized + GLMs. In this case, the design matrix `X` must have full column rank (no collinearities). link : {'auto', 'identity', 'log'}, default='auto' The link function of the GLM, i.e. mapping from linear predictor - (X*coef) to expectation (y_pred). Option 'auto' sets the link - depending on the chosen family as follows: + `Xw` to prediction `y_pred`. 
Option 'auto' sets the link depending + on the chosen family as follows: - 'identity' for Normal distribution - - - 'log' for Poisson, Gamma or Inverse Gaussian distributions + - 'log' for Poisson, Gamma and Inverse Gaussian distributions fit_intercept : bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be From 21572d93e07e2352e2b9e25fc044d4f03df607a9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 15 Oct 2019 16:23:22 -0400 Subject: [PATCH 210/269] put back doc for BaseLink removed by mistake --- sklearn/linear_model/_glm/glm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 557184fdd5c85..9b17f1814a497 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -81,7 +81,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): The distributional assumption of the GLM, i.e. which distribution from the EDM, specifies the loss function to be minimized. - link : {'auto', 'identity', 'log'}, default='auto' + link : {'auto', 'identity', 'log'} or an instance of class BaseLink, \ + default='auto' The link function of the GLM, i.e. mapping from linear predictor `Xw` to prediction `y_pred`. Option 'auto' sets the link depending on the chosen family as follows: From d7ff6f44f4b5e477e6887850c4fcd9a5f80ea043 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 17 Oct 2019 11:10:42 +0200 Subject: [PATCH 211/269] TST GLM/Ridge comparison with sample_weight (xfail for now) --- sklearn/linear_model/_glm/tests/test_glm.py | 39 ++++++++++++++------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index c0ff6508db9c9..cf6681f56acde 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -257,37 +257,52 @@ def test_warm_start(fit_intercept): @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) @pytest.mark.parametrize('fit_intercept', [True, False]) -def test_normal_ridge_comparison(n_samples, n_features, fit_intercept): +@pytest.mark.parametrize('sample_weight', [None, pytest.mark.xfail('rand')]) +def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, + sample_weight, request): """Compare with Ridge regression for Normal distributions.""" - alpha = 1.0 test_size = 10 X, y = make_regression(n_samples=n_samples + test_size, n_features=n_features, n_informative=n_features-2, noise=0.5, random_state=42) - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=test_size, random_state=0 - ) if n_samples > n_features: ridge_params = {"solver": "svd"} else: - ridge_params = {"solver": "sag", "max_iter": 10000, "tol": 1e-9} + ridge_params = {"solver": "saga", "max_iter": 1000000, "tol": 1e-9} + + X_train, X_test, y_train, y_test, = train_test_split( + X, y, test_size=test_size, random_state=0 + ) + + if sample_weight is None: + alpha = 1.0 + sw_train = None + else: + sw_train = np.random.RandomState(0).rand(len(y_train)) + alpha = 0.0 + sw_train /= sw_train.sum() + request.applymarker(pytest.mark.xfail( + run=False, reason=('TODO: GLM / Ridge comparison with ' + 'sample_weight should be fixed'))) # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 ridge = Ridge(alpha=alpha*n_samples, normalize=False, - random_state=42, **ridge_params) - ridge.fit(X_train, y_train) + random_state=42, fit_intercept=fit_intercept, + **ridge_params) + 
ridge.fit(X_train, y_train, sample_weight=sw_train) glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', - link='identity', fit_intercept=True, + link='identity', + fit_intercept=fit_intercept, max_iter=300) - glm.fit(X_train, y_train) + glm.fit(X_train, y_train, sample_weight=sw_train) assert glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, atol=5e-5) assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5) - assert_allclose(glm.predict(X_train), ridge.predict(X_train), rtol=5e-5) - assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=5e-5) + assert_allclose(glm.predict(X_train), ridge.predict(X_train), rtol=2e-4) + assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=2e-4) def test_poisson_glmnet(): From 939d24012088df5d26693a06a8dc991c3af0d4fd Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 17 Oct 2019 11:36:17 +0200 Subject: [PATCH 212/269] TST More invariance checks for sample_weight --- sklearn/linear_model/_glm/tests/test_glm.py | 28 ++++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index cf6681f56acde..e526db5002ca2 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -175,16 +175,20 @@ def test_glm_identity_regression(): assert_allclose(glm.coef_, coef, rtol=1e-6) -def test_glm_sample_weight_consistentcy(): +@pytest.mark.parametrize('fit_intercept', [False, True]) +@pytest.mark.parametrize('alpha', [0.0, 1.0]) +@pytest.mark.parametrize('family', ['normal', 'poisson', 'gamma']) +def test_glm_sample_weight_consistentcy(fit_intercept, alpha, family): """Test that the impact of sample_weight is consistent""" rng = np.random.RandomState(0) n_samples, n_features = 10, 5 X = rng.rand(n_samples, n_features) y = rng.rand(n_samples) - glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', - fit_intercept=False) - glm.fit(X, y) + glm_params = dict(alpha=alpha, family=family, link='auto', + fit_intercept=fit_intercept) + + glm = GeneralizedLinearRegressor(**glm_params).fit(X, y) coef = glm.coef_.copy() # sample_weight=np.ones(..) 
should be equivalent to sample_weight=None @@ -206,6 +210,22 @@ def test_glm_sample_weight_consistentcy(): glm.fit(X[:-1], y[:-1]) assert_allclose(glm.coef_, coef1, rtol=1e-6) + # check that multiplying sample_weight by 2 is equivalent + # to repeating correspoding samples twice + X2 = np.concatenate([X, X[:n_samples//2]], axis=0) + y2 = np.concatenate([y, y[:n_samples//2]]) + sample_weight_1 = np.ones(len(y)) + sample_weight_1[:n_samples//2] = 2 + + glm1 = GeneralizedLinearRegressor(**glm_params).fit( + X, y, sample_weight=sample_weight_1 + ) + + glm2 = GeneralizedLinearRegressor(**glm_params).fit( + X2, y2, sample_weight=None + ) + assert_allclose(glm1.coef_, glm2.coef_) + @pytest.mark.parametrize( 'family', From da0d2a6f7fc372f7ae6b3daf359e8d1a4000a57a Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 17 Oct 2019 12:06:59 +0200 Subject: [PATCH 213/269] Remove copy_X parameter --- sklearn/linear_model/_glm/glm.py | 34 +++++---------------- sklearn/linear_model/_glm/tests/test_glm.py | 10 ------ 2 files changed, 8 insertions(+), 36 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 9b17f1814a497..9135cd2392952 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -109,9 +109,6 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): If set to ``True``, reuse the solution of the previous call to ``fit`` as initialization for ``coef_`` and ``intercept_``. - copy_X : bool, default=True - If ``True``, X will be copied; else, it may be overwritten. - verbose : int, default=0 For the lbfgs solver set verbose to any positive number for verbosity. @@ -130,7 +127,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): def __init__(self, *, alpha=1.0, fit_intercept=True, family='normal', link='auto', solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False, - copy_X=True, verbose=0): + verbose=0): self.alpha = alpha self.fit_intercept = fit_intercept self.family = family @@ -139,7 +136,6 @@ def __init__(self, *, alpha=1.0, self.max_iter = max_iter self.tol = tol self.warm_start = warm_start - self.copy_X = copy_X self.verbose = verbose def fit(self, X, y, sample_weight=None): @@ -219,16 +215,13 @@ def fit(self, X, y, sample_weight=None): if not isinstance(self.warm_start, bool): raise ValueError("The argument warm_start must be bool;" " got {0}".format(self.warm_start)) - if not isinstance(self.copy_X, bool): - raise ValueError("The argument copy_X must be bool;" - " got {0}".format(self.copy_X)) family = self._family_instance link = self._link_instance X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'], dtype=[np.float64, np.float32], - y_numeric=True, multi_output=False, copy=self.copy_X) + y_numeric=True, multi_output=False) weights = _check_sample_weight(sample_weight, X) @@ -423,9 +416,6 @@ class PoissonRegressor(GeneralizedLinearRegressor): If set to ``True``, reuse the solution of the previous call to ``fit`` as initialization for ``coef_`` and ``intercept_`` . - copy_X : bool, default=True - If ``True``, X will be copied; else, it may be overwritten. - verbose : int, default=0 For the lbfgs solver set verbose to any positive number for verbosity. @@ -442,12 +432,11 @@ class PoissonRegressor(GeneralizedLinearRegressor): Actual number of iterations used in the solver. 
""" def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, - tol=1e-4, warm_start=False, copy_X=True, verbose=0): + tol=1e-4, warm_start=False, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="poisson", link='log', max_iter=max_iter, - tol=tol, warm_start=warm_start, copy_X=copy_X, - verbose=verbose) + tol=tol, warm_start=warm_start, verbose=verbose) @property def family(self): @@ -493,9 +482,6 @@ class GammaRegressor(GeneralizedLinearRegressor): If set to ``True``, reuse the solution of the previous call to ``fit`` as initialization for ``coef_`` and ``intercept_`` . - copy_X : bool, default=True - If ``True``, X will be copied; else, it may be overwritten. - verbose : int, default=0 For the lbfgs solver set verbose to any positive number for verbosity. @@ -512,12 +498,11 @@ class GammaRegressor(GeneralizedLinearRegressor): Actual number of iterations used in the solver. """ def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, - tol=1e-4, warm_start=False, copy_X=True, verbose=0): + tol=1e-4, warm_start=False, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family="gamma", link='log', max_iter=max_iter, - tol=tol, warm_start=warm_start, copy_X=copy_X, - verbose=verbose) + tol=tol, warm_start=warm_start, verbose=verbose) @property def family(self): @@ -594,9 +579,6 @@ class TweedieRegressor(GeneralizedLinearRegressor): If set to ``True``, reuse the solution of the previous call to ``fit`` as initialization for ``coef_`` and ``intercept_`` . - copy_X : bool, default=True - If ``True``, X will be copied; else, it may be overwritten. - verbose : int, default=0 For the lbfgs solver set verbose to any positive number for verbosity. @@ -614,12 +596,12 @@ class TweedieRegressor(GeneralizedLinearRegressor): """ def __init__(self, *, power=0.0, alpha=1.0, fit_intercept=True, link='auto', max_iter=100, tol=1e-4, - warm_start=False, copy_X=True, verbose=0): + warm_start=False, verbose=0): super().__init__(alpha=alpha, fit_intercept=fit_intercept, family=TweedieDistribution(power=power), link=link, max_iter=max_iter, tol=tol, - warm_start=warm_start, copy_X=copy_X, verbose=verbose) + warm_start=warm_start, verbose=verbose) @property def family(self): diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index e526db5002ca2..5bec1fb6f493a 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -154,16 +154,6 @@ def test_glm_warm_start_argument(warm_start): glm.fit(X, y) -@pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) -def test_glm_copy_X_argument(copy_X): - """Test GLM for invalid copy_X arguments.""" - y = np.array([1, 2]) - X = np.array([[1], [1]]) - glm = GeneralizedLinearRegressor(copy_X=copy_X) - with pytest.raises(ValueError, match="copy_X must be bool"): - glm.fit(X, y) - - def test_glm_identity_regression(): """Test GLM regression with identity link on a simple dataset.""" coef = [1., 2.] 
From 162fb3b23c5132eeca7d0fb020a9b00d84ad1352 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 17 Oct 2019 20:08:23 +0200 Subject: [PATCH 214/269] Minor doc improvements --- doc/modules/linear_model.rst | 4 ++-- .../plot_poisson_regression_non_normal_loss.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index b930a0d2a8106..d9cbce4eebe8b 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -915,7 +915,7 @@ reproductive exponential dispersion model (EDM) [11]_). The minimization problem becomes: -.. math:: \min_{w} \frac{1}{2 \cdot n\text{_samples}} \sum_i d(y_i, \hat{y}_i) + \frac{\alpha}{2} ||w||_2, +.. math:: \min_{w} \frac{1}{2 n_{\text{samples}}} \sum_i d(y_i, \hat{y}_i) + \frac{\alpha}{2} ||w||_2, where :math:`\alpha` is the L2 regularization penalty. When sample weights are provided, the average becomes a weighted average. @@ -988,8 +988,8 @@ Usage example:: .. topic:: Examples: - * :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py` * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_regression_non_normal_loss.py` + * :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py` Practical considerations ------------------------ diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 0e948873da570..e84a60eb3d8ee 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -80,10 +80,10 @@ def load_mtpl2(n_samples=100000): # # The remaining columns can be used to predict the frequency of claim events. # Those columns are very heterogeneous with a mix of categorical and numeric -# variables with different scales, possibly with heavy tails. +# variables with different scales, possibly very uneven distributed. # # In order to fit linear models with those predictors it is therefore -# necessary to perform standard feature transformation as follows: +# necessary to perform standard feature transformations as follows: log_scale_transformer = make_pipeline( FunctionTransformer(np.log, validate=False), @@ -128,8 +128,8 @@ def load_mtpl2(n_samples=100000): ############################################################################## # -# It worth noting that 92 % of policyholders have zero claims, and if we were -# to convert this problem into a binary classification task, it would be +# It is worth noting that 92 % of policyholders have zero claims, and if we +# were to convert this problem into a binary classification task, it would be # significantly imbalanced. # # To evaluate the pertinence of the used metrics, we will consider as a From cafc92f0b55307d3dc0f0d9dedefcad1bf73d482 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 17 Oct 2019 20:13:59 +0200 Subject: [PATCH 215/269] DOC consistent coef_=w for OMP in user guide --- doc/modules/linear_model.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d9cbce4eebe8b..7b4e3ca9d6750 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -525,13 +525,13 @@ orthogonal matching pursuit can approximate the optimum solution vector with a fixed number of non-zero elements: .. 
math:: - \underset{\gamma}{\operatorname{arg\,min\,}} ||y - X\gamma||_2^2 \text{ subject to } ||\gamma||_0 \leq n_{\text{nonzero\_coefs}} + \underset{w}{\operatorname{arg\,min\,}} ||y - Xw||_2^2 \text{ subject to } ||w||_0 \leq n_{\text{nonzero\_coefs}} Alternatively, orthogonal matching pursuit can target a specific error instead of a specific number of non-zero coefficients. This can be expressed as: .. math:: - \underset{\gamma}{\operatorname{arg\,min\,}} ||\gamma||_0 \text{ subject to } ||y-X\gamma||_2^2 \leq \text{tol} + \underset{w}{\operatorname{arg\,min\,}} ||w||_0 \text{ subject to } ||y-Xw||_2^2 \leq \text{tol} OMP is based on a greedy algorithm that includes at each step the atom most From d3235db48a063ba01c5dafa543cba2d0f42b252c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 17 Oct 2019 20:31:31 +0200 Subject: [PATCH 216/269] EXA typos --- .../plot_tweedie_regression_insurance_claims.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index a95b8f0301663..606de6b2df609 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -235,7 +235,7 @@ def score_estimator( # in their portfolio: df["PurePremium"] = df["ClaimAmount"] / df["Exposure"] -# This can be inderectly approximated by a 2-step modeling the product of the +# This can be indirectly approximated by a 2-step modeling: the product of the # Frequency times the average claim amount per claim: df["Frequency"] = df["ClaimNb"] / df["Exposure"] df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1) @@ -258,7 +258,7 @@ def score_estimator( # The parameters of the model are estimated by minimizing the Poisson deviance # on the training set via a quasi-Newton solver: l-BFGS. Some of the features -# are colinear, we use a weak penalization to avoid numerical issues. +# are collinear, we use a weak penalization to avoid numerical issues. glm_freq = PoissonRegressor(alpha=1e-3) glm_freq.fit(X_train, df_train["Frequency"], sample_weight=df_train["Exposure"]) @@ -461,14 +461,14 @@ def score_estimator( # Pure Premium Modeling Using a Single Compound Poisson Gamma Model # ----------------------------------------------------------------- # Instead of taking the product of two independently fit models for frequency -# and severity one can directly model the total loss is with a unique Compound +# and severity one can directly model the total loss with a unique Compound # Poisson Gamma generalized linear model (with a log link function). This # model is a special case of the Tweedie GLM with a "power" parameter :math:`p # \in (1, 2)`. # # Here we fix apriori the "power" parameter of the Tweedie model to some # arbitrary value in the valid range. Ideally one would select this value via -# grid-search by minimizing the negative log-likelihood of the Tweedie model +# grid-search by minimizing the negative log-likelihood of the Tweedie model, # but unfortunately the current implementation does not allow for this (yet). 
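#
# One pragmatic workaround (shown here only as a sketch, reusing the
# ``X_train``/``X_test`` and ``df_train``/``df_test`` variables defined
# earlier in this example and an arbitrary list of candidate values) is to
# refit the model for a few values of ``power`` and compare them all with the
# same, fixed-power mean Tweedie deviance on the held-out data:

from sklearn.metrics import mean_tweedie_deviance

for candidate_power in [1.5, 1.7, 1.9]:
    candidate = TweedieRegressor(power=candidate_power, alpha=.1,
                                 max_iter=10000)
    candidate.fit(X_train, df_train["PurePremium"],
                  sample_weight=df_train["Exposure"])
    deviance = mean_tweedie_deviance(
        df_test["PurePremium"], candidate.predict(X_test),
        sample_weight=df_test["Exposure"], power=1.9,
    )
    print("power=%.1f -> mean Tweedie deviance (p=1.9): %.2f"
          % (candidate_power, deviance))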
glm_pure_premium = TweedieRegressor(power=1.999, alpha=.1, max_iter=10000) From 9401230d387870a00cf50148aa74b43eccd059ad Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 17 Oct 2019 20:38:42 +0200 Subject: [PATCH 217/269] EXA set less extreme Tweedie power=1.9 --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 606de6b2df609..02e528e98b130 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -471,7 +471,7 @@ def score_estimator( # grid-search by minimizing the negative log-likelihood of the Tweedie model, # but unfortunately the current implementation does not allow for this (yet). -glm_pure_premium = TweedieRegressor(power=1.999, alpha=.1, max_iter=10000) +glm_pure_premium = TweedieRegressor(power=1.9, alpha=.1, max_iter=10000) glm_pure_premium.fit(X_train, df_train["PurePremium"], sample_weight=df_train["Exposure"]) From 5bc48e520135ea8ce4ef2a0be156a28cd8d33ce8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 17 Oct 2019 21:35:09 +0200 Subject: [PATCH 218/269] TST fix normal_ridge_comparison with sample_weight --- sklearn/linear_model/_glm/tests/test_glm.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 5bec1fb6f493a..074c8c90a4898 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -267,7 +267,7 @@ def test_warm_start(fit_intercept): @pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)]) @pytest.mark.parametrize('fit_intercept', [True, False]) -@pytest.mark.parametrize('sample_weight', [None, pytest.mark.xfail('rand')]) +@pytest.mark.parametrize('sample_weight', [None, True]) def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, sample_weight, request): """Compare with Ridge regression for Normal distributions.""" @@ -280,33 +280,31 @@ def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, if n_samples > n_features: ridge_params = {"solver": "svd"} else: - ridge_params = {"solver": "saga", "max_iter": 1000000, "tol": 1e-9} + ridge_params = {"solver": "saga", "max_iter": 1000000, "tol": 1e-7} X_train, X_test, y_train, y_test, = train_test_split( X, y, test_size=test_size, random_state=0 ) + alpha = 1.0 if sample_weight is None: - alpha = 1.0 sw_train = None + alpha_ridge = alpha * n_samples else: sw_train = np.random.RandomState(0).rand(len(y_train)) - alpha = 0.0 - sw_train /= sw_train.sum() - request.applymarker(pytest.mark.xfail( - run=False, reason=('TODO: GLM / Ridge comparison with ' - 'sample_weight should be fixed'))) + alpha_ridge = alpha * sw_train.sum() # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2 - ridge = Ridge(alpha=alpha*n_samples, normalize=False, + ridge = Ridge(alpha=alpha_ridge, normalize=False, random_state=42, fit_intercept=fit_intercept, **ridge_params) ridge.fit(X_train, y_train, sample_weight=sw_train) - glm = GeneralizedLinearRegressor(alpha=1.0, family='normal', + glm = GeneralizedLinearRegressor(alpha=alpha, family='normal', link='identity', fit_intercept=fit_intercept, - max_iter=300) + max_iter=300, + tol=1e-5) glm.fit(X_train, y_train, sample_weight=sw_train) assert 
glm.coef_.shape == (X.shape[1], ) assert_allclose(glm.coef_, ridge.coef_, atol=5e-5) From e572c3100508ca74abb097d73f36795e404715fd Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 20 Oct 2019 18:38:46 +0200 Subject: [PATCH 219/269] DOC advice against cv for Tweedie power in UG --- doc/modules/linear_model.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 7b4e3ca9d6750..4fc65cf668045 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1010,6 +1010,10 @@ together with :math:`s=\mathrm{exposure}` as sample weights. For a concrete example see e.g. :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py`. +The `power` parameter of `TweedieRegressor` does not qualify to be optimized +by cross-validation, because it defines the very scoring criteria itself, i.e. +:meth:`TweedieRegressor.score`. Without the full likelihood of Tweedie +distributions at hand, it is suggested to choose it's value a priori. Stochastic Gradient Descent - SGD ================================= From 8d70042c09539af11d4c6283f1b46584473d7bbd Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 20 Oct 2019 19:19:56 +0200 Subject: [PATCH 220/269] DOC improve advice for power cv --- doc/modules/linear_model.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 4fc65cf668045..fe5f5966016bd 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1010,10 +1010,10 @@ together with :math:`s=\mathrm{exposure}` as sample weights. For a concrete example see e.g. :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py`. -The `power` parameter of `TweedieRegressor` does not qualify to be optimized -by cross-validation, because it defines the very scoring criteria itself, i.e. -:meth:`TweedieRegressor.score`. Without the full likelihood of Tweedie -distributions at hand, it is suggested to choose it's value a priori. +When performing cross-validation for the `power` parameter of +`TweedieRegressor`, it is advisable to specify an explicit `scoring` function, +because the default scorer :meth:`TweedieRegressor.score` is a function of +`power` itself. 
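(Illustrative aside, not part of the patch series: the user-guide advice above can be put into practice by fixing the evaluation metric while searching over `power`. The sketch below assumes the `TweedieRegressor` and `mean_tweedie_deviance` APIs introduced by this PR and uses a small synthetic dataset; the candidate powers and hyperparameters are placeholders.)

import numpy as np
from sklearn.linear_model import TweedieRegressor
from sklearn.metrics import make_scorer, mean_tweedie_deviance
from sklearn.model_selection import GridSearchCV

# Synthetic positive-valued target, stand-in for e.g. a pure premium.
rng = np.random.RandomState(0)
X = rng.uniform(size=(100, 3))
y = rng.gamma(shape=2.0, scale=1.0, size=100)

# Fix the comparison metric at a single Tweedie power so that all candidate
# values of `power` are scored on the same scale, instead of the
# power-dependent default TweedieRegressor.score.
fixed_power_scorer = make_scorer(mean_tweedie_deviance, power=1.9,
                                 greater_is_better=False)

search = GridSearchCV(
    TweedieRegressor(alpha=0.1, max_iter=10000),
    param_grid={"power": [1.5, 1.7, 1.8, 1.9, 1.99]},
    scoring=fixed_power_scorer,
)
search.fit(X, y)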
Stochastic Gradient Descent - SGD ================================= From c47988993f939b28949e1cbd73946c4d3b426a4c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 10 Nov 2019 20:28:06 +0100 Subject: [PATCH 221/269] EXA rely on default of FunctionTransformer Co-Authored-By: Guillaume Lemaitre --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 02e528e98b130..ade6912efd80d 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -211,7 +211,7 @@ def score_estimator( df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000) log_scale_transformer = make_pipeline( - FunctionTransformer(np.log, validate=False), + FunctionTransformer(func=np.log), StandardScaler() ) From 88d150e77e56175dad823663804f7418e91fd055 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 10 Nov 2019 21:19:49 +0100 Subject: [PATCH 222/269] EXA print all columns of DataFrame with max_columns option --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index ade6912efd80d..ab2ca2e8bfc28 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -240,7 +240,8 @@ def score_estimator( df["Frequency"] = df["ClaimNb"] / df["Exposure"] df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1) -print(df[df.ClaimAmount > 0].head()) +with pd.option_context("display.max_columns", 15): + print(df[df.ClaimAmount > 0].head()) ############################################################################## # From 0d31f47a1a4b644e281e4002437c4a54234ed88e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 10 Nov 2019 21:27:11 +0100 Subject: [PATCH 223/269] EXA improve wording Poisson target --- .../plot_tweedie_regression_insurance_claims.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index ab2ca2e8bfc28..1edcebcd3609a 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -248,12 +248,12 @@ def score_estimator( # Frequency model -- Poisson distribution # --------------------------------------- # -# The number of claims (``ClaimNb``) is a positive integer that can be modeled -# as a Poisson distribution. It is then assumed to be the number of discrete -# events occuring with a constant rate in a given time interval -# (``Exposure``, in units of years). Here we model the frequency -# ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, -# and use ``Exposure`` as `sample_weight`. +# The number of claims (``ClaimNb``) is a positive integer (0 included). +# Thus, this target can be modelled by a Poisson distribution. +# It is then assumed to be the number of discrete events occuring with a +# constant rate in a given time interval (``Exposure``, in units of years). 
+# Here we model the frequency ``y = ClaimNb / Exposure``, which is still a +# (scaled) Poisson distribution, and use ``Exposure`` as `sample_weight`. df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0) From 3ab2877685301d9c3ccc42fab314fd455eceb332 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 10 Nov 2019 21:40:51 +0100 Subject: [PATCH 224/269] EXA increase digits in scores --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 1edcebcd3609a..9570573caaaa9 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -182,7 +182,7 @@ def score_estimator( pd.DataFrame(res) .set_index(["metric", "subset"]) .score.unstack(-1) - .round(2) + .round(4) .loc[:, ['train', 'test']] ) return res From 87de01bcdfdc3ba63779c2f958d37356112e4064 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 10 Nov 2019 21:42:01 +0100 Subject: [PATCH 225/269] EXA convergence issue solve with max_iter --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 9570573caaaa9..8b8cbe143ddb9 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -260,7 +260,7 @@ def score_estimator( # The parameters of the model are estimated by minimizing the Poisson deviance # on the training set via a quasi-Newton solver: l-BFGS. Some of the features # are collinear, we use a weak penalization to avoid numerical issues. -glm_freq = PoissonRegressor(alpha=1e-3) +glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400) glm_freq.fit(X_train, df_train["Frequency"], sample_weight=df_train["Exposure"]) From fbc22d86523373ce2940084c07d50d1760eab8ca Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 29 Jan 2020 23:19:17 +0100 Subject: [PATCH 226/269] Update sklearn/metrics/_regression.py Co-Authored-By: Nicolas Hug --- sklearn/metrics/_regression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py index 6ba0331ccd9d4..91bf350395fa2 100644 --- a/sklearn/metrics/_regression.py +++ b/sklearn/metrics/_regression.py @@ -761,7 +761,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None): Gamma deviance is equivalent to the Tweedie deviance with the power parameter `power=2`. It is invariant to scaling of - the target variable, and mesures relative errors. + the target variable, and measures relative errors. Read more in the :ref:`User Guide `. From 27ae4a285e70081738e53bd5907745a388156a7c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Wed, 29 Jan 2020 23:35:22 +0100 Subject: [PATCH 227/269] DOC Add what's new entry --- doc/whats_new/v0.23.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 45219b8346b35..0a70b65809e35 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -120,6 +120,13 @@ Changelog :mod:`sklearn.linear_model` ........................... 
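(Illustrative aside, not part of the patch series: the `mean_gamma_deviance` docstring touched above states that the Gamma deviance equals the Tweedie deviance with `power=2`, is invariant to a rescaling of the target, and measures relative errors. A minimal check of both properties, assuming the metrics API introduced by this PR:)

import numpy as np
from sklearn.metrics import mean_gamma_deviance, mean_tweedie_deviance

y_true = np.array([2.0, 0.5, 1.0, 4.0])
y_pred = np.array([2.5, 0.4, 1.2, 3.5])

d = mean_gamma_deviance(y_true, y_pred)
# Equivalent to the Tweedie deviance with power=2 ...
assert np.isclose(d, mean_tweedie_deviance(y_true, y_pred, power=2))
# ... and unchanged when y_true and y_pred are scaled by the same factor,
# i.e. only relative errors matter.
assert np.isclose(d, mean_gamma_deviance(100 * y_true, 100 * y_pred))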
+- |MajorFeature| Added generalized linear models (GLM) with non normal error + distributions, including :class:`linear_model.PoissonRegressor`, + :class:`linear_model.GammaRegressor` and :class:`linear_model.TweedieRegressor` + which use Poisson, Gamma and Tweedie distributions respectively. + :pr:`14300` by :user:`Christian Lorentzen `, `Roman Yurchak`_, + `Olivier Grisel`_ and `Nicolas Hug`_. + - |Fix| Fixed a bug where if a `sample_weight` parameter was passed to the fit method of :class:`linear_model.RANSACRegressor`, it would not be passed to the wrapped `base_estimator` during the fitting of the final model. From 51be7e193e7b7891c13ee70ad65d48a0a6cbf3ee Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 30 Jan 2020 14:19:06 -0500 Subject: [PATCH 228/269] Removed myself from what's new --- doc/whats_new/v0.23.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 0a70b65809e35..ee24943e5f210 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -125,7 +125,7 @@ Changelog :class:`linear_model.GammaRegressor` and :class:`linear_model.TweedieRegressor` which use Poisson, Gamma and Tweedie distributions respectively. :pr:`14300` by :user:`Christian Lorentzen `, `Roman Yurchak`_, - `Olivier Grisel`_ and `Nicolas Hug`_. + and `Olivier Grisel`_. - |Fix| Fixed a bug where if a `sample_weight` parameter was passed to the fit method of :class:`linear_model.RANSACRegressor`, it would not be passed to From 25f0e531ceaaaf55d9124c127304d6c578acc72e Mon Sep 17 00:00:00 2001 From: Thomas J Fan Date: Tue, 4 Feb 2020 11:39:26 -0500 Subject: [PATCH 229/269] CLN Minor link clean up --- .../plot_poisson_regression_non_normal_loss.py | 8 ++++---- .../plot_tweedie_regression_insurance_claims.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index e84a60eb3d8ee..4c8d49ed01463 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -4,7 +4,7 @@ ====================================== This example illustrates the use of log-linear Poisson regression -on the French Motor Third-Party Liability Claims dataset [1] and compares +on the French Motor Third-Party Liability Claims dataset [1]_ and compares it with models learned with least squared error. The goal is to predict the expected number of insurance claims (or frequency) following car accidents for a policyholder given historical data over a population of policyholders. @@ -47,9 +47,9 @@ def load_mtpl2(n_samples=100000): Parameters ---------- - n_samples: int, default=100000 - number of samples to select (for faster run time). Full dataset has - 678013 samples. + n_samples: int or None, default=100000 + number of samples to select (for faster run time). If None, the full + dataset has with 678013 samples is returned. 
""" # freMTPL2freq dataset from https://www.openml.org/d/41214 diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 8b8cbe143ddb9..7a01ebe1ea112 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -5,7 +5,7 @@ This example illustrates the use of Poisson, Gamma and Tweedie regression on the French Motor Third-Party Liability Claims dataset, and is inspired by an R -tutorial [1]. +tutorial [1]_. Insurance claims data consist of the number of claims and the total claim amount. Often, the final goal is to predict the expected value, i.e. the mean, From 5eddf9ced305678f2d00ff6ce55cdde8e0bf0bbe Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 5 Feb 2020 08:11:12 +0100 Subject: [PATCH 230/269] CLN one word too much --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 4c8d49ed01463..7ddebacc4ec71 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -49,7 +49,7 @@ def load_mtpl2(n_samples=100000): ---------- n_samples: int or None, default=100000 number of samples to select (for faster run time). If None, the full - dataset has with 678013 samples is returned. + dataset with 678013 samples is returned. """ # freMTPL2freq dataset from https://www.openml.org/d/41214 From c700acfe787d7e50d588d87b428223ca1daa5dbc Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 28 Feb 2020 15:23:05 -0500 Subject: [PATCH 231/269] minor typo --- doc/modules/classes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 2fb8a9f25bd83..c138f51f6c06f 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -840,7 +840,7 @@ Any estimator using the Huber loss would also be robust to outliers, e.g. Generalized linear models (GLM) for regression ---------------------------------------------- -These models allow for response variables to have error distribution other +These models allow for response variables to have error distributions other than a normal distribution: .. autosummary:: From a126f4aa3aad97c79de88912ad369332e3ba7394 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 28 Feb 2020 15:23:45 -0500 Subject: [PATCH 232/269] remove unused symbol --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index d16806446fd7c..fd17227bbadb1 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -1037,7 +1037,7 @@ becomes :math:`h(Xw)=\exp(Xw)`. If you want to model a relative frequency, i.e. counts per exposure (time, volume, ...) you can do so by using a Poisson distribution and passing :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values -together with :math:`s=\mathrm{exposure}` as sample weights. For a concrete +together with :math:`\mathrm{exposure}` as sample weights. For a concrete example see e.g. :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py`. 
From 82c44830625bdb4ac32b0f06c05d44056db72b4c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 28 Feb 2020 15:23:58 -0500 Subject: [PATCH 233/269] use j instead of i to index features --- sklearn/linear_model/_glm/glm.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 9135cd2392952..76735678a8f0b 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -101,8 +101,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): tol : float, default=1e-4 Stopping criterion. For the lbfgs solver, - the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` - where ``g_i`` is the i-th component of the gradient (derivative) of + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of the objective function. warm_start : bool, default=False @@ -408,8 +408,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): tol : float, default=1e-4 Stopping criterion. For the lbfgs solver, - the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` - where ``g_i`` is the i-th component of the gradient (derivative) of + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of the objective function. warm_start : bool, default=False @@ -474,8 +474,8 @@ class GammaRegressor(GeneralizedLinearRegressor): tol : float, default=1e-4 Stopping criterion. For the lbfgs solver, - the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` - where ``g_i`` is the i-th component of the gradient (derivative) of + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of the objective function. warm_start : bool, default=False @@ -571,8 +571,8 @@ class TweedieRegressor(GeneralizedLinearRegressor): tol : float, default=1e-4 Stopping criterion. For the lbfgs solver, - the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` - where ``g_i`` is the i-th component of the gradient (derivative) of + the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol`` + where ``g_j`` is the j-th component of the gradient (derivative) of the objective function. 
warm_start : bool, default=False From d3083db043674c8f6fe938a7d9312119134ff007 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 28 Feb 2020 15:25:54 -0500 Subject: [PATCH 234/269] removed redundant variable --- sklearn/linear_model/_glm/glm.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 76735678a8f0b..a8df8931961db 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -263,12 +263,12 @@ def func(coef, X, y, weights, alpha, family, link): coef, X, y, weights, family, link ) dev = family.deviance(y, y_pred, weights) - intercept = (coef.size == X.shape[1] + 1) - idx = 1 if intercept else 0 # offset if coef[0] is intercept - coef_scaled = alpha * coef[idx:] - obj = 0.5 * dev + 0.5 * (coef[idx:] @ coef_scaled) + # offset if coef[0] is intercept + offset = 1 if self.fit_intercept else 0 + coef_scaled = alpha * coef[offset:] + obj = 0.5 * dev + 0.5 * (coef[offset:] @ coef_scaled) objp = 0.5 * devp - objp[idx:] += coef_scaled + objp[offset:] += coef_scaled return obj, objp args = (X, y, weights, self.alpha, family, link) From f63f795270b15837aae873d38412fa6dbf183f16 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 28 Feb 2020 17:13:47 -0500 Subject: [PATCH 235/269] removed unused var --- sklearn/linear_model/_glm/tests/test_glm.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 074c8c90a4898..9b9bf08aebd5b 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -237,10 +237,9 @@ def test_glm_log_regression(family): @pytest.mark.parametrize('fit_intercept', [True, False]) def test_warm_start(fit_intercept): n_samples, n_features = 110, 10 - X, y, coef = make_regression(n_samples=n_samples, - n_features=n_features, - n_informative=n_features-2, noise=0.5, - coef=True, random_state=42) + X, y = make_regression(n_samples=n_samples, n_features=n_features, + n_informative=n_features-2, noise=0.5, + random_state=42) glm1 = GeneralizedLinearRegressor( warm_start=False, From 45de11068f083970516a793a0a2154d70662469b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 28 Feb 2020 17:20:45 -0500 Subject: [PATCH 236/269] Added comment and basic test to family attribute --- sklearn/linear_model/_glm/tests/test_glm.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 9b9bf08aebd5b..69fd25021d079 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -346,6 +346,8 @@ def test_convergence_warning(regression_data): def test_poisson_regression_family(regression_data): + # Make sure the family attribute is read-only to prevent searching over it + # e.g. in a grid search est = PoissonRegressor() est.family == "poisson" @@ -355,6 +357,8 @@ def test_poisson_regression_family(regression_data): def test_gamma_regression_family(regression_data): + # Make sure the family attribute is read-only to prevent searching over it + # e.g. 
in a grid search est = GammaRegressor() est.family == "gamma" @@ -364,10 +368,21 @@ def test_gamma_regression_family(regression_data): def test_tweedie_regression_family(regression_data): + # Make sure the family attribute is always a TweedieDistribution and that + # the power attribute is properly updated power = 2.0 est = TweedieRegressor(power=power) assert isinstance(est.family, TweedieDistribution) assert est.family.power == power + assert est.power == power + + new_power = 0 + new_family = TweedieDistribution(power=new_power) + est.family = new_family + assert isinstance(est.family, TweedieDistribution) + assert est.family.power == new_power + assert est.power == new_power + msg = "TweedieRegressor.family must be of type TweedieDistribution!" with pytest.raises(TypeError, match=msg): est.family = None From b8459b0540c1af4d239bc149155e12d83816b724 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 28 Feb 2020 17:26:11 -0500 Subject: [PATCH 237/269] Added test for link='auto' --- sklearn/linear_model/_glm/tests/test_glm.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 69fd25021d079..1d1d0c55ceeae 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -92,6 +92,20 @@ def test_glm_link_argument(name, instance): glm.fit(X, y) +@pytest.mark.parametrize('family, expected_link_class', [ + ('normal', IdentityLink), + ('poisson', LogLink), + ('gamma', LogLink), + ('inverse-gaussian', LogLink), +]) +def test_glm_link_auto(family, expected_link_class): + # Make sure link='auto' delivers the expected link function + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family=family, link='auto').fit(X, y) + assert isinstance(glm._link_instance, expected_link_class) + + @pytest.mark.parametrize('alpha', ['not a number', -4.2]) def test_glm_alpha_argument(alpha): """Test GLM for invalid alpha argument.""" From 581b4d7492d39b8662fa2a8ec31960ecf545eed2 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 28 Feb 2020 17:56:16 -0500 Subject: [PATCH 238/269] more comment about OneHotEncoder vs OrdinalEncoder for forests --- .../plot_poisson_regression_non_normal_loss.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 7ddebacc4ec71..d51c4a36c9322 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -80,7 +80,7 @@ def load_mtpl2(n_samples=100000): # # The remaining columns can be used to predict the frequency of claim events. # Those columns are very heterogeneous with a mix of categorical and numeric -# variables with different scales, possibly very uneven distributed. +# variables with different scales, possibly very unevenly distributed. # # In order to fit linear models with those predictors it is therefore # necessary to perform standard feature transformations as follows: @@ -214,11 +214,14 @@ def score_estimator(estimator, df_test): ############################################################################## # # Finally, we will consider a non-linear model, namely a random forest. 
Random -# forests do not require the categorical data to be one-hot encoded, instead -# we encode each category label with an arbitrary integer using -# :class:`preprocessing.OrdinalEncoder` to make the model faster to train (the -# same information is encoded with a smaller number of features than with -# one-hot encoding). +# forests do not require the categorical data to be one-hot encoded: instead, +# we can encode each category label with an arbitrary integer using +# :class:`preprocessing.OrdinalEncoder`. With this encoding, the forest will +# treat the categorical features as ordered features, which might not be always +# a desired behavior. However this effect is limited for deep enough trees +# which are able to recover the categorical nature of the features. The main +# advantage of the :class:`preprocessing.OrdinalEncoder` over the +# :class:`preprocessing.OneHotEncoder` is that it will make training faster. rf_preprocessor = ColumnTransformer( [ From bb75435964600a6dc92342472634415c2473e550 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 28 Feb 2020 18:45:42 -0500 Subject: [PATCH 239/269] minor typos or formulations --- .../plot_poisson_regression_non_normal_loss.py | 6 +++--- .../plot_tweedie_regression_insurance_claims.py | 13 +++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index d51c4a36c9322..70b4f36d014e3 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -289,8 +289,8 @@ def score_estimator(estimator, df_test): ############################################################################## # # The experimental data presents a long tail distribution for ``y``. In all -# models we predict the mean expected value, so we will have necessarily fewer -# extreme values. Additionally, normal distribution used in ``Ridge`` and +# models we predict a mean expected value, so we will have necessarily fewer +# extreme values. Additionally, the normal distribution used in ``Ridge`` and # ``RandomForestRegressor`` has a constant variance, while for the Poisson # distribution used in ``PoissonRegressor``, the variance is proportional to # the mean predicted value. @@ -390,7 +390,7 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, # the safest. In this case, the model evaluation would cast the problem as a # ranking problem rather than a regression problem. # -# To compare the 3 models under this light on, one can plot the fraction of +# To compare the 3 models within this perspective, one can plot the fraction of # the number of claims vs the fraction of exposure for test samples ordered by # the model predictions, from riskiest to safest according to each model: diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 7a01ebe1ea112..9154aa50e361d 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -14,11 +14,12 @@ There are several possibilities to do that, two of which are: -1. Model the number of claims with a Poisson distribution, the average claim - amount per claim, also known as severity, as a Gamma distribution and - multiply the predictions of both in order to get the total claim amount. -2. 
Model total claim amount directly, typically with a Tweedie distribution of - Tweedie power :math:`p \\in (1, 2)`. +1. Model the number of claims with a Poisson distribution, and the average + claim amount per claim, also known as severity, as a Gamma distribution + and multiply the predictions of both in order to get the total claim + amount. +2. Model the total claim amount per exposure directly, typically with a Tweedie + distribution of Tweedie power :math:`p \\in (1, 2)`. In this example we will illustrate both approaches. We start by defining a few helper functions for loading the data and visualizing results. @@ -430,7 +431,7 @@ def score_estimator( # Pure Premium Modeling via a Product of Frequency and Severity # ------------------------------------------------------------- # As mentioned in the introduction, the total claim amount per unit of -# exposure can be modeled either as the product of the prediction of the +# exposure can be modeled as the product of the prediction of the # frequency model by the prediction of the severity model. # # To quantify the aggregate performance of this product model, one can compute From 94dfc00a5179ddf63bdbd254280b693acfb0cc49 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 29 Feb 2020 09:33:29 +0100 Subject: [PATCH 240/269] Remove unused ExponentialDispersionModel.unit_variance_derivative --- sklearn/_loss/glm_distribution.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py index 920218ea7f674..cb20fda1c022d 100644 --- a/sklearn/_loss/glm_distribution.py +++ b/sklearn/_loss/glm_distribution.py @@ -40,7 +40,6 @@ class ExponentialDispersionModel(metaclass=ABCMeta): unit_deviance unit_deviance_derivative unit_variance - unit_variance_derivative References ---------- @@ -88,18 +87,6 @@ def unit_variance(self, y_pred): Predicted mean. """ - @abstractmethod - def unit_variance_derivative(self, y_pred): - r"""Compute the derivative of the unit variance w.r.t. y_pred. - - Return :math:`v'(y_\textrm{pred})`. - - Parameters - ---------- - y_pred : array of shape (n_samples,) - Target values. - """ - @abstractmethod def unit_deviance(self, y, y_pred, check_input=False): r"""Compute the unit deviance. @@ -258,17 +245,6 @@ def unit_variance(self, y_pred): """ return np.power(y_pred, self.power) - def unit_variance_derivative(self, y_pred): - """Compute the derivative of the unit variance of a Tweedie - distribution v(y_pred)=power*y_pred**(power-1). - - Parameters - ---------- - y_pred : array of shape (n_samples,) - Predicted mean. - """ - return self.power * np.power(y_pred, self.power - 1) - def unit_deviance(self, y, y_pred, check_input=False): r"""Compute the unit deviance. 
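(Illustrative aside, not part of the patch series: the two modelling routes discussed in the example above can be sketched as follows. The PoissonRegressor and TweedieRegressor settings mirror those shown in earlier patches; the GammaRegressor hyperparameters and the data names `X_train`, `X_test`, `df_train`, `mask` are placeholders, so the fit/predict calls are left commented.)

from sklearn.linear_model import PoissonRegressor, GammaRegressor, TweedieRegressor

# Route 1: product of a frequency model (Poisson) and a severity model (Gamma).
glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400)
glm_sev = GammaRegressor(alpha=1.0)  # placeholder hyperparameters
# glm_freq.fit(X_train, df_train["Frequency"],
#              sample_weight=df_train["Exposure"])
# glm_sev.fit(X_train[mask], df_train.loc[mask, "AvgClaimAmount"],
#             sample_weight=df_train.loc[mask, "ClaimNb"])  # mask: rows with claims
# pure_premium_product = glm_freq.predict(X_test) * glm_sev.predict(X_test)

# Route 2: model the pure premium per unit of exposure directly with a
# compound Poisson-Gamma (Tweedie) distribution, 1 < power < 2.
glm_pure_premium = TweedieRegressor(power=1.9, alpha=.1, max_iter=10000)
# glm_pure_premium.fit(X_train, df_train["PurePremium"],
#                      sample_weight=df_train["Exposure"])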
From 0810bf31d467f31e301ccd5b4e84a222b5e0d41c Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 29 Feb 2020 11:07:07 +0100 Subject: [PATCH 241/269] DOC more detailed description of the dataset used in examples --- .../plot_poisson_regression_non_normal_loss.py | 4 +++- .../plot_tweedie_regression_insurance_claims.py | 11 ++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 70b4f36d014e3..d923afbc70891 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -4,10 +4,12 @@ ====================================== This example illustrates the use of log-linear Poisson regression -on the French Motor Third-Party Liability Claims dataset [1]_ and compares +on the `French Motor Third-Party Liability Claims dataset +`_ [1]_ and compares it with models learned with least squared error. The goal is to predict the expected number of insurance claims (or frequency) following car accidents for a policyholder given historical data over a population of policyholders. +Available features include driver age, vehicle age, vehicle power, etc. .. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor Third-Party Liability Claims (November 8, 2018). diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 9154aa50e361d..f9898e8b73542 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -4,13 +4,14 @@ ====================================== This example illustrates the use of Poisson, Gamma and Tweedie regression on -the French Motor Third-Party Liability Claims dataset, and is inspired by an R -tutorial [1]_. +the `French Motor Third-Party Liability Claims dataset +`_, and is inspired by an R tutorial [1]_. Insurance claims data consist of the number of claims and the total claim -amount. Often, the final goal is to predict the expected value, i.e. the mean, -of the total claim amount per exposure unit also referred to as the pure -premium. +amount, together with policyholder features such as driver age, vehicle age, +vehicle power, etc. Often, the final goal is to predict the expected value, +i.e. the mean, of the total claim amount per exposure unit also referred to as +the pure premium. 
There are several possibilities to do that, two of which are: From a90a0aadb59830c338cb915ac61fdb03d5152450 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 29 Feb 2020 11:17:38 +0100 Subject: [PATCH 242/269] EXA fix minor typo --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index f9898e8b73542..655ca578184fb 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -337,7 +337,7 @@ def score_estimator( ############################################################################## # # According to the observed data, the frequency of accidents is higher for -# drivers younger than 30 years old, and it positively correlated with the +# drivers younger than 30 years old, and is positively correlated with the # `BonusMalus` variable. Our model is able to mostly correctly model this # behaviour. # From f74ab9604b3f1c2e4810df11a63145545ec47acd Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 29 Feb 2020 12:27:37 +0100 Subject: [PATCH 243/269] EXA compare metric by metric --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 655ca578184fb..98d515a4f9418 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -443,7 +443,8 @@ def score_estimator( # As we do not know the true value of the "power" parameter, we compute the # mean deviances for a grid of possible values of the "power" parameter, # hoping that a good model for one value of "power" will stay a good model for -# another: +# another. Here, every value of "power" defines a separate metric and models +# are to be compared metric by metric: tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999] scores = score_estimator( From a349be7e3d4ad3fca2151f55c3c60476dc147b51 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 29 Feb 2020 12:38:05 +0100 Subject: [PATCH 244/269] Add examples of use-cases --- doc/modules/linear_model.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index fd17227bbadb1..1721a67d7e6c5 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -975,6 +975,19 @@ The choice of the distribution depends on the problem at hand: of the Tweedie family). +Examples of use cases include: + +* Agriculture / weather modeling: number of rain events per year (Poisson), + amount of rainfall per event (Gamma), total rainfall per year (Tweedie / + Compound Poisson Gamma). +* Risk modeling / insurance policy pricing: number of claim events / + policyholder per year (Poisson), cost per event (Gamma), total cost per + policyholder per year (Tweedie / Compound Poisson Gamma). +* Predictive maintenance: number of production interruption event per year: + Poisson, duration of interruption: Gamma, total interruption time per year + (Tweedie / Compound Poisson Gamma). + + .. topic:: References: .. [10] McCullagh, Peter; Nelder, John (1989). 
Generalized Linear Models, From 497a76c46bf5a92ef995f5f3cd810c7b4b108233 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sat, 29 Feb 2020 13:07:04 +0100 Subject: [PATCH 245/269] Add figure with Poisson, Gamma and Tweedie distributions --- .../poisson_gamma_tweedie_distributions.png | Bin 0 -> 38253 bytes doc/modules/linear_model.rst | 10 ++++++++++ 2 files changed, 10 insertions(+) create mode 100644 doc/modules/glm_data/poisson_gamma_tweedie_distributions.png diff --git a/doc/modules/glm_data/poisson_gamma_tweedie_distributions.png b/doc/modules/glm_data/poisson_gamma_tweedie_distributions.png new file mode 100644 index 0000000000000000000000000000000000000000..b4cbc187ada9b28b4c971e96cd1ae83bf5f2c6ce GIT binary patch
[binary image data omitted: doc/modules/glm_data/poisson_gamma_tweedie_distributions.png, literal 38253 bytes]
From: Christian Lorentzen
Date: Sat, 29 Feb 2020 13:32:09 +0100
Subject: [PATCH 246/269] DOC fix minor typo

---
 doc/modules/linear_model.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst
index 103a305faa0c4..9ddf0cbf46971 100644
--- a/doc/modules/linear_model.rst
+++ b/doc/modules/linear_model.rst
@@ -993,7 +993,7 @@ Examples of use cases include:
 * Risk modeling / insurance policy pricing: number of claim events /
   policyholder per year (Poisson), cost per event (Gamma), total cost per
   policyholder per year (Tweedie / Compound Poisson Gamma).
-* Predictive maintenance: number of production interruption event per year:
+* Predictive maintenance: number of production interruption events per year:
   Poisson, duration of interruption: Gamma, total interruption time per year
   (Tweedie / Compound Poisson Gamma).
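The hunk above spells out the modeling pattern behind these use cases: event counts per exposure unit are fit with a Poisson deviance, per-event severities with a Gamma deviance, and totals with a Tweedie / Compound Poisson Gamma deviance. As a minimal illustration only (not part of this patch series), the sketch below fits frequency and severity models on purely synthetic data, using the estimator names PoissonRegressor and GammaRegressor under which this work eventually shipped in scikit-learn 0.23:

    # Illustrative sketch, not part of the patch; assumes scikit-learn >= 0.23,
    # where this PR's estimators are exposed as PoissonRegressor / GammaRegressor.
    import numpy as np
    from sklearn.linear_model import PoissonRegressor, GammaRegressor

    rng = np.random.RandomState(0)
    X = rng.normal(size=(1000, 3))                     # synthetic policyholder features
    claims_per_year = rng.poisson(lam=1.0, size=1000)  # count target -> Poisson deviance
    cost_per_event = rng.gamma(shape=2.0, scale=100.0, size=1000)  # positive target -> Gamma deviance

    freq_model = PoissonRegressor(alpha=1e-4).fit(X, claims_per_year)
    sev_model = GammaRegressor(alpha=1e-4).fit(X, cost_per_event)
    print(freq_model.coef_, sev_model.coef_)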
From bda7ad6867b661fb26c5cb4a496222c5a37c1b90 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Sat, 29 Feb 2020 17:24:26 +0100
Subject: [PATCH 247/269] DOC add point mass to plot

---
 .../poisson_gamma_tweedie_distributions.png | Bin 38253 -> 38430 bytes
 doc/modules/linear_model.rst                |   5 ++++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/doc/modules/glm_data/poisson_gamma_tweedie_distributions.png b/doc/modules/glm_data/poisson_gamma_tweedie_distributions.png
index b4cbc187ada9b28b4c971e96cd1ae83bf5f2c6ce..cfc8fef2ae40c2e422e939ff067b53b94dd08f8f 100644
GIT binary patch
literal 38430
zhVt9b*wHxZ-glMx3;hrM3c|Br+_3GonF9yBf$fNwlVNZ9r4r;3i1s6y(q7uZu)C{< zPo8f?y2}UX(l&|u zTm|Vp?uP7G&lW;-9lGsW&25D z#+LkR#Lrt(Yegzn4eDug@`I)?%BJW28->mJ(2+gY(D$E^{hW-ugau*7}$=Fy>FN3HF;CO7}29^B)Id8*=D(SqId#J zEA{kVSlO!;r*jrl)=9z(76CSbbF=xt%cV>=C!s)82R)LmUdD@UR|hkLJ?Jahy9K^^ zBv60KN3?tEmRAXO_0YZZw^wUHKunba(&ggm!4XQMG$Y(Qi?HZXxs?~(#-Xi?Cw=WaWc=7=4cf{f)YuT)nfm_5G zyoTkjV#2ZYF3*3VFvpP*5vmoblOaeJR1N*=!wp;j#<)p;f40f6>5l$ly|Nw@nn!EM zsQ`RqgQFV6r-UjxclPW=+O(n2DTYe+2-uM{cAI+tiS%@TUjPiPVc)CNPfSIsn1E5UFt%rP+mF3fp;A

uYea6)Rn$XMzfLKm@C(b^8sX7FDV?# zIR(5B)&$ht3{Uf?DKW~bsHm7soF*bCAGnBkP3j!B_iS3Uz<}Jzs<7>Gc&s zri9#r_dj%j#B#8M?}r|Guj!lMzbX!laWa;P)XJ1Cj&G$;*j{_a8nsN-79JdbQiwxP zFngf3qUQIMq5o#;lN|mxu2M!liW21fIrm5WW4eS!x=L0-kvawmqAt<$XY+)2!Sy;B zK`RQp=uBihn2mJ2Oc|;GY|rPCf&J%JCcUi+hy*IZw;N2YjI|L^)d>o&d1)Ff&E0s))`577!=tCxh^+kPQal&^)&v#hperwpKEKs zPg9mge0M>;(9uzT{hG_FGcMLcBEkLp=dU0&&704SN>~LG^y#{lC7@8t;f1BcAqvI! zQ&WbnuC9dcdj#gN<>h5ER2EWc+K`o-tIfq2j)Yo%BoU-e&>RtcG&HgNnI)_%N1!pC zyk{}ErX_{fSHu35~N0`@-UZT2ls5l*&!hY#*5tq`j2HgwSl>EUeDF!`3v_4 zQ|u94=C`tM63x{#*|}N1Vek0ufLn+Uh&v4V8l@VhjBbc^(x+ahUsd3hJcjGP>* zOkTX8{*$j|Ggf5KmnOpVj`|25JSJ-njvE(+3f-k%=l%Ka9%Psp|2{CxfxeX$$GC|n zia8F-!LFRPN3<+wd*p2k-@}`8rG37cgQPu+O2ieoFAbGZURYb zgKN0*IKLCAbL##8U5{_+$4sx&Y5Uf-t)0AOrR3|}DT>4Cmj@=HbQ|8Yrtau9K0-0A zt*!kbB}K&;As+Otz8=9S0t4;Nzz2SxHxjXW<@CD9;4ZMsD=TH5JEr{gyN|gw(%V`> zN$W1Iz=dBy*wx^#+=LRQa}jVyXZ~IRn$~}Yg7C`4(UNOy{wRJu@*&AKoP$R6_3PIU z2^f%nM+`6BoC_a_?teS7Ty*92}5D)R)p(Hc&P+3yvwS?T$7ABp5cE-o68 zlqR)#Aws6kxL?xXzo5{B)++sTB$}nA?*x_G^55{(&q7f?1!q0!qITbZ{vWF<{ zR2h@u)X2D2O;nVd8yw3d{_Az`E#Qv&wQjt}GjOd84^dIkN1~z$2?>fC8szcvR}$l9 zf&O!TsrrvNae1tQL00bM_W{?AxwQa*BGj@z<-RDTI$_>8X-%Zt+<++E?Gy z7q;tmwh#RG_V#81&bJ;IRz!3s^1xR<=?T04l~%TX^M+U2bK80#OSUgtK08gsM%W20 z!(X#Zb4$Q zUZRJGUp`Y)Q*tzF^z-gr`t|D)7B}Exd{YAC~dQ%n%_Jz@vab`jGjVRgiI{ zps-M=oR!{;hT9*u{)UFNM}6;^A3pqPS`!j(8qN0r)kdS z*r&aN(W>_UxtFr!50L$ANvH&r^I6|r-y-oFLN9=~+wT8^hYu;A9{VB6bokh#R3DME zkP?Yi^@0CDm?32<*Dk@(Mzq+)mj3w@l`?1TujqwAv6p!0cM8{5jK2mb{qd;8_ujpC&t6}Hz~Q@>|?cN&UJ z>ck#XcSOXzpvoo#cCi>wos;`tM1S_&Civ!UF6@;FM@bf$HL(U26clLsR-`{PuKuvO zxk)iyWL)#{<42mP=;){F>ek&KxqwJ08W*Cx=i(w*ag)N4OI9-Lui-TubIVPEBO=aH)ocd=gYn>3|DB1lkazFi zC1U{KtiRarwTSQu>gEr_&*)Z-wK-0hvQ^=L%;x>uE+9T{XMTFz4klMjy61bqRC<26 zyA5A)T5`T2ST2j0Yjd{D6dP>pokGrqrD08l)^YV{Prhv7XPqPbs7eD^p>!qN@)IBf_-kGc|SgBi^Qr8 zB0Kn0OUKNh97g{8H~K@>HFDTb3OhEQ!tTn*n!_=u*8sc#upU}pFO$q~(j$93^JnAo ziD7m&Q5%E{p;N=* zEnw#_j;fj*CUp%Bdr+Cuk@4}EqiH9bq0h?dSu$ySCG{VFyvjcs?Odm&mSo%CArLFD zAmX``{@uy`ufqg{MsZ;ioIzyoigF+nEdPqE3!Xo;V)?Kx&x*H6wgga}GZ{+yQ$BXb zp9L1ShL%oC^pU)PL)4%BEd=<3+j+w=+SjRWI$0nvADdrMku*F!%-TM#b#Qq)_(6y& z1-V=M%v@NM=T6r%9XXmv`UT@KIm!vvgZkmXhZC{L@V+7%eF{Ih7=ReAV`a3H&Nac@ z!l~MNdR8#m`;CNYjq+?HZAu#chC(T9Cbf5?0F%C_UW+Ke71= zl^RR~VyIvlMApz%zFQ#BrpM1s@UtCr4s}I)mK33eW4`z{JxgUk8*6 z;E$&pFggo3$HGyee1z1DGmuZfIIr9Vu1B%NC`m4XZ`(*q7D$>ygt&^1!Pp&(viecG zaUxk$QUpE9Dk|beG?ewNY1GwVnbc1iZioYP+3(&sA2P}z1z<0h>Uj)6I*at~R~vGA zA&?pYReLL@!7E8Ot@##+v^KhHn4Q&5${S*5@7CwLoJ-LQh@fAvq zc6A`d(3m6}wMblZng2i=uOvHijh$E9me+?m@*84rdCKC>TZ$*rG}P;a*MrJFPs-@I z1^oT`M}SPO_GfegL@iCq{l55&vr?Ud-v37EV$G9;?s?*Y)>W#Yl0f#Q_+KL=egn8E zKY+1|>-pK=ii%=H?Z!$i+eu7)*1w8v-C<&?*{ZDeIm=jI~SloZ1^!_!Lulx4J92~fT{%&miC@Jaeo2oFX zapeLoItGTs)YPtzTx$B2_6A<^NCn}F2tqvT@dCi4Tow(KsQ__(k+ktpx#{=c+vRQo zf{aixhpD#r#OT2)Adi33%BDcRBOv`=T_>T7Y`I(#8uHg}X?AGD*0tQKgs#NIKQ;JxTEy z$g|SnRiwTL1j03tE0{Eq?$uYO&t z`fF-VLy&o>efSBm=W`3DqxhVk$e;@)z!J|@RaNiu@kK9n#$(%PwRR6n zsxHa?XD}9Lg9S-siDtdejAfo~2p+juAEHo!)?PDhh%DgRJ-XLWtNK4f?OCHqorlYI z4My0igKQO**&E7W+VwTXbm~_FfOVPDo-}N116x&dsL#sLsWRTedZyl7uktmwdgJM^ ze}D1dJ+E7ey{7^Yu)arDj4v<8uZAmF9*uRA(9}^OWejfLy3WbcD`u?2V z>P-=F@-{X!eD>zen|Lc``$bot!TCkonV&0X%9f$mio?gYDEV`WVqR_S9gz)>JuZJf z4vtt7@psji3nobbUjJh+toQ8!a??unnyTx+Y);9L@#9CNQE||6$XOkAB>go70mcdx zC?)y%{vPCoFeg`VEPr;=m+72Fv8e@sKmFr!xz{EEs3&#g$D$Fa*rOF%71~WbfmGE8 z|3XIzfDS#|o1Y_~PYo@Dsvgey{_-?&3jx^GKL$q{D)|GjY{GXT5eiQFLG!&*EdVe7 z^HJcvH|hA?kQbk_(tFVF{U!>pQ91aUe3{-uX)NU=e`ZLy*Q5T_+$iTcU@oD@*N`JX zM~T&a?B|>9(+uU53d`f%t)xdb|9qZr%sD*wj}?Wwv<`#4eCLL)cRoZ%_br|s({HM9 z)uMkBCOuVprF!#r1|PYo!hd@KM)A3ab3cEE03nGgGAK_KvC*vC`rk!mtoi#gK0dDC 
zQJxDtdr$A{6>7GtNQ0;o^l!06d=cc)0A^^i`yahHy?xZt@#H#yv%D6G_p#VCX;1zX z6Awl}A4HlbmD$)DLN@He7sISA>&)UkBmgZ62=s`9;cAOF_PSv1kTyvSqV?<${nl4U zM@KzF?4kX+>HmZ}6G!sTSMs>EKw+~}AhA5Qt1^_V{ zNleQPdTwfpI#~Y$)-N>&m&5wsVzJAKT6pNo0{+@H2y7s$pT%>A+I3FaV>1W1R=fd; z3H-daq(yh`I-CfSNEc;1swdWneGvx~c>aMw!y`w1sl(W~Euy#=n*u%T!zmtfuB1F; zp~qQTPZN2B_A}--OEr|03B0|%+uFUyoq7RHD`Z6o8jw9U4&U*=I>mJN_FCH#;dd`JFjN5E z&;%q8z|rocoTFYIFbG0=hPuoL4gPPg_ZwtCQ=Ra-H`xW-FOJW095gyIGB`2M2FCrieGmxm5h{Xn_k|1T#2-3ZRBAu56tgBEG(z0LoinO43-^JM>l+*kG=#LM4! z3W(GR>;F&k(f4T6;_Bk`n)-QTS6#ZfQcAt{7L&KWe*L;RQ2d3!VcT+d zV$FUJACNoWng!&6+4)4NMJ57gnoFM=MkwWFliE?HradGKAh{7Foe6hvoVS4K;fdo> z+=HY1bH4a*|CDR~8x_*W&wK~ym*+#$9 z{p3zy9!bf`M-m;j(N8Ey+88H)L~4(YG%#Dweoi{O`_%z@3-E^2@xN01-N0P+Z*6S8L)& z&OOwpIQb@$S>i8)xr%4&P{YE+&xnIyX9`8cKi2Aj8U?Ebdse-LKink+jw?YJUov6&T%|Z$7Oj$T>zCB)+S$vw#SxTkFH#c=zOH@MlEi#w zdE7J@A4m!8spLAk0ZGAJ7*hXgVOQ+?G(hNH8T&!s z!{Z7H?te^7{KE2#3GkFw0H-GO{987__~c4->-jA{Y(PoX)!e&qyqMBwGf3c6-oz4l zZiB%F;`}mZ!|fO)^Ke|o)a&wR5K8ya6Y6!DGDk|DJ9nN&QNH&*7VcWXU|s=QQg4-$ z!mYczdl`dKk&gNB`BO-kjSoh}Hy&aBcOuQ)r4*}^>ij~2k^^Ef=m>_vT7%1jhLA4v z$*rmo{G& zG~T}{4ZAfY65WD$gknx<^)IaaGxDf9+%$sB?NW%yee^vw`dI|ik+zB?z`KXPm4}Fk?ryq0@uI_15>hn7PggY8-ki%|K8j~2`y!ah&CEcw9Cp{_oYUkww zr?>gE{A7O+LSt|#Oyu4UC~-U&pybAR)NaiBO!MZ4Pw7aYps*7E=e!5^!q>O|@3081Ci)m8Kx?rU5yTUZzWNQ-A2@QjrYJ<;EMU4!9(_~)Cg z<`kz(2#3T1KpW})pApKdrUZ~rG()Oy^yAa6hd-t_&j)b?jlwGLw(PmB^+*zoh>GnE zrjH#zqDuhh;Z}02&Mk1?_(fUdWkRCxJHP*Q+FZ(k=20Ft$&3)RiX=|>wz0jtoqdDj zn-7rkhf6AIek~yi2^(!;<-z%Yp0#Z;qDD(%U{HyUTd$tj2y+D9JZ|`vX@`2y()XD@ zD&im^AT5N#A$;1Vwe4l~2@wkgCQT56j{%az7xi8yn&!Jeu%EqV{rhf#v3pb62M>er zj7ySd=-CeKAiUY8ZIjg%bZh@fAIyW*nZX%2-o;^F;s(CgzI6m|clbc&)kD6B;0ShjjjJU?9;9sr^#w z>EMP>n~+#h!)yst3-+4_nxDow*j{#`Md6R;F3$~<8-Hb{5g1%E=$0a2vrjlB&I<)E z{hbGBPX72V;2*bE!>1oT?9t$0HwV%WXV_scuxT6@8A3afFV#E@A#{CMG5bKqCx^Y7 z(g}*Z%MfN%&FQ7+0svgj-3LQ5FDny)LhHZrOK__>o0cahBwdV@YVx;TU40+ljVr5p zb3Jb5;hfA!Fm)OLF-MOE@O$szGk&QD%1mdm&k6K@x(n9y^mU|7UdYA#c%`;V?|cBo z$K9Kw?-zvTIUVdUC#93K1q_~Jui}wq_zvkYYzeu5kenE6!akRN^7<>NbfeTPD9yXU zd>W8d2Rykj>s63lpS@`5oe3gNd@kf9GGLyGI$H3(WT+A^YjTHwi9IGtvbHL;3-o6P zPQ}Qv&{MbI8NJY~<a;ctJb5ci$c(6)c zp`r=$Tc=(-Tk}}Q#hE%ohCgdiroy-Lfxg5Bq~N?C?v!G_6;*RAi4lA9s7JRirf`+6 zv|#aW#-kwVjt&pM~UKToycDf`)c)L!s8p-FAMLV!q%6sW66S!RPts}4Gz44qtG&@ zToO1~HJT?^yy0c1CHe>Zo326iK~$Q|NY7f4wa1O)pjZUOUU#WKmwJPY4^SwG7LzrH z9eE8d;o|HiOvE2>L}Ga;F(rk3KkRIy0KQAZUG^~qav*Q1(_z)=l(Zf-e1aJPIyav2SsZg_-s}+&^6>hmpOB9E!@6!a1y0TFifkq8kbw} zEiQj-D-sZqG;%_TA)woaH!*XdjO<`{N_@0nP9%qjewER6ktXV=3>~KcUnqe3Z|q%n z>h%RFJyMir%hm|EArApq;BT|L9Bv_Qy`7fU2f(L_G#xc@c51ic!&N`#C+Z=cSNSV9 zYf}vtanz-cHf8y+iv1{?WLdv z*V{(!LAfFQkAJ?ig?WUeZ4sgI#8M~aMC#=B7u@?l!AU1v)+7Y#8|1G+OrYJ8;;m3c zR|e(LU$$-;Lq$ao61!`5)fc%Ulazo^fVuij;}zM0AB}DVm_sfVzDt^$-B`tkXTm2@ zHMz()NCVhWd&HA*Wd;pKML-CunGIPoL+};)R&o*_Tts;eWZF2w_Pf-_wyuvsLM){y zSI9MFy(w;=V3Byhsl+hl`};^aGUF}8^zg+yxT6Tr%*T;HQY8Jql8;t>O*p=&`!!zq z@T|HHJ{=BkWgKtL2wXz+vDp78SkO07s`x^aupPNBqn(=htUXv)4aM_X0sZ5eoU`53 z3F4B#OvS8VJLPx|{8eS)qceDa?_``z+ev?B0aAsg`f5%h;#5+Vu}8UX8!72!=jmR- z@!#<>dMXR7c;-mS>f7IW6ot{Me4$N^0OgXF#?^i}b2uK(OGxW`&~;6s)*GcEWe>UG zgHJz(BQFF#)?vIgf0L&Zantw$Yo=exdoe!IR@|Dq8Z2tdnyQ)vrEYk!+tmxk|mwvkHzZiXc zdolG7J3qKmhH23RLc@FQQ^rWilyyY!ZlG{T*71hpv6@W`go_>NZy2M*32S$ykb3%3 z%A*65>SXu5(2Hwo2q-YN`LF^9G5F?oI{n>DG=J(L_J97ohb&vo9t^ehgD@v z^7KC#=}$lZ8nb+ZKT{}5rN`qX{Ba^$+498$Ope&9NpG0~H-p|~a9GE;>k|nRx(#nz zYnX84o{NJo64a4N3M#?>io`|RLK`ZqO~In>u@_(b^eU97 z6S5-LncY6-k=^GuZmn@CDcgz-RFS?zybO?^Pi=v7x3-=` z^ENEyDCt-8Lbt(8Lgd3?t>5w?Y)SuYd$vl1s&R~65-D$o9VMqZ2$ws#4Q~4b4+z+6 zdLGTS-_{rQBr9>0ZVQt&JTu@VJd_|(#C0h(cf>F^XK%jTKbFC->p9)M3u;t3}NZA)I2kny1Tbh$N|4!2& 
z9^%J85y%-BQ)svtk)gr+)-D0Hd=>9XbxVQ&c z&6|-i@v?JOhv`@@2IL-oshx%yvmsP!Xy|@Kf8k{tkt}M{+zcC}0raS%YNX70cekmw z8cF@wZYEH-aEQxBl5)q-JC$tBLqm$UBicTiF5g{K@xC`v{myUut^SP-)?isX4i{DG z_0ZrC;T@D(L;#IBfia21qyiX~bks$wGdV^so^#kF+8~)TT(73?FuOBN}|ex6?SD#pEC{CbU{dFtZ#t}>i4X~TW;&sR-@ile|_Ni9B6b2OdA^j$AdSn zPr#hgCxHhc465V){5R1E0P6v!N8JDnH1Er9u`789b7_CygM<54b%xgCFizNR|-%y=ggj zHB>@u`;E=*#k`l_Nr;ETB6F5&An*>gfMV)N<1^}BNVsA*Wi7- zVWKKRRWI|UAYITXJ<&pufR{ky?w@;40Q1bAP%|n>^ReU6p~Caj_1VglEtGFW#Cw1; zrGC){__E*lc$a^0tBMCK@CVvT7+xnC#E8zEW4hcvIk7%j<9IG8P-GDuaf2y6xeKTm z)*HG|p)yI!i^KGAAkrp0MvgBmSrKFkHS3XpWBm&7ls=A&(>ieMskaFAbQ}r6KA>K2 zyfO;{n}6`Uzg%T-_q$r$*vMYW=qJRB)Yg~__UAPul?^u}Y`=#-8~`e+33B53f*GA- z+uwe4Q;XxGcc-!!LbIC}jXtxa1Gj;7j0~mKmwm~^BQqc7+FcDnN944KKo}5RcW>bn zqvwmaqM#~K3rDc>Z$Dwczz!=dnndZ*^n?%f`zoF~z45vp@izXvmjj^4H_?t~JU z9r532@baYt@^i5L-!=8WOeW`~E}XBu#9y1%^)UfG3I4@62J>}}7Gas)kBncQYLGe4 z@m&>I5n$_U81yb!HMLu#oDR-zLm(`?C=Cui(O?~VqI!lfji4VXt$e^k6!=5rnoz>L zH($a*yQ`>HmPbA%Rci+Aq|{N*terP-1p29YJnh(G#xLG^_6(!H4|nWu%T3miT%MVk zw=NBlK8F(4S4&tOrRDRmqPMZuzaj%P0-yA;G64yXX$`PeGa zu4!oGxCMaymB}swZedmO0=&MyY5;m)PgDMG;OR*HV~+lfCjQ`vdI0ga9!q`jad8PK zH;hUP#L2a1+`uggEI_4cSCoMx3qw=x--jCZQ`3q8yfw%=UihgQgdw%-uIP6X!Jy-; zbgjoov_TBV#Ob)ws;$=vcJMjF={(dLsTM^kSm%6CpwYyAKP=P?TP#;{=HP}$n*iQO z61pNOnx?3nyT?9hL}xqOnjXhK+(~ExPR>m3QhxPhziQkA=Bm-x}Di+?h5_*#6B{+RB&$B;Z6mM55vV1F4cd8td*=6gTqYqu8k{?-nZ-6Gd#=XxzC;g%3fUl=0>0M($ID`Wuwfsl1a)SB;5py})n?Cr*h8ho}YO6%$|WaXGWhNGn0} zGEn!9R)c>thS?GV(op3X2}IFjYltuv7tf8UR@;IOGvWpM9*da88HnF{5bGyr2 z)p%N|@b?7It$?C?Y2YjCceIR9LI?2JX1Yjp*Hf%6l#G`jlyb@Zma=sTM>`3`fP%Dd zi{Oz?w)yNfhl6+=g78mCP)Y#7=+?n-bc9FyHy!fah0G?``la{*du2GK@bd%^xZMXn0(I zxOn;A;FJ3ldd_QY>~ERui1mZ|+SFZNxiU&yKj1RpB2#0z6{f{WlR@YYCfv>aLt2Ba zd3dqASLNkwOaU9K9C}yc188ZAz?*m9)Gh zc+|6*aa<8qtr~t0g5PJrRdcahRy7fLaHD3O{T4f^*s-A##YMU=fJy8bk7~GecMSS} zY4mypbYitf8%0bw9AZo3n{a{VUA4JYH><9)c53D+LgEjt~Ajj`zu@cEi9 z?EQ?#j>iUaiOPM#b|5;>JDTDrIbh+9RQ$q0WM`7*{nusd)J<(iUwfrB;yqBPO$$N` z*^49kRLH>uz=|qHomtZ=aVa!*i;TGEHkh=L%;^$eOrGi5*N|yh;_m@QbbV4fk_V0H99J@hXrUuIJ~>44AOH3vJU1(Ir+oj#J+Kf&i`3zVcS=gHFc^R5#H z44)?cmX$5-j5;6yJ#aeyF>o&)e$L4gxif!pKzCN2sOa>X*p?j=K$Ectk zbV`Hg;6}Yyxubu2xal<^vA%j4?+W=Amlh@cN7iS&R%2z{+h5Iib;N@FFzq|f&&FR~ z)o%~nZ?jea9z$B5E#JQ_FO$h8yLT&g>+l^HWw6>6+onJ5)z6h&Wx{Bd@Hp%oWh3hm zq`=>r)9`i3t$X#DE-*iv-YJbYo9`nqeZAc}v+Mvxz56OeW3 zJLbH_p|@8j9?^e2k^j&ePF|dyM7uagMJ(G)NM7Hziy&Ks9`+xjay9Y{7cUL3Y?hfl|2mQ8$`JH}_ z)Mk}I0W7v^PS9WC!I{~_5v1tKKcnC7sCWT2zGK~w7PW{k6F7bdVPVP~z4WHY1n|%q z)n*}cwVHLr0LHuRPlDK2^jc3T^)#7S0-6z*-gR?)|1Q;WHaogi>Ait8GCn`~%}Mm} zw6#6Z`kAHJ1=g23W>{K4x@p&s)#q5r?=?#A-n3E2Icor0Rg*{EYWUMWpk0d{A!2bG zP7>~V{)ef?(nGGBICzlU#-FNZC!EJID3SpnN=UvBa(fdz-<=A0ME| z(_^hhHD4|$?xf#QCF69(D0>RqofMUB8bPI!KT^#XMspu^H!^53b$@Z%Tm{HNs)F4%WrKaNKrK6EMQxiWySRw zM~y)LW`BdZ9&VNE>co;@od1Di348qp&u*{C0-ISF*RgZ;_RXr|`cJjoPN8jQ-+Thf zRvG+hFp(X!7u&IOd#&YxsxNaAGkBv^OO{`M-5ph?^&a5OoDt6t2Wpsz_d$DW8+B6r z$YIy{mYtT??2uo+^GTG4Ph}v&r7L+nF1k%hYoDenbT(okws0rp!RqMn0WPY@r!H!{0c?ZDxAvC)tdJ z@BAQZIHtIt^ZhR52yK@xl%6%3B-;J0n%7LMxSKoU*Sf4wdt36i4zw==l{OvI-G*aE z^S|epdyu!5VCDk2*|@O?g_E}`i~@5kgE`GSJ;zuHGEz}Zq%|b&@pWJoj)|(JHy!Qi zD`7S91f(pVk|UI4{aGe)!(F%TltqL*Yc6$UEs{*`?4{-S@U$dx0y<0Dt=7KUH_ILI zxesWI+j`v0)_Ug{*v7VvFK%Nuwrw?5W7{?w+ji3=jcwbuZQYZ5*Ug_Kf8O=3XXcsNv-h5J zR2D6XQnb_h@xR*`<;chvIXbqZGHB?t9Dym&mYd4}sdC_3~uIExX&%hT>De2@|( zB8?&B47Z~`?o_r+AhbZj)A@<~Cy4=(XNw=3Cp?&UKX{wgdPXh%fi<{QZL`S#?+U|< z7s6xQZ?(hxx=yop3f26Sb54l~MvD?jb@BN4b%6aM{Qj`vS3F1?{wIB;Jl(;)e~qK( z;_9cz?`aS{M6f`t10vAonMxxIq+gi7+FCwUO8)Xw^;$u~H(X zIo=BJZ?%!Y90!mb`Q`ZkN+BkJ>5&7@;^fmE=89$aeOxZF6TN_p=h@VjD~au;^KDI^ 
z`D%BVxY>LCC;7LcHWjUIv;uAedIJ%65vJC|Gpywt=jVo3_eZ1crC{2eznK~}8XbHe zY`bbvyIEwH3c1ZO1?iCRSqVslp<<$L`nD8iCQ1|0$PZ^o;Mo}JJu@zU-?nHqYNv3x zR1p5C)8MSHC(IAtl+NNWXE$W^mte7c_ga|m$M%y!GTIEfLEyZRJwBsKoJV@dJ-E zoAqyZ4nBOhEXu+qH+He1>(&F6#qidq2p`5tptBjU!**qnmQf7aR6 z8yHZhiu?TEGdxxa{eb9x&uj07nambW-G0_pKW4k#Pt&`J=G{K1Sp*`m(BbBuLZ>?l0i*?*Wf355# zpJSY6m6-M(BJE&0hcbc$eIWq>+Zko+0s$jdp&4;?9d#Pj9kWYu>=Jq>=L;$Il4>W% z~F|EDS2M$v<%1kCBdRXYNu=U8Z_G-<{&mK;fKTvJBGPcg=1y6;npQ zO7lB10e71>2%GRi$eB5?Wp37dGE*=k9u%rKiAu6a!rCfsWywz^LZ8*IJ4~`dw6yGb zl$TtJ0*;?QvAespr>8aoMjR8>zh`lgdD%H~CY9580<+dFw?gDUSSSa7PrvUS{b=Fj zcu#mZn@fJPM0T|mWPe}3Jyam0>SK?^?)nzSn&ngw6R_g+Q1*mx3jPO~hRO4mEO**=QZbn#IcqqDGKKJ@psLCzZl)U@VLTS`BN#xQ!_@8gnjjC)KZ{P$U%M|y zK6uz(@MgZv0QF;E1QfnwXBX z_R1Y4OLamN3FUm`7|ciB&cZ$e=h$i=h_9A5TnvWnfB3L{Vo10Q9{$Js{jqM!CDj@dLEyi583;b1zq;?bC_x+) zXv3!h0VFV%K;>Z1rHa$@l}aXoQLb?vZ`1)rhwxWDRKM2k@tnguDW$`kn9KEaf1JP< zDIv7&$#(5uM8l2pf(+r=yaBg?Xb90#%hK-8Ply~^+(SPg7})dxJaYa<@sj-^vuu)* za|Gdo#d@^t)6=HyzS6p@LaQ?iER!kG1?zpJK+A8ew5FB+IWe7IKwTsi@{DZcHRTra zgxGvAqAX}a7`U13v&oL?;U^^(#WY1_cjQD!1GN)?Sw#Nwz2}>}-FJhtj z4demN^95*3Ky?Q`ppG+K^GM-uDCZp7S7#+ggYc(nqC3-VW81WcH@NmH&3jl?` z`g6~G+AiDBU_O~uK!4BLKoz5$;fC>332@9Q~(cRnVDk_@<4L1 zU0?5<#p%RMYTIrzy8osCaYkDa-kDWmWkbT1fnqnc+B@CVOVCUT&&eWY=`pp_PFb~6 zV!L>~d)g5}GD~#0+E*;a_%pEn)MZw?kD2;JbeA!EbPbj`pJZRS1rvIMRL5`2519@| zE^&0ck%0O^hI7G@sZgRY`ZzDtI)E0MKJapP-}d*BLO04g9_O%3=E;rWcEDwKIDXNl z-BH!Y8)d>OXz;;L{PdCyTZWSL^-=lyv_|D}4H62xJe#+Um>cSKx}XjY395@cRl~>Q z2Ql$GPMOF^TgUCc`qL~ryhVhfqB(L(rHs_DgD%j&hZEo=kJke>S14y8$h|IG0)^%H z>f9PfI2sTdEK~URU#GT7@ClWq)Bp7mRioAt6iN~0Cpa&iSy7kU{o;>n`uCKtFa)2F zzmxq7ka^lr&v>*BQ_R6u+H}y==?dLep#mmU3t0U?Bv5kp>FI}b#%7hrG}L11D$gPw zjjOAd?#?4+t!UZb4eL&3`v0Ug1R6?u^)2raE^|LIUJEl2pMS&yAms(y+eBX1UP=Y)*)Lk|9$wQIRWs%&v;K5$>2U#8`( z^&^86Ke}Acfdt3ivRoYE^h`Lqy-~Pa4K=;gWrapEoH37ETGHomqDC?r)aQ1>VQxK@ zxzRR%1P&Wr@|)C719(lmTG~KRE^OP>WpHQWW?KVklI=rr{c~EL)9v5RYf%>Dp5NSE zKVXP-77x^_0)|D_uXj~0SAsnjZT^^bNVd7uajf~G&YA-Ee-gp3VKVi;jVYBt$m2`&i!n@7*N-9L2a z{;)N^{Dk!^geEgg)w7g*u`4>?% z0X-v?+Hl4AnYIx43Jt+_9#F#3HOXQy#lm>NRv&$9k?O?t&F3R{#=-I?$6wry~$K%P4934U9esCGC zH!(E6izu36Vugq-{7r&@uj;Kze>#fd>fm{x;4gghtLjN^R&;Cz#>RRa%{R9#uuomb z-c<6Lkq?7@_Jfk~UMPJq5UdBFTiUDefLHZ26SWtQqd$K{ zx*II9)1M%wPUC?V&S4DW(i>{T;WnlE=t6-_!D2Q_a7W3j6pq0iOt`<|FaQ;^znp4z zUH@n~ZLcSilMzYDr;A~H4SQ5CQejip{$BLEe@R0Hbz5Q&Yq0U5t19)WFtPBe#{2WX zD(2|bmbC##rpfI}X8B=yfvSn2Tvr#A^UFS0+J&j16udZrE~?ctb8 zLaT|jvF)~A90+hhBtaR3?4iE6a0o7l#vyMyjel58h6EN!@>ZS!d-=u->m5kyr{BLj zB9){t?`8dJs%Q{^Zi@gj~@p720Ss! z#rlMV)@er;gBlP^Y!dWRfM;geJX#9`>>S+PY>E5M?tHrRw&L^-!QGXr{XO&PI-YWA zw4@r7qZ|ss&(oL1vcR-kKlk8Bajw;+0Jsf5J-9$drWC$FefUiq?|oc6vXm?mSYcMRl2Gc>1W?klk$=Y2oD~_zS*y1uy1fN>Gp#}xW=FP@A1QJ^#|XYOY-<6 z0>xMbEILO3NCgr4WB$Y&ZV%<(p(lDJMfL87%5$rsB>^u%9x$-@>UbHrw7#dUYb*{? 
z_^O^NjL`Ja>`dOVb-ol}j=a{7g|6`!`|1Z8a*&?7IR1g<;T80dG*{>5SLfi!l_3MH z&K*cE0bp>OKA|VF843^(Ti0*5ORD0iJg>zUbJ==S{VN!#@0<6X~aN?Fi> z@!&mqe=XChPd*{!MmafAczW4kG+~ZWE-}DFTPxnJzfz|KJHu;pexggx(uMMafORZ5 zxY4*AOm~W|aVfce7?_o1K;*jSu1rPDBiqW}&oys39q(u6Q+tbhD)7i)4Log8#NT~5;gJXsCa=1fZD6Z*O-5;6t&NkIL`b&uqp9=47p!j~0N4&T#mmibJPkq(k1 z$HVXkeS?!6b3b)(J~vIHRi{pAUa!R?I-k&FDA$NQF;c{d1T9SFD^J;8Is&>D#;V#J z_Xr~_BfrXJ_QkDJy%wl z{j%wV<&qL4x7*?ZHMVZ@2PYBhwFdWU#erH*y$9ht=xykqNP`ivs26Ax@O2C*r}Gd0 z4Jzo!Y_HlXH~0wmdZHDfg_bGXW%58~EOjKe8_~&>$a#4g3-uJZkgu-Xw%LMUiqSC) zwy*6x^kY9ywkXqb+4<8=B2mn(SO20?KM1Rzme7Atv$M)#-ia(!Ed zDbs$t&utmF>^}X??nN08fbl`f;zCLzC4Sj`BmN@4_y|5o3&6$o9+y(oi4L2B2-x3@ zBz9#6KgL3^w}uuO&*8&5nj}jWvVI6}j4~APCPl0tDY|vsF-vtW*weIyX_Xw@?oDdC zc;HT`O|+rBf}sac{mq!+_f|wzJ>^hn_GmCKrA#pbP*2Qku>pr6r;_ySA$#wCUG%~z z5MSJbC+sEaHeUB`J26$&f6*P;-WjwV^bYocZ}PEsM&=xk#aUj)q*kfW>$ZX1o#4y& z(%`aMbV(Izig(?iJ5tzwA_22exNhUV4XRmSkb?TR!Q)3s{0q9yLbg$pR>eZv z{sl>7M&N_YOm6>q^Su5oe@zXZkOXp^BMtdDimSLXJs@EuJcsJttG6@I1dotxb5k_6 zgqNSgIUl#HG?coTbRyd-+7B(+-hW~IZ-iH zu`GSsH(S5&U}nYeR>KMy4id>D*$TZ}wW?9h%8SX7d+%$a>gd;Jw$tnL9y?}( z_B0vAi_(eZZ>3GQuvENUliY8)FMYg#Y$pOH!Wd=BUB+L(m!(+goG$vPy7Xu`J5a@B ztO<#M8phgrHb9k@7J0y5afp=PSk;#QwydHanpcbBEl4_W;}C_3 ziSzwC1FyOo0Ih*~%J{3{e$pAnSqmcTVuttQ?os5w6tcQM<-XMM-})D+F;#~C8p9S9 z;=`9I;l=&noawLOXb*r!*+TbU8^>&}_^u=_E`6h)H|=!5=~Vwb^=Hnc&3TWmgXoK&qV2#w0O?Y|oDqqO?H_BXT@5&QL_f+-Oq57=(KitAb*ec2Ze3aLS72L@W zw1OQ@>vD?2Ulg=p2-8KU&p#&}|7|fkdQ5B&A5o2SjlM{GVKG#5b}v2-GFFGG&ob1$ z`qthehPlH~`a;}?hvcU+3KRyoZ_3EM{cJB~5 zyr2zsG;c+wkN_|sTI#&=`AVdYP9g_KVRH*uWesrQNpRsKaKGN`Matf0HRwCAP!Pl* zveYoFwFm>$zJIjJMV1vqDXU_JoO}zixNm^2Q6O=(7rZ!$82gGCz5@rxUUoLMSDT`n zJL%(t*ukDLss$F8GeJw5SE-m@Z(kBRtP8@ytNkX&ORFI>?Aq(OLCm2_?QvS(Q!8sP zPbITDZ6KD+)fJRo`;4X{3dA>`=cJy`5A3pb?^ccb7OZ{eeXs1dADvn6hyW++d-?8a z8Z;?DYVBJzM;RSbUZo9mbExrdVGu*TAgL?C@s{!u+_8~P)RVn3de18)- zBh?^!MVI9%kMnRb;>bol+aXJ(B_e*}1jwvJLFm7p);@ZNc@lWv)hE8!hTUxX2$z<1 z+uTvF5qV>r>b8&Zxi~uhxg)%OTsqP%k6aiY$RGxoj%-GY%l;%vRusL zb%Zizwt`0JvL7*)-t9G=Nkh*e@hytG=CCA(vrN5#y=!Te9k}lOK+x6pjHtPYH1OKf zqw<0f?25~gUq7CmD=NdoZ5xFPe~4bvag&yID^&hrsowT_y`#!zUm$@-^vw&qR_2MJ6GUjcIw4rOm~ZwMZ+ zmxiwerjYFQds;MdFV-EhdN#BUyov-+zA%p_aXkxwl9`rb@ddR$H%Q^YzolP>B+eoy+R>tj_T<&~YrqzVx zt-Db)5Smb;&*2tF!N~SoF6&9>AYe&t2VTIIm1%$0TV?)a`DKl=h&DbK0imzlQXQa%slmY&p`h~srZc$_ZusR;geems0ZR;<>*uufZ1o-J1m&Q);op2VcNsir>5`7Cs)c>%Da0M z-X`{&*(4VA;~(*RS>)qOzI3O`Qy$DSJrGuH`9}TQv-bbJ00T8R@_)ryb7EiGG#@{B znL^zGEwUMq0XHj|IS+NXxen*^N=Bm0^xFa zcF-R5I8-@8zCxB>Qjk+zqyGB-1RLPT&XzPXUs0yhL}c`fhM9_rfKsl@euY(~-J?Mp zbvgc@3FK0_V};xM)ae(Ww>&hdwMNimjP|8i{e?o;&LND@S$G0bay>vO((@IbFP4=k z+`>_bKJ7)$Z@cBENY>cF_KYJZ`u&T;80cDCZnedS~_zOn@K#E?619>z~J1!n>OB-fD7U zlvR2SQIXt;Fab=|m~T7;6*lW}D>@}Q)F0X!pFBYbp}5joZ9m?|<9;TM55VIur`E;a zVjjjz8uz&mGVz)7kb#TovQ@)y5dvd2a0h|ucFN48V01z(Co7nq8#=H0Vy0V(22chn z0|G_s@Cc%qIbkZ|Pyiw=Ij9iaAY9mYKcOI}_Nq_FC*wxDemoFDaI0A__FCM5ev=<( zG|Vp6^k?Xs5lm#UlFvj)zS|FGY;`YwH{y)ca@X(+*|3PR1KVz3C z$R(n0&HM|y)ep_Cf=G1t!-?#%u=Z=p!Sq6Msm>3m`|6*7>|vMgj!cit*1ECEKJILf{ z({IMp|5?3gh%D`Epb_61QPT)IUZJE1LQfO$gpulD=rlCOkaItwb2vkE_d?~KOt-tY z+y74KKB4NaC^=FxT@>M1WZ%+x{=seWZ|?fB^*HzLRCe4rMMG4R=PbYRgp1AQE|&RxwV0Z$83QKe;hkd9? 
z9YT<#h&Z>q_60T)d~ zW&YLBz=nWLV&bq??_DZV3;Dvh*Ry({OkTUak?I7QNR&HlKh)$6T66F=J_sdMM|?@U z=`D$P!iZ#V;#WY3!ktB+zW%5l9k{%gcs-|CjYIG?mDY_}hU zsRfe-l|OfjsNXkpXL=Ad6&0jr)&+N`wx!(e zXw4;5)v-KLvv31|>J7yEWz*l*tIR=yC zU-R=-_r&Iisd(03CgNK@w{wY{f43iSGOyRWb~yY*s>XYC-v?;J41=ktEpq1;ap z0!l(~T_FSjB3|ygU9R-4ppiu8d3PA1`yMmt4*V};uco?4uw>5d9Rr8>z(!5|^n+SE zLg^Q6r&cylAp9LM50|o=vw)a!YSY6lUR1B+5|TvJ=?`vYeAuM3;{C;@*U_RIm9Ryz zNU2F5&dgK@2{FV(1B#dk#xGJ*YAlb;kD!vCf}lICsGb7+07?|1K!`BY-pOc_C3F(fc^MTKGS{R=gN8oDt6p&u%l(D!c0+p_uR zc_R8B1;ou7Lh8NsXlwwpq;?~h{K|$qR{Tmt(w7J@WV|3!-$V>DHcDp_sPC#%~TC&I&Lyr zbSLYrWh=F)Kn;a)c9yWVt|R1v0EpE(eC9$$tZw*)Oyg;Pcv`Pq2BQ|wjkVknhOWs+ zp=lbNb`U7O^(m}^f_TlO;V(99a&6-J_b$lO@-x$%Ej3WCrWY$1q|%;cI3N;Wy>&aN zhh}DgOO-1vFP;(aU-$R#HYbqO%+xR@5@FO6LMNhH3RCxwr<&%h$(&B%zQj))ma_|g zJ|6DfR^2n4X}Kpcm|h1V^53(bwzrooBN&ZLDBNq>0z6h-&cvd+4y!(OaaR$vmKc#^ zKFHCOe}GgN zIVihue9C$?xVb$?jKZg>x}mJUI||E|KmB} zjU5d{)_twdyaKm+qFQR{w$>9m3d6#6)K;=A-H*NXt)+Fw#V)7cGC1#!y26#YWG6IJ z9i-7_S5sAN_=P^7(Ukkc3kLyekBDd~mMHlBXnLt{VB^LlB1_E=HlPcj5%v=b<-;6e zcK|!lw#E8`H<=SQy+WfCZvSX98Ak-KD?~U%tl5YEs!=T4Wxf_0Z3p4a@2%{jDeFvJ z@im_*yaub0rM?;9awiuV*8Dwupl@=Nf8u>wZK{kcYGzVXQv!lDpTVBTXD5JRv;oA5 zF!KG$UgPm%oyIJJHhfYKV9{I9sY*4Pe*?@e5M7dnra_-E90E$&K6ZMf@VA49@&na5 z=KFf@r3#F<>qkM(m9;%$K0!Z#{%?fTKSgr&0`C^Kxkx_`(h{b-hP*}$s1#5J5eSlo z2=s=Y-E8FtOb@0%30tuMM}oI0<;veX>7uCn3R59j*rWeXgG zi0H}Z33Q>|bMwjYc~HNffcgjc`(cDNKStGTWGx#LEak)gh0(7QC8RyVE}&%q!@=6} zy_`p@Yvr&Oa818T%tj05{f4(8TWA5{33G~ zp*lj6D$g@ZfwD3F@t#Ph8)Z7Ft3X|fbQwH9Y$PEo7RCId$1?|&*TDSNR!JR}Uk(!= zcjBc;c>>|52ns|DCb5y`33H>VO+Pgb?g>Yt=5{^K7E7)$BqfSjM7ir(uE(q?pmmcS z-4C_5+2L?t0Z5}R=7h&^?DvySkAjjApiQY%>iA!HX?fc=$8*YL2LWoItnP6O>NWNV zAT6ROe#pJuNf0bZv78kQNrzzi2F|b^eY)a2!%=fj63;}w4m|9g8OTo%0oIg3Y8=I3 zl|Hut#x66E_x9J>`R4CRlBeF{kb+$b3MAFZMT?g#($eB7+4K+Wtzkm`%p_YQN&|5y zz(_W9cBW*dOb$cXo@3+Uc77=lMJ7s9*gv6 zVzE7k;~WfWQEX5@5v{V2ui^Vm#}oLD=6=X)iQ-8sr~Co9MF12E5eWY)I3<*U%?gMt zH&P`|wXmE%+mydwsQPN!W#w=fa8`zfwfI~Zt$D&gUv>G%Q~kq<3sg-_eSn2KhqR13 zoT?yuZ(29DarslCKudPNoS$0n55!MXN@iD%OtZDX+o;xOuA<-0qa3sKy`4)BRY@o67^o$jJzd3xcWXlT2}lT)*1% zHJYH2GuufPcR#O@=VOH7*ep6w?(6}`kt_U&+d7Xz-f z1(@-yt_$7Xuf4Lhd=g22u4_IC=p{O^gh5r|HFm(k0ycFJnGci}7uvv>f&7KCaE7o$ zz(9($PyAmyGa(3e61@F}{g;&}$_xSY0o3}@wWI~_r=oxRHnfs62; zefuA;8K3Ubn~-R2LFBu&8C z;Vx8F+aC|lT~u~tg&p7EEp@#V^t>Pdfg>ii3PtR<8B(y?eIac)cWc%$I#L0n1<3E_ z)k7sE?H>HVO?|JaCwAOecCx^mqR~>F1~{N?IA3mXZn_d9?GF4`mNPFmMtitni=4-j z8b{Ji4Z~Tw$HUYHkBkgw(P8qp1qMqtAi^pv9kTrr^wY=m?RPeW$PRNgVtMHJv=x&W}vzSseFe{%nT-Kilii* z5Iksh;Vx28ULqL)VwGRm2mzv=^c~>FQFd{KiDUVk94|sDqq9~a+I6isvwSpZj9RE? 
zWGtM4N{F8U)Wwub4tN;zr`P%EX4Ty((H;05cjH?!9_(8JWO2T=Z@XwwVp1J%5x*Ha za>u5#p4-jPP_V~8iysHZFDSny-Z|G^3NJl^N*FWhS1%E(&Ie&OAPqg=+*a~*{&xO# zO1+qDR8woFvPDe^A)lM0-l=9?XH0+R?hdfk!tG|rz$DqvY}rh3y3t-2eHtpO<6Bwa z;hMe?ktOEm-S%Px6y?yQu?Tr_A1V3A>?$s(ZeyW(TVj*Ql-%X-$+%o{nSQt4j|S_v zCq|M$z!W@a-i3AQ6i3`#XMee5(+tdZN0^DZ8hJz!J7}_)mde*)XN{nUv@VqC3C!_w zJi@z>e0k-Tk4>yPYH7ci2K!R7RmYzkIsQ5wrdS+G6VOq6)+4t(EGe|j-Q$N$dv z(w~2IYq?mz*rc>WmP|yP>-Fpa=(C{ufH^_Vbv<}JuJJtK&z=HhO|={;`&6(k@kwMa zB5gi?v>^N|9~%09-7xmC-Er>v-bPIWF#t|BRxyp!NE`ypUj%eKQrVQqAb^q%v@oDT zP66PISI1Mo#EO{04;x?ZbcSWBSnqhkt*(p%gc%pk);1CJgZd4;&y4>w#MiZum!L_B z;DsfC#O`z8by)qd{v+nTbdRpnS4vMqVx!Gh37t5$caFwwB4_dsIz?L@AEq{t0aRjk z>csqoGr*%iRFgruH~bPeyH@}Q9W)a%V70SBs^mESk?ETnjr(PTCXCh%j2sD`1}(t7 z;Lr5gcwcdXmvdw~cABH4%m)D4_;xRV4g7|K`bOYR`KFf!?gtxv3Cf9hxhm7dOHlNQ z42S|2Ki#JRwK^x}ZSG91e6hDwO}W_tb6+|oe{@Xb;v#02m&LB*)n@PFLbhXL`%Y9B%`uT8)aX{RIjjn9twF4(CNcuwa=?>L1SabvSTPS601=-jAwl(wFU99f} z{uIg;Ci%L9J2*YVGYN?Fk%Yk^d`2F;XlV!~o~HUklO>dVpnxw8#DSs4LvR5*V@V@X zQ1DzTZXGT>wk|&)BB_G<^V#f9Q*%g1$1a_Z@UXR{;L4i6{cM3~bhlYA0oJZ=`}Fm7 zroT5TAorVml;H4aU)s@!e3;=_%kDq7v_LOspFk36S{W;9%|sH?KBgdvCJB$`$Ny&N zepgTYT+LelD_J1unZ{kf#{&OT4(X+`G9=XzHLac{0>n4*1C;vhT|pP)*Z(?zSLgW_ zv`LYEM*duGb+Z^$9wTPVIbPRlbdDy0QK{M#==S{x%$?n4^W~Fk(z7V`YOGC9QO_n} zs?S^^G1JY!lXT6=7?Z#GE#jDa_cPZaz$Z4MZm@A*Mr;M%IfUyAum}%80lx|s6e6S7 zLIgVkaexSub_jDNChY9vR8*9O%jX_kk+Oy&6#z^)NVrt6voZO?^t8rf_yE_p@?O^F zOAN3vt|hIb2Q7{c+;>EKKKm*Md20_SAhZ!CA#7|K;-cXuAx491PH7<|FjhZQhd$YJ zw!PxhH$LRrIQgRd^P&+*mc>PckqVHE;lkW;m|Y^=NVtv zEZt%Dr&?Rlj?n*HxpO@q3zyF!;tg`1wJET8_kgIvI!(pNG@I!2Im4;Pw@8f){8hx$ zd`7Jm82(uNXzEp%;%*z~OW@KDX~yn1LuB|#8NuYqvRM+S0C*pO^Z(z$+Ana>&fheafY)_!i+f3udD{B#%TKQ_}(#3m7^0vIaz^x{KO~P zl^_z8m4IO(|LR(B`#id*>0@~iIye^DL4H!r+45}6{-~vBijvp0x)~i1@#p`l(tUeE zG%>+wd*GVS;4oTm<=C+83|6(f(u-k9gYc0b+pdZ(Q<*zSn;8u?8+0ODI;T`uaYXA& zdYU~IutAJ>4Qr0m+tmK=5}0%dhWUG{&wiebY=|maqXH%ZqH12iXVbmtcm;O zrUF=UStVwG<6t|pSA~L(44~zLI~|e$?0LUZDvW_@9o&vJkT5OJ{I!PB8mrHox7?~u z%d*yl94_SFi#UQ{3l%usH&W2-_3xbaImsE}{9popwGx8FfcSAMV%|RXq@Pd>{^|r%&hrC#E;unv53c#gWE%$TP zt8-wS1O}`n3jW?u#tR1y9@IYhC7-1J7&}t6$)<{nWgQS|B$67gJRdIrMLOVi%u^va zwCO$$PgGZwPM%hBPeYEvwg2h*WbcAgE8{s*Z90pg_1!g=du3NVsRPId}q9y>Y$ z`cdBGXlwp8D~J*e5Kd{ZD&15XuQq&$^tp)%t4>F=7~w^9eQ7zm?W`x)C#Ss%kA|2+o4PrI65Shn$>dn^e|<~Q#Ci?emWiF}96vB-JJna`UKq-J|B9J8`D zoih#1968083}UUM)cdVnB(LimErMz+QK^R$c228{^(}8zmXO~v#3~H$$82D*5q5e* z;n|Kw;I}`}Eq_@@lIcZr(kPN~C)&hmS)44NP0qR6<8zR7w{Ek3sVtnNt*nL~4!V#{ zHKehKXf-T}+#kuudE*p1z+_NE6-}WU2FyuH0@kRBz+z$WFd~`RV_m!J`8z z514+jm+oRM znL_&~GN0VsHQ?X*aQckEb4-+m3LA;!5~lE?yjtFQ^e*Ij*C!%Us6tp8nqr?5-ey>* z>6m`O&Z+u&a^7XrB#Uu7SL*uE1xqrABRUT*&i zuI@xGw!_zY!GlXeHBOeo3d>mr^DJR&|Fkeu)?~^6AjgaCr2-4}swI;E?(-H3m=xT)Fs(hlL7QW*CNoU!fqF8(R;@aiM-yZ1>q3vK$8NlEe#L zDb)Q(Z73CLOSy?L$Hf*N9FBM(c=&w+JU8*%@%BO>bu4xMdcpDebZhVOnjSDDez5?q zCm7CD@`|oE)-cfI z8doulP#cM@Xlo04`cY`EeQS+r9o$yLL6I3Y7}j?gJh^6q3gCzOdPIOc0JcUzK`fiu zrirG%FwtT?1F$VZ=#qT&AE4q`?Cj!?VBM!_rub0D>$B*CuzSCREzAMxU0^_*7GS^d z=62jMBDA7&(wu@!lIaidhv&+K0q4wC?jHRvZ^v8{@zr|(E8qty-Bg#ZIyjZO(9VBD z;n{dVeI9(CR$q4wcBc(w!XOGzg(W51sYiPiG=jc-6^f)G`GyF=!8j2S)lMN*;s-kp zlDY0-`7#FwA#3A^%^BPdt{jzMU|iM&Ey*CB=UbMQTul*}%IYX5#=3{932M^I`NTBO zH_~xZ_P#fgIv1Un#W#iJT)dqFX1Bc`@Ox@=hBd!gno~Gt^SzGokJfksnNceIBjf}} zq?n+RiXsiiYI2tN3Py3RP9V?CCk>^Q0l-w3Z$yqs5wDWj2Y6p_WOjhZ0aYQRR-=mi z6G7R=6Lej?pj-LdN`WdvdAGcLpD%-9yyAFRbCgBhP>&9&sje6o{wz-dfYJICNmn?~ zg*m8F-(rK{ts_N?gx_Vn9)I(QyG1ii$9p_KDmb`R>J>Cp2M1Rtucf`%)f{N*LxUL2 zBp4q!c8yJ^!e3cqJ|KQ^_dN=QL{da-CkSj@>p0mY|6V~;jKd`Vt&lp!C~ZtppP%Ac z10}dH+K#<&lYvcTExG+ltnhH&v_niKaI!@*f${TUJ>}KS2vdb&NGPBV4oi>2WZ*~* 
zMIU6Nal@+FQW`(D)IC)O?0bTPyX`G5T=_{DtkXeV9%)p}Rojo0KbDHgq{*Tsl_=9Gu5MbpHU<*fH9O21CtGg+1ntGdLbeLuP?o>5<#Z;`^34=fm2De zu=;2y{mR7V=DUw>n0L3;BQSY=TdXX4nw4kV+c)AjMaceqWQGI2ZC;uT81$^PrafzD zG!;bz2;H!$)N*_7?GmD5m17fpa6Vx1px`QKBYD5jJu6<>+`8-qAXEY|J`DBi` z=SAid=XXAQzz{@OsE~ViFN%vp4fm%@;M!TA1Wta^RF-}1H`rL=tYV6TQYE@97RS|3 zR=T*A<#76My4fmKnt}SPq>9uqPus69i%;zB8so3plYc(k_~;v@4q}dayx+lVe{UM+ zmJWsN6&a=}m^7LnuXq-luWh3mWOus?d^`CjL5rq7hz5tosq6y9;R?$;*=vnajW z(HvWm>0g{pqF%;?5D9{70+)+Te7JZZK6U!@e6(w;EFKBjfcyk(E9|Kp#wlkByEwb_ z?(Xo_M*{0ZUkUN|x^Hwr+H1*e``2lBXf!l%=zvXVHA{In8YN0BYJVR`N{SI91uzT@ zJd?zZLVl#D_E`J30%Dl7MvK_ymph~&GRAPQx*n%%0#!> zgDzcu866MzQ|5?geA;7;%E4!x$|9oCL}d^jPn>AjXF@t*-@U=J>>I63gCkmCCTy{( z^6HTyo{__#gnXE`*0psAZ!Zw(^P1+qr2)=EnP*%jX=O=Zs|=8EfrKw*SzsgBH9cQY zz&kfPTx8tdHM?B281EPl2J%~4s*ix01${kuOuDB24~m5)10iivgvpfpYUc@nXa^t5 zt+{MqK~1UYs$8Wc^B>I70O)~O6dd6PVwGP|8*>rp?8K_}Qz(8&1k{mju=!!=IumYV z-6_QTQ2NUG&jH|lfU#@XmJq`QX8dqit73|hVQQ^rqIbzmQs(3Sy^=}rEgTT{@Dv$7 z8$lo;5ci{iwpWNzqQ5s2<>==P0<`Ru-?-WgZPxAAf9`vF2MX3Hn4{m@^fovoT=R4@ z=iynrZ7z~3OgA<(AfZVc8VZ2TwF@ra>;&fqLbLRC*gFMHc)bTW!(}_zL7!%xda113B=o{l=w?qE=O;@d1`8 zFCwKWPiOP!Rywx7gky^Vx(>js8;r0U-ozGaPbh{@5;e=pSD} z#J0rNQLX-`75V9++PstR(A_-(O7)%U4C8=hf{`lAo#JEic?#Ge)&Quo&Dhs$jhdy5 zmd3K;kOUO7a6Wt`2 zOWGKCkpN&SnLwA$KULqH{Wc3YPYRW^-TUie)MB1i!UM)u7x0nsk$J3mul}M^xvf>L zbB`?`38DaHv0$_ffOW4U?-kgOwYrVoumTpAk!!uXe?qx+8QP{=z-0c5kAmidGVW>$ zAZdFq$SX1Ut-O!_fPixe7TRq6@u?`pO24^!|$}z@PA0iou6VG zZz~;F+@=sovp~w zA{bNgjmvr1%(MkHu=NH9_QWnRNIuPP2j;}s$=kZMY}+0yE!iiW-?b`AlcYA{^VJWK zCZwQ|a`%~C33*30uyFjU#E^bsN1k#3A~;4!xctVrHs9UW2?lX)_b?cD4tGDYF4y>l z(e50^3{wie-u(FZs>e}hTVw@?%T)au+0>zOW)Ql3Z`Y@LQxmZVXtCTx2hr&blxcg- zHJ1ply3nH9(Go)0=)xS0$ajMl_HlFj#Jn{WvS(DW$bu*=Fvq#xeG`U zEO`W8@H&*8y)GuSff&ri@K|kfzP4jBHRj;ao}xsq;LIoNU>;WEEkT&VvINkKj%| zk|^5|@$}U9L+BO?c;*dvY|5XlSlZ{OJT%BOq&sYKJX~E)B2Q=%!~Uq0 z7Cn*{H!D$b+Hi>KaJqXFEQH&9H@cKFWg&+^T*RBBcr}GKjt;Dkk=+2HYaedCoqE&D ze{PzxMUQsXK;Yi~;krMjeuJPjBo&Wn3_rD4&`&^!|e%OCyoV>kT&^W zyo)Vo|HX`gbMSZFdoL!~p_8vcT-#*^WoSGQ;;TE_rTDra5Ar_=Zg}f=c4*K-d2IYo zW!D)E2lMq;5M9WMmh~hyT7qa%HvT$6glHSldzWCNMO)qKSstAzQCADH>SCjd64ATW zqO)v>?%n78=lS@4c;~~Eb7tnwZ|?l=oHOUlz1>c0!vg?X?EIIgGOtUwE4dZ32 z9z5~wZ638+!O~)+TZ$y#PmeaT>E69Ry&d43@_KzR>FPfd=prpaBdXKqt+Mi&e44^K z85T89AkxXa>BI&!1Jx#R&D0yNtfZ5;v}zbyt1P837YFoS>6~M&){afPYwQNu>)!4c z+tx+u<48Y`f6GfJnW|iAZn#$c{cbcO(`TbDGR4DtVCzIGe7?C)_K0cvo{Wm84VgOE zM#H67)1JTN!PQmD?uEoRCjK31@vc0ifa)_C2FBBaL<;bB)4Z7cdFV5Lu}|#=Z5l_WEX*x{%)xDC9@zLYxfGOJB$n&#j#+B=Go< zXL?e&f>89DS6LbZiuP12xAroX3RI_^(@*Xj;pzytj36v{JcTzc{MDBrY{qVnkGaUMM{$+=<+t+ z9(6_!KOGI^%yFIisSeM)zQf4EdgJWZe^drM{M(HslKfXP5x;*YP5pT3vprEq8I@dj zJ|(puuv+|gnYdy+zSY=FMF3>QrI(Ri!~Lw5+ll6)tSEmEa?@eK-#Jidw$)Ek)Ko#q z619IRS@8S0Vk2kP&izETkbzuDO*+M&Og0hsC1J<+n4?cZMc{5c^ymQT7A(vpt3L4N zqkbNuU}H-4J^*JDO&Xe_44ev^s3tQ~&l-<8C@bhM}A;>r!=#xusDt zl{!MM@sz}D_R`3thntJjbuY}VD|2#Ey`&Z7)qdoqrf7e;Cv&>7-mrf4-5?>3<;JCx z2l`#_B1sk8aj{XNg)46D+l~yRFF23(JW>=SCDVmOf=&h+BrkPO%jrOkG#s);2jH0O=ju5kl)6tJ!Cqu(y{cg`UiTfW z=e;MKZgaG?)KFCFCo7`0@5RR5=}F_$9GgvtMYurv^=+eFAR2X9+FJT#!bt5f1dtQtCRN zt9Rpl6-j30L+eIZfV>1eNFwp!L~3#LJ2{+JZ~xpfrt_t9)w*H!dDbn}`>T}kl$`cn z(Yl4-(``ss%2P$oSXV^5H%8qjHu(aGF1}_Q+XZ$zewjE?LQRUgD!x0T*-f{y^GmU8 z_8fP3Bf$4VLMH73XN_+v*w)3h#etQJ2rk*O!MS4$@!@s4^nw?`g(kU4XnEcpZN3hB zq0Az1NEu!lNI%bB%=cV9m+$!xM+c$LCNOq0;4pM&8?u#cuyS&sb9)VMe=7;jJ*CHB zARaj}ENKywhl*Y2r;S}_V-*q>{%FsoXEnBui(UH`Yo`&r*3ZPxBHQqpK^3D{(GefI zLKz?QK(t;kWW{Iq10|T@(~EkWjgf~iY^axNYod=y?!b+CL2r69%Vx{h~gp7f@^_MI@i~cn5nSRV0X5Yndctz@YYj~Y= zc}uh|Gat?0o&6b!IWgP^X8e%aZP|cDu?k^;aWcqPQ&q9ejZn5h?fsd%no5-%1^x0W 
zbkorlCk{~;%#_(~**iPxtZa+#q?2S=Y|TD7-VK&3mry%}CI4_qKft6KtTc)?>L!<3 zCA*Y9pUAO~hSn5PPwOW8J7(yjpH+)$RxhK=XgD^_tz)CJrxc_AaZxt}Z-IN$3s zA@snHZLR)RJK6PFnaeRTG%GTTbp#c;mn>;NP}a=yjsNNbcYJHC4e?$qYpoE#%a*uK zp7`GBg?s>V?#nBL<*%$MxbyIWwk|&<;@Y%icpu$>9vl`f3mF8V1|Ay-OBd)u`}PJG@5p;^dc0j8 z^YHR;;69$8o^CuEkMhqsG0^4c<5lJ1ay3mMcH0mgb@JlqeG7Vq#5kyA=hyM9u)W<{pC@J z{*fil62SQ04)dD!B4lQb0cjs1d%ND`m7nlT%WbU#%i5WU`t^EhBYHV3`iyLVQKERp#Wo} ziSm`P@^Mse3>aoUEH0Rg2vfF6KwPK}qXohd9#OsZ$~J8X%&Hu1tstWJ-l?3BpUUbs z@m;^Eh`@+cRk4|Zrwk4>{l$1H#0c5=&Jw6*dbb)x9lxecFr-KG^ylGT(~Fx+uMKzE zXlu}8mAS|+IPh9HcHCRWMxH4Ya|QKW6>1Vt4}cFpLw|3$<@0a@oU#;y1)_Y#KW9sv zN+Wvha6qgk%9mq7Wws(v;n*6xOPC0tKFLFwkzWj(zco%#3a$k?f&23&Y#>Zh{dqi2 z9U$LWTf2Yc1(^frOSc%If~429bBu19G5M%a)LHrB5bko=mA($u1w;yBt4I>aO#5{a zYP@(mjI>4`$`+bUS%>0w4{q@J9;yx~zEQc3(Ld_(@!oUtpuj`hE}=NdOrx=HwIFK5 znzn^Nq*20}w!A>(jZq1RhX^r7I_c*@Y*fS=$;?KoR|LX^^ei}s+_*3CLJ$worfK@X z-E@PMq;)yxF&=^73VrM(qJPANC&x(%Q@%iL!EYLm*OVKKE40awmEWpKyJG=i0ufOW zF&d^QeXRyL)o5@kkP>N;3Sn-`4$}MU4ckAbzOnVUBFF!&z~DJ_4pel z9CJ4@`KSjdBa&7+j0i3xW#tRPg$%#mK{ENSS|BpS62e6LUmXz(3lnO{ml9g1agwQj zS%iO#on<3x2n-JKUf9(@ok=Q8)-s{a9HI01s9F5gpm(X0G3dSaI43R)ff+LQcpe*aZTGzATk4IC8~xQ1^!FS^14?tmV4Th7i2627R8JD}&z-2waZVj=CrOH?vp0Oj&uMbY_^lN|5 z)}Rj6AW5}}HMIUS)@XJvmqyq*2K7^U^{)Ixdp9eP?2qILiDs%uD#gK^SP>q+6s7L8 tTtoT_o&N*cd|1c7cB!+KMB~wm5LT2{{!^Nmbm}` literal 38253 zcmagF2Q-{v*ETu`f*^Gwcu?i{ap~~ z9!O2;@hdOX#+Qk+shK7^5!iHl?*XG^SQ$%KtkGmz+@(sDyG`Jw%@@#Q3 zQV^^(){kq`&qoeh&kV<~ga17c^z>^WrGJl(&-j}Ee#uSI4(9sz>aco&0u9ymV~z7w zc2wl`kgCn?aQGWO|hTPz>+=snV3>dyJm z@$m%Y+s(a4OQW=mi6#xXSc=hQh0z5vIBZA3FX$JE}!&gR8Vr%pLQ zif+9zXFs34+P;bp@`vreb-dYmrCirONT?|vgLnF?0O`*+yeo+{bH->=vpv7!V#9vFU#-2izgwCo(0DK%S- zlXi00g}JZ>?K_p8-Cvx^`JiF>nUwNv3HT1 z;UBhv>O?m=LIWKtaJ(ouW!75L7srP~pW2C0@VhOqETncyZg!9GhJDS~$Q9qmwGZsT z+r%y|7I7D_+~sH@olIUQf~!f}qS7M#D}s~3dowW7i|RipUp|W&^B~>0qq`m3I2YWpo+ETS zb?PAd>XUL^FZvO>xC?!DI@oLv=lWipZ$IrT(d%i&Swj8tS`5R!in-nrgV9_0Ao>bPtu z@ai$dc)h!P4&C&J{sxXX@y5z~6JP~2t8isEx|lI_yZJ~vdU;_L8_(w(A=N7x{cHRqchVXX7ve_aWQ_GElk74$c;ahJp zwFv?ByaP-|z8!OZZG>C9E(Hgxi7g@8yU zGtQtye<=3Dl1?z4GP?LC55fGNxe@3UzG4t1!2+c6 zxO#V73*S(I>SMI$@3m0xutBY0z{`;>-YpMtvf^j*^mv>KL_7ELrP&zBO>WIPLf_7i zTw)c-g&1n4aJGZ6Amd9fE|8!kHuhIW;>N8SqZtwOT1X3xg4mGRcB^uW>f)nCrt#i# z5C}U@=*cT*(JNu>j8> zd);y74nk%ST;SJ`2wGp@c@3kf(Y!}=_&D;FMN=8%jGoNufWN_w3z}`;N5nP#XaNW8 z{R>iwAkVOiN;ed50GVIgAnnT_PE75?MZ4$QbA@{|=43J-Qpj+#T~|xN@ZG(+q!v#; zi^(^>C^#G55q`>P_qi?8z}1c7_Woct!T?L?HrZsrcbxaPVESOTYInYU(xKL?CVW#+ z;_{{GT->qe*sEMzmwc=Jy0xKoRvCfvRn#QcJ}GXLZf1e-iQK5}844b3*?<~Bylvs$ zX;Re3&l0Vq{B8pmo3Cw=Y1p2`wbw@qvDTC-n@KSVrmW(}Enb?-5%3VoQOMZj=cV6( zJG9i-5ctv7jK+QWypJeb{JrhhfbycB=UIxyXbIft!cVEL9MoF;J=UrqJ1=Pwaq(+( zfq->CZeslSOy!9uhNnOwnocWC{MUtOLqgH^Y3KBL5KIDVhk_RcMeH>B@sIuDM83e9 zgutD`;9HT+B~+;v&_YbDDW=x*VnUwiA|LLQ5IB6S*-(TD)_>EqjFSF^;_a4}v^o58 zs5MkdxaEOifnyqDJmW_*r7Td$LpX-T4O6nQ;<;Z3-%`Vxe1>l+Njo+zv-yhdNH3l} zKbk7M7l*ZQEkeb+-!6sD~n=i1)7LG->tK*ZGEzJ^G3P~)*a>nG~ z@v#fa#?{iY&Z?CQ84-|J490m*?W4!4fo zdH?lk2Hf}Zr4tsZ#y4oBTK2hj5j@kA7Uq zJ^40mWsHJLXTa?*X@+CGxZkol{p6#$IrA)e*!mTocQDRV3X6fygJpD2uoX~`AIL@r z=s9{rXx>}XEZTpQ#LMqf3>LV4r{o0m{9fFg<;_-X4I4hwZnzfgQKhuB%}j2k%*5Vg zR27@Oe&}D6jvtWlC!XKyZhkm#$y5H>N)gxB?}>sh?}Q7Rg)NLMqkP9DJs-^LrhRWI zBRG=hvk+I@t!pQsc_nmoy3K1>Q6?sdop;0d-`-Z*Ra@N6)7)(2jASrkt+(}B5x&Q7 zVceh46??>Tux*|j(HQP&wfC&yjU%lPQCa?HTd!q-hV9>xLGy`xSd)=Y6f?U%jRd~N z!zg&*?t<~;QtGCEel>x;@do#Zq^CQRqvx_e_T2z&nKZVvO&ZUO0>(gnfH6TVyt!#n z?FvMA5UxpwU+nwgaRp5pwu;#iYptyR)<#gIVM%JpprL35Y`x^Z$T3fdjFovOo|N4q znL>CGKaS&S|H9inue>8!IxTrji!~vc|ML#u3QKtdn~fz22F9%Q6q8!f%-G|vzdc_! 
z=SW%K$s(=d`Hh;16)#&pU)9Xa3Kh339Q$hq^*|ss51O5y?lgUjfr;Ypk?ioy ze|)-A_ovwb+J5OWuDKjn?^bp%(5uBf2$pV`=1oRk<-KqVqW%b0hz0DB!Djzb#y4m- zPc7f{pzNxmdwHPH*2vL;RvQYHgo@ofc@cfQ*~+9oH;yGeI~E*HK*1w|Ja<-|8~>v0 zJJ!|CMFN6X1DgZLE;<^b{y^v7u%?{VpVICSLejTj{X&(s&}5QUr zZ~SF=sZ%Kz+iLHSnvW_A9Xm`3WMagRdB&Qn`6m~eRkeI+w@J79g0O2RgbCZ~UO11GPz^; zD?C3c82d*>FPwMa=*-IVMSim^w>?rRg2-w|Bk?@EX5c?QnFc3peK7R$V9d4`(t zaI0+FRlY^1a1}ZH4**e!!!rjUTIKjcd>%{~d=)DmxC)wYBc70enZ4HcBZ5ti$L-Jw zNd(LHEXv(CGb>Aie;`^Se{F)lGR^~DPtj-eJ3vX zScxMsAa3tt2aulxefxJ*Y$vvLfHw==G!9Tb6-=rl4XHCcBZU3!h~; zz=%BH$&|KNmzhlBGTb|<%0I!13gLYH$$x8&D0PC_p&n#r)$(g=Ya?AB?$Z2J?9Jqx!cStlWToMafm5A|4HoCoKSrSYa?@ezx&(SG9pn_kQv4`hIiRA9MtV^q$tTXrTHmM4_bIN_JlQrk{!Db7HO{jDq}$r8Sr{ALtNy6Y@Z^S+IFsA@$mbQ_QHh-r*I~X zJNd8Z7fL=FlRiV}4IK-ykJiwn7Fh+bSJ894=%7ap(uHiqPNe#L;9>Cxlu@d=VS<#B z1@k0Rig>D;A3m!Prm1RNn+;Xpl(0!rBWu!4Pxa3tWcI95RVB3w>gvxHl}boO5C*qt zqeB4&p=a&y-@gUL#kqNTT*Em^CeNOchJ}T3aB-qYKz zCyEMT!l9(Z@i3zpwld^mTA1APwXl#~T)bL~-Wmi|adL86SXcX?CDW*&{$T{n4iuj?ocSjx^`RX%+F;!SL=#(DZ%KDjZvo#hu*aivKC&V+YM%U7kOZ{tLFS?bTjEi z{L**TNYxM5g2^zs@lg@6rV^NkorGfBritFj?u`uhI+Lw~8x@6y5+;#UgbE?>>_gm3 z%Cw%lMh#xVpP$C%RaGUPZ#Os3d31JmYM>+#4_@X8`JWxIekf0g*214yoFgMIQy>sQ z>u?a+&_2Ds!1Ktr(SS9b>}_lok7i)lQ*2g#dHKh?5U7ynmPuC(6Tm4_%ma`O4GpZZ z-+-&^T^($PB`3RmRZ!O&U&~Mx0X7h+O3%sJQK1VQ2uR5H$YH$HluSWamzIW0OWK?l;5-QCxH ziR5-#2$-eZB0qilv^qgbMrK&&W}AL44iTu|c!d673~Nj>cB=g%&BlGy!A<2QSoNxz zSo8P^O;o@?x8RXvk2SI|9&_QkW2r2~jEJ17bLS-_B<$$)CUAZD@L{TmEj36zna}pv zUrT0JX{g(4^ZNu350Az6e4~la+LsS^A>m{KHH&EcI6~+UtfvrP6F#K$Cj37qwMSOP zuiO19TdnN2hL*lWVoHiQy(I_;W{dd-Z-s7I8(<}eg{H>S?|psGNJ&YjWmHDi35aej z{w*qlPJOd_IFx2d85T1C;=GV?iH90;t)RK-ym&WZ3>a{DqQvO?`?NGhkeI_{OIsLO z?RL|7P^|CJ-}UvW^4A3E;;_8z>>xrix}Bc;aaDGn88P@Xh3y!sc>m!q8pFr z`)gd5y-t2bNc)>Vd4f+VcXAJK7s>Lc8d6y{@%20+0&Ko8NK1-vS*@Sd#R4p~$TKDdXRrKE!BRPyAvmN#g<|MjA%1xXOq% zuV2ml_o|%{kL3*B;io9o&;vFi2Ikpf;KC~ZuqrRlH^*_R4A``(`~HA9#G(bnrI9-H zy1p}-9^!Lg^Gsd67I$^-%Tvb_|8ZlEa6XJE}4)|uYnqW0rEe?G^31EQGqz0msu!El0fHTCz{)Ya@LC0W{QPe@EkR`oFY z1UR3Z0SCIu#UgfmJRvEWVJcNAcc#t9 zyA!>dIwb{}v4ML2sPHeiE+k<{!!Tr07=)E-&9^NhA{_f=nb~3D#rZJt|3Z}X2nhkh zZ=t$Hdf(4Ten=3c3cn>{5_7QFUmX}7AOB|2ioc4tfVKM5Zj4z%@P%&DW7{!d!%vb$A*3|m%o;` zw`5FAj7Fvm(s#Qt#bvo`cc-0__;7x`tT_`tD4XKGKCWY6&;twDPhWNXKj;z~dV|$( z=dnQZg=<$VOTYVsY4^$Qa{Hl6+oz|PhL4njNN z?L5Bc>~BJRu+L_VC*A&!*}NW9Y~R1|&9Tk7y%qvszd^0*HCHeV*!~kIhsyX zUVxYPb1~mn;>iQ_skYMtBwXo4YwfQwh@$+R9PiU1IOJAohbW+1;|wSz4%kNV-slUGPcm7>v8RGB)aC);N|fs_zM=UWDrr;B5cRh7gR@uBVlzR}cxVScvnMa;k&&U# z;#P8A*RE~IxV=j_Ogm-=mt32q)I7U$bEV)qj=z`F+F)vZ7WZ2!Tk5@vXd2%_kJAiC z7;DY(;`aS$E$ zoND`l#{(rkfA!ekBe|w@P=whvrTa@hIQHTkJWRF{^QLBVNj1g?a9HFu96}N^l2w}% zZvg&6ZRq{wPMs;UtQpOks?y4>YekARfe6~{Ix zdptYhH1oe8Rj7|=Meuu;c|Cty@O3Tn@7kTUMt0^L2EDz#4F|*O0GX(?8+%A6XodzL z_UBhqMMZ+@g#hV6nf+V#8vwy)@Vk0cfoLopbd~d)73iHp=A7pzp-v;ci)PUAuN-kPCmR<^X)>YNB6>3Q&3!~!O%d|~jYCZRPk=qVr_5T>^^76~k_xR5ToW|8oUjPbJ zBLP!*uB=R$z@>?A(Hqan(j+b?mkFSn%by|k(xx-Jd0?GvpM~7F+?DFR3$-5|iH|-e zn2nw?DdkT5=UkyGE@pK=Y*A7DUx*x5)<*MNWo7N3PtgVqUM@xWCBUMaGu7hqWk8LR zoqcbV?S9@UHU=-->6noRNEFmD*J(Xe6<`%GM|p-dAtfL5SKXq5!d<8L4X@jY=mbo9 z0B%O?HXHH-wjf7ML(_}Ftae1v$lxya{g#dR>?dX*E^V#a3drmA!sJ>y60@6VEyK4{Y*sD7FWGzpRjdnf?;5 zO*<_a1s;Zv0$ub(XqSK>GbSpNR-ugFkS;0Sp2OF1BrO7-n@U=dj3phi~7$IdxA?9H?u)3E|D&83M2k-~qYUtnT*R9{ z|F`VOF{Xc>m+;T2?`BR^jWz+4VKN1Y>^(dD`(@1$phHut?1kJmS)}Nij9=OcAl0SPPvNzb-pKnsb8_mlzur<=6G|K&s{0%RB#P1|7y3} z4>7XeN1Gy5IS#bcEoH;znMJT*me@!)#J}w`W^R|Q)Mx?elj9Na(OoA~yiw-xzjFwR zr%H9I*G%(fOwYcPcB3us=gp^Ju)pqeBWhZA#RhJp-n$;~&Hv5<=y3eKMk|t>;wq)R zNAX^;GGGTBkG_p07|a-~$WHBhScU*~;LB@NdBf>__wQqxH6<4pVH)n|mVJp2G_vFx 
z=RBh61)&FjeggE$6?b`jY!UjCmzzH-7adEnDSmXS<=ZLKcuoj+nk|g6%SwtNTRjjH~-IMEbf?aP3FsLm~C+`bZNrG{W z%p-8y7v(2jb=>fbRXY5!=iHcWeGHTMBfC)^&~>#Z;Q|I#LI*z8)z#^q6SH~+sSDKJ zfGmIO7idS<9F7~;{!TVF`abu+NTH|vi!8)4y-ocK^}B0~4NQ9U8ZpkVz|?%#q>fz- zykOkB+vwk&DyBi0)=^qwT>U~@Tfqb+elP#ir*~=zTzKV{z3~(a=hOCOI@;PT^Nqf> z0Kfwgev973`SIl2EwXQy?6JnwD$aObWMRP3Mlt?##NzV6hf%v-zUNiikp~|veCpbO z&~B)`M_^F#h7jOr1XA1AOc7+!5`<^se>lcx{G(VmSPxj@XmchV=ql-nHET_$pl)mV zypSg#Kf-i;#T9a8IUHr+bQK|*Uka<-wsF=gDbrW zNJ%)CzL_^Cd=+i`{|`&hm0WI@MKAtaX@B^?FHAD5eN^I5eZBb@7`t$JXj)!P4jmzJnidHQqdCE$vq830T*i@fV7p_9R?9Q+8 z=H>rHGTsLduFXET8BND;7hs~Nr($SmSp9Z`9E^wn{L-7Zmm=*Z?23`XF&GsOnfelW z@Uc8YG}TaM2fj#Z!R*|KjIsX-<=D|&^vBfhwxmeSY-ipV_1nl1H>fzYVE9CbFrY4# zx$^;tj!|)`A8Zn=@8_Zpq{ha^QVmn0E1S4pc=L$MMqNI=#3y%aw?ptyWBThL!7-Rx zO`w-(fw$92=O{;T0Z{MTEt1vZsVoy?;(J|rUq2G@f`uPAco#r4)i;~-y=|yJKY~CE zn*;oF+armndKJCzs%^;oP^ZWf@tIvydQ}OsDtrsymi=eA?+TxTbf8C+5Z`t-{Gs>A zxi`@M0f$-~3jWNsow_QrBe8rX$N&7u-Di=?8%P4_fM4a7MXR#*4Qi|cWz$n88umbu z4ZA?q!pH*wL?#0Q^x+1SV1l4ota2kc2I!EwVl+K5np%+%(zE~p- zOlJebzdztmgXF~956(BL@}}Olqk<+!OmsFEe$IgAbH&FqbiWg?4wHLB6(#z*SFx2* zSOE|*yd)1JL5)|(w-GXv$D7suipfupJN9*I8b?_=RJYK*qerw!-Oql(h})u-6XAUR|l;CGq%8MU6p z7Zv*K^=tN|q@){_Hk!riYHB&<<;R(nN?^)EEp@OQrh&C8)RIBM>{=KB|G^(X6M?n~ z!a>|Y)xp0pwXDsem*`)Tw+7X*72?%^__3zw^OBXcs~~v(SuE3$9(?O&i41Q}Qfg|? zKQUG%i-zBuM*!2KskOJ79W0J^Iiwkyyy=24yZ>BCYIse26fN+m(mJ^L*PNw`(0mMC zGhwk<<#wuJ^G^8GL$1{Uo;Op;1ZfNG!^8(&K>R$ue9@!v+~S`O4Aj~@jQiRF&`0NK z;RcjQQb1S<{-{wwix48sBQCjh@&R5t_LUi$3^E}!A-$07sBr(2R!DfUGvzLGwd&Di zDE5P!k{>KvroWnlNxgfY7+M|>C%_Ta5me9*#7*ypJmXLZ(Y#Q1!*CBFCpd#NFYX-c z$a#N4rJH-YN!Ipqt znLOW5Q5|*jNf8u|VOGuIG{n-5(+%2@fUKod5B&9=kPrk}N7Nm>d`0+){pD+YLXlBr zF(Ilo?fh#`270Pii_k?|i0=&gA==?PQa4n-yL`0nkl{>E!rDu*>&zO60)I|xl}$`Pszz? z8ptDfl+gQ3)AG{n+IpKlx~!yai+tOwbJCQj^cB&VqpR5_ktbA9szOpMPaUCTK~EMx zV`|Mjd#!Ao7_l)uzbk&v)Lwpiv9!cD%pRp0dDtf-VoB<^^uTMaerGz+M=^Z$s(9=& z4E;fe03y<%!_f!4V>Ie z5o_(7<$b;0{jZ{`n=!(3b3K| zIq#lZr6)PnQqwqpI;V5#13{;GKf9cdpKtw1q0YIdymIS$y;yd`LVlkis867iIq8%# zg5;T!)E)H0Sb4lW{5bs6md>kvmKJhMt$hZZsg2R7uWyZTx`pIpYAK-D`cz~){rH2j2W1`y`h2Mf zITx-KiYDy;0blm+bQ2QRdz|VCCGm?QmqXs<+5ZPWh1BHcyY~l=bbU=hP#S})%j_AeeD2!ReHjc zws(8P>D@2-7C4UdsRurQa&L)`m*?iLL0I?I?;KD_<%6e0Z83exbM_X$= z;Kt`d(V;8EGFPfK_#|PJ8mt;X7bwoQ;x@v7M%4QzL?=r1DS#KBRvo~A#LUZ0zFMd&E#dKPGrjA* z>Z|u@GB*@3&Jx>X}CE>imy zco<=^nw^w7^)iT1#%UFm&Xn9!XM6PloN1ubyP&tdRJUf7p~|76q|1tLG0sx{m&$8( zeFvFxSG4!?NNp+*kwQ^+3aFl1kV6W>BtM%|`tWR_pO8{xfbftv3z3Tc!8W88RHh zKciG9l+9W3_aO0VY9HR72QTNtcrpoL7xbg75xG@8H@Lx$f2$OLM)&l=Bzo_1;wTJK z0C^2XXX$ht)0J2)&)Wg&k%|-tAoTo9j!4t2p}RVTc>O--(7X`3zX$R1oIt2P%(tadO4Ripc-x+rPL~pP{W+;mZ#?zF!2}oZ8sW1VrpJr8)t-eQllRj5CzX-L*mP&*LGldko8!0oaAnXasu@ zI5)zb;3H%JZEWcd)(YhS)CQfm-5NeV3oD@_0L20&_sQ zKniS{5wJzYboj`27?F@4-G%|)cOz51Tyfwi)!i_gu+vZSTT;RY>|tGm_xJAt(S%P9 zzY|5MgN#U{)d;D8KkzwbKZA#g)>Ax{0#mKk6$~gU!W)zojL6}9a%_z2=`dn2xKUAv z(8y7|TKSb93dkv87yj*X9ybK|dBHz<9_|m>3e=GmNn@IGWy@HBX3(AQ-UV~`C4ZPB z)rVZq!HpV1gwvJ~={L}^t3gB_wxz?F*o#YUNw5+Jq>t-j+L{M2PbJ>l*R#Y(Z076p zSQbRA%72XZIr8R+=$?!ciGmkV4vFZ z#jp&kc~eHX%EYyNr2l?Z#x0Gk`T19A3?Xq=c3vs_@K2z1ssv|!rup7l0qVNWDiTCR z-;SwXG|(L+<^nsCLXnquAm{N&SsQ3|s-cj0^_FO9x>-l8BKJvmmtK39GRHF|UsgLY zXPJ63f#t56&f=L*rxoCz9Mk(lLnx6SKXkK`%q~ZK8{( z2KPxT`nxDOr1=ru$?KM?9PgFgSxN86pRXTOE{inn_Fv~BQ<^bv8=w_oTr_P^;?i?R zb(0bp(z(8C%~q4kyI55EO$%r%0p0+a?OE?!NqYiRaBY+VH`Y~@F zZL>8uvx>Xu*$d)=Q?6W~zsf;H^86Jxhl*hMbieP}SNp#`8YB#eyY!Tt^sT;v@8A}6 zwfB^7+$<$7DP0SOm&$#p2pwp+t2`vc2?Ma`J{$d#K5xl2N>4sMvo(b>GS%^ z5RZv8;L8PTrk9{r23ekj2g*}2O@7jtYoO)1a|mp|nF``n@r!R(K9GX@`%05)V&q?$ z(O21x)!Q~qObgqWg|B15jZgITTpWxFdDs_SnvdqKp}Q@E!Ms5(Er(FC(ijh9OEXBl 
z9w#g%9;iaKe$4#4WHX}8n}LcX#GqGElZzI*TX|MgkruME3iDa|2zCwsBuh!&uX&#x zGc7Q5Mjjqa(*P_&?b3RX?LMJl@fnz7g_qJ9fc(m1JQfH8K%=3;bfsxBp;ftFtj2kr zPn6C+nk7Y~0h1iiB;GHNu*_bUj|&{%T8UCc(Kdo9IlH=rMZ{x{kA4XMqYx!!Mf&Es z^f#@8^*}fD6oJ;o0NUZ9_6q9U_^I?>N~P)l={=NKJc_5Ou^YC?ES>EAnV=)?W};M= zum3xT3m~VrdY}D@I*4nl@GoCFwyu(=S{w7xWMxETwRZjR83L&c1b0aR;l4%)bS2z- z&`UwM8%@K8#{aVuQSXoThJ?E|{0MH=-PkyiQ{wOdiV9KWtJHx6;_$Ze{aAELcCxIks8Q*?{Wi-pe5 zhq9zZ(h&rY40iIuCEp)V0VGruX^`5P{B@aG7E2*f$E0^DJR5Nb(ep?2pEtRa$2vOB zl}pLtshCo%XN85=jn63Oc869f_|h=gLfZ%UNKz?f#;SfwdQM8FU{Smt7+L}t2%Z7& zZ>Rl;6b9%W%u%}IOqTI2KTv_x$&6aMz7_+)I<-REkUP_HIZa}-5wwWR3Hv?mH4u!m zn|w2aAVwf0O|g9QP@Bz2sqP+7AqK{M93&XT-^P!V#?A{@`VKfa)x^#qiv}Ow6Q1|d zXavS6%Bok{gVdA_Q(KK^DBHu#Bid(j-aO_|q(%t+(e$=CXwg`@H9t4@JXmb$@^;oO zdr@E#B%Zph7LJ_-Tj)ejY&qp`Ze)A-SP~Zp;1g;7)zheLA^-Bd*rXzHdQIhp-$h}} z`9%AbyS5D3d#Hy!4m?~R%1Iff(445GLW#CK9S_;LQn%Q-W3b02bozN-`#x3F-9}Ez zFI=53Q7M@E`srFnV^^X?^wDb0z^Pnn81EgbHHCu=LgqQLz^l=_o1Y}_UwtbnAtHIO z)0_V)H%iq&lVv>v zS}RGBt>XE9&#tdObr7%?kFOtc)!uMUCF4zVmB21xg&+2edYK2^1e4 zl;4w1GQ7X<{pQqzkul@=%}=sK|URdK)zo7ZbQ1o?t_L#N|`1 z%%U(drAf@zVVohxk)f0|bI3S1Gi551!^v3Sy)B>jBhV=5`z$phxuTV;Ook6NQt(dy45t^W>P zH8d6e9gQYYbG}N+-qSo>idO9bwn?;D;>dCfuF=0s_uTh;erA zn`LPdBS8;`KsBzz7C@Wj^i-Yc^t7KzOLLr9bs+p3vBj}n88^^O>ENP=CZsV8G6lCY zx30Hp4eDQ;a~3cspk_YS?Uy_WrjCET^weT=!EK!#!J3OtuB!EX5!gvXl@l9ocUDU7 zXoxAF-gbGlZIZHu_gHbv7eV$)=?yUV!8{UuzJKP+Pg<2LB5}N5zGhWm!iro};@~0_ zr6jlL>(2hg(NF>?8wbGbZ_{JLLXe15K@rTspY!}Z+Y`S>GM9ozx|&K)wovgVO9q0Zbedjk(EiQt@OrdM5any?7M{FGt7>j+w2LxgEv1uB1)C-nc4^IXF6i) z#%3R#dT(LQ7yz;i#3iufGYMdW09@?D^tEmP!}HVSS0zlEmmGHn-(jz4zj_`!hk~j5 zx#%nr2vmzb({c5@#6BHNxtqS}GpipJ2@;TA4H}n)#zZst-ie+1sHz5wv;9T3{JSn* zX_jQ7dKblFS&^s9i5F9E-z~Wvnu4-mW%Kn@z~i9+GjN%YG`VUDFkd$g;K*wWTL|j2 zD&Io(+~5HVV>jYW7+LrhrhD=4*-8riGU5rmWhZ5rX0C)uC9R^{s&Nw;6Ft1JQkBw0jjFOGM&}oWiPwk z#1D}pckgZhYbv!pBQzpg4nJ7_suprOC;y|ox$%qj;5q)Sw3~tF4^e+C5ARbU?hUZY zf`atNH|-mR<*4Ry{FC)Wdbq-f`!ooH#I!qufNLfHK%Gw~zbh27{3?#tQ7lnzdkBmI z9jKT$x!s8d3RQZ+=I%K*hKahzeqi+F-HWr!oi^F|a$rCb^6FzoUk~2hYtbqzQ+K$o zwT7~Fk*&QSN3>iP9GaU+`9kSUU+sO@YiF1AYltnb>WF0;ytpNEYB+h}$x~ZC;?E}M zoht6x+*=p2KI!U3rYow8Dv*ABKgN>FDn0?F=%OdgImM~bo7I+m@RnFiQ;aa-n#(1U=7s+k|4rS-*U$d)H}9x4h%C5)P&KJh6(Nb-YUT z=55%Mojez~#W2%QnAa5EJuce?Te#vUwxbX#UJ!CZx{+bYCF5LbmsMl6oZu;r z-#Usvd$u1g&gw;Jc6pph4n*$ohhLX!njf&g-F;`AEQnG9DW9_j>jw^1UL$!_97KR2 zX^?mQ=;tIu$Lu#`R&a+1>;_^xOZuvJjI3-m_&23c9nL1e_Y~%Rrl~)BfV=`iFPe^f?|D zD6F?&1s`(CVNQ0*n5JQ`9x>(lF<jS>3bI%c^ge0QeUW9(JCZa0Z+s>}FpR{B{@ z>l^r5@Qcg9391{NnV2X%oOew)`hc%zcR13=-jjV0%qH91bg5!1seQCvZcod1S(Zr# z@QGnST#vtP%@^S7c^)&ozt~pR8jiJSCzGW|MRuSXK3HNH(*`QH-9Ezc9ljCC$+_{Q zYq2gq#Rf!_4V)N7v@;C8GJ3-({=5cZUFsb!oj3Qmvx!vh;vnL5YklFjg68e&o42DL z0p>s?WOfW-DY3|E&wY=l`jy@OH&s$v)p10`VuvO{U~2fcVQNWFC4s@FN~VdQm*LQP zQc2F84O4CN88{vg%@5LrQawFbZ3E{Uw=*uccX~Usa^-|$B0M*jlO($YnK0=1t^WU zn)che`t_S>x|Di)LB)8-jAK+EQq#i5M=mh@cbxzYam*!MBNtT$>S{rN1))x)sq3lJGR-Zlf<(ds2N5~Y8@~CgWy4Pk#{ip#_$QTEdXFH zelT@)wh{+CYZqMTUe=oH(UMRwRb(Ma4r)g*q%iEUmg@TI@2+9xG#wTBDY~=9aZv*i zIC)>}T~^&-(M3<6M-LL_rSYimDcnm83N=%t8YQHv{}fQ?QNB5CYHCBDeroU`dhPUr8*C4%&X4$NXKVzqAnH9nl`QPHWVP6-2QU8 zqvfNa(Ee~!O3zpHQaZ`P(A=&k-sgV9DpOc1x@_8L#mC`}ScA|BVV*~N9~1Gy;;HN) zWIj+vkkb1uy?u&DLjSw4ydgVT8|o+}B0eH6g2o_ePHMHX1(xKSN;lsP(HjuB{c#-h zq}XkrX-&g9sMo$;zLJ!j9ciS*J+k~cF&+KAy3}ea*Gh)!N~tK3?Z|U|aF|5h+qETu zkEJ;=EQBuJFsb*mG^^E7*OY_SLTs;t-A{X?@}awbHhgUC%(Yftkl53}ug(@*LuN?hc0zR1}}deuz$>tK{~jBqJ8I{Mel>-_-MyEnJSM zTmLbpLM-_t?!aSfpzlwTjO?QAo60xH+W&{BuZ*gr3zj9gLxAA!?(Po3J-E9&1b03h zg1ftWaCdiicXxY}d*54+Kf|B1&YazQS65e8&)AZuD|2kQZ~uXS!;4(RMPbJLO9(Ao zMhNYp&&IfS3I;B;Oa~qCI2+^OCr>jvXRYFhB_EA&=xTmy>C<0YW{XT>*%k`M&-Nf 
zCkH(FSON*0PPklUXoihy`y_&WgMa&vC(}>QNe5BBzBpyHin~>Y@04pu2pr)KgS17G6 zSXGd1?8FXS@lzod3jj|Q4Y7G*Jg?_4>}tW4NLJvr;k~JQ=$%_cCD0;ZrIz*VqJu+k z#mVXAnPS= z3Pr$Za22bab)PH`ET8Uz0G*o^&rtF|QQ}l6Q4o^qyLT^re!uK&Jshb=2{dM^smQAy zZt$s;ZhC+EQYCTDG9Y3YKM58vGdQ1dWy1H3r9gz2tGqgfy}R1%e6^CV*WjI} z|45#iJK%eage>N$Z`sgORR+DiFG)r@!ha#Pk4nGk8fvYwe`38%S&>`9L4vh5u->1X zVBXy^?H?G?()MfXM70z_)~iM6GZV#)2+~UsCzI4SOBBrzI@t>z?ncfIK?TP!Na+z2 zU`GwWG)xUxSt#)gMbUUKe-J>f$@Wh!vxG8@*9p@3{66oG7 zonn0t#tPM6A^pz_l)Q6}x{eJhex?6*|G+Eh87oVbIV(<%l(bJ&$C3FJ4g(qf*Qy{X z^$$lkH{+%z$py;-c6P$#z>y$et&bm4sA-rIlZ~huBuYBiG0Er56@r8Gf1m_Vvx1jT z>y`wxg&}4I(~BwJD_EIM@A_Y&gpnc-e>((ms^%9Yjtc_Q1B=Jbi{ysoVp~~;@RHCy zU}`TU^5U_f;=}cPf9a9JVTx$FAp6f7Z*Kz*Uv%j15tm+z}* zQntqA_$5nA^^^DWvP%j&E*yHAbm+W@Y0vm)#>acYEh>~FqCQ&W^XdyTk1=~(Rvf%D zfxB#fYq}~HQy7$n8*fZo=03JJbJ>&&G|7YIohCs_nSKoXt!A%`5~25h=?!2m+&GcuGBL_+xCw);OaLddr2 zf7i3HSOZtOW6XT^#|jwYewwJO%8KZ*2W?Enh?8~s-_%`!Ex#NV5|A=JW*dswKoQk5 z$6v{L54A>|A%pvm_{aaQnF<8wfdtEul@awM#ediqD(UOfdVj6-_JO+)k@!zr?@xZ; zF;S(^*wm4a>B5TYzRJIfX+AXuDkw3 zyf)i>^OU7`eOO)S<^3c`{`hGA3}ZRAeb=LZ1%Y&d28G#3;xaOlQ2i1-I+4zlm`J2Z zU5mvI|4<|O_!s@|>W`{qp(8rHX7~7ZDxGc;C1%ty3v&sN3+%;=k$DWhnC}fS2nr3& zya=Qb4>9c8_vV~?J3)|1;o^KMj}le9hQ*u?+y>==V)u{jvx0;|#z8lDfUBe?;8Hz# zxdm|)JsoZud|UE$Sp5Ec#mcxCCLlrQ$Mj{6+JYk@B>PPuO-coHVLbYXq?L= z=03UdK{<85s!CF|485&Lqqt8+n>I_2e&}q&*FeCftkCw2JSQJhyi&;MA$Bpx#AVd7 zQ{aKmq?H-+FT(a@EDzQh2YRr~Opa3p_n#5zuB`p!A!4lKtN9u@jO@^AiEzbpPYywn zdVL@PbzD<>(>z24I95 z%tX(8Byk=U47EzN%!Kh^Cu764+t{wE3)h}KATV|t=Ay7jV&qof2f*(sY zZawA|i5!eRbWVTR2!cjwd_Fh%w=_v$gcj~<0( zD{KB5wZGcoUA$s?ZF#i1XR~q^4|%tu*z;_;n>iOxKp?9OXpHGCN!Fi_i0ZsF0kS4V zgnS*36-TW`ZGl<===5AI1u7UhGm$>%=VQ`wb#QbrfY-R0KZh!oSIWJ6k9J)OC?u#t0)CflY)n?*-5al zVW$T6x#*dpkN?9!0{jnT7?i9C_h=rE^%>J1LvL}&g||oL4m*a`zkSf$c+3bMQ)9Xv^=A8$;*)QA-+6_D^F9YslL!kZW$ zyNct|yD0;olZ$V!*Tv{uy=@GMQ=bI<^ce}`#vEF&bR%CN=KBeHQfSmXhH-HfkXlC# z&Sv^{Yg|BETMe8Wwkfahe#ZcL0Bf7w&x|C?=8uspTXhxBz$uBwxvvAOt!otK?#&sA z{Vl~`Pdf0A6Z^m7@fxWkmlYd8))i7xL;=Z_%awsn=?4FTd0uJ0>eqoo@6HRw?wgI5 ze%W89@M8JIA@1mpOjWZZJ^k1Sa$l~0&&9EUd`4|3ygp!5Sm}A^%dzez9VRy{t0*u& zX-=Tmi_TdvWMPIqg2ET&b63dEsK_|Q)#Z7{4!uKU={JI|M)K^KC=yhS$#j0a+tci4`*D@$y!}N__#A`|@cZnAC^IxFjouIemNaEzSX;XW z4=ebhL~&UZQN)y{=qnyJ-uY`6GNF)`^I zMu-~DMOb0p3DL$xo)5ys3uAu@>-?a_f@2e76hOz-VxLRYi=TSDzpx%#9t_Jv^*gSR zg-;po_X|s6mV~tf48){S6(eG8M=?b?5WU zh)4K?l$gH(&|6=6>+CxJ&Xuau(=hdPH@|$z%hKsDmD{9!E=|J^<@N#ZuSj=|_=#Ju z8}oSmz03%RLR6)jbA`u4uj@RewM0yBncLNwY2hE(cu7jhoHh|RSCfm~=DV()9LMdO z&lo4In6-XB85=($4h$PFUN(v7oGAJ3SgJ*9>5FsG&MVDNo&b@wM%df}1i8<{PSgon?W)S&uNwakxhbReRmnkEQ|}gNw*U-(erU$vlX* zmAccN+$cBbrVh|h{?N6~;F>ipQC%VAlJ)k~-X{zLTFP$MeM#-?-`M4`;^0-E0qv)} zo1FKMP>x89-8LX!xUW;MI~EuI8r8+7SGv^f)&(9EXRpDusuJ-yr*>g!;zh9abioPLoUKBC3HW{C}hF7 zeL?y}(GvXqHke0T_H_9pvknaEdxIvan%XivPRK<`+;TK-Sno#1&m%edpNAjMPd_Oo z373gdAil$49`aQT7~4^XyuVAYurc)&O$~zxS{enq^b1-;A_$kw=u^`R@j3rzB5aH^ zmk3V+@FlY$exsEW6YyoI>&5G4+Y}c&#AGmG23f?NxR3-@y@{5Knp?YRvm|rcM#p`& zV(w5n3$5jZ)-Zst$4c*udWfOmp-q@?%^+y`Sp8dYDUXuU3|j7y^$DlnIa6S~EZ0+R zS#&(HgE%;oHTPN2gAbMk7eE5}cJhvTJ&kC8wL$ya1%p2JYjMpSiD0nI&2C-KGmTA5it_kX&whZQ{iKLKT$)w zHVjVmtVmkJ4H5v{2;hU0q?X!^#&@f+pYB!DyXCbzF_`?JQtP&3UV#=xd$sXIbnXQv z-UUj6(G>0HD=cLJn?B24%jLoXu?kvouo!uRBjT*j8mAj)lN8oG-FQq5tUYu<_s7~L zMF(-FY(2X{e$C6=9y~cqJpX2dhM$eRL~a7dt!D!@L1#3ITOWTrPDQmq&7Mb!l@-AC_(lEi0qbWg1lhax_gTJ3F zgjw@tPHjTuJJj0+`K!f&jggD!K&&^z@V)}_LeZ&ZIRATWU1IsEZ}Isio~$!#Q@%tP z7@H=OR29bwz>@;1<-MXNON``(aq%ynD6tcZ7OGYhUC8l#V)ONqGejRHzd)9mQMQ=f?49Pml?+VUEcudvdu>XKG~!*%u>wB!_xZmVaG(|5 z@X^%)SP?{Cx@XY8@x}vE^C2WwfDM0vwN^U&YsdJYAHa7}D6zSv!s^1}Ug3zDANLAQ 
zoGDJR60W?xz!Dt-T-DC~{4zoKQV)HKMp^hbXIY#F6yYB7yBjrlB(fF{_FqGALCrNMc$ik;$jw=5)OO{I-wST#zBWnH^Dof(p-^lDjbyT9>6+QZZ z!-)QasIhs-XO@(erO1d+-%qT$2^GNi??B+2i#reEjk2ew8VRx^KZ7dDXi;EM{WvF(IzcW2B<%rbPc!0IT_dwP2)Z@qc_?d1NbM^ViSnNo> z+-C2*=P@Hl%cc3kA-M+U^aZss#G!#GgpBOWJ>J*|khuU|Uoy(uw=0ZP_%Nkll|oTN zlVBl8s9VH3Di>7Gx~CkLlk3_C1F31Wyudt|S=Jut2K^msTUWk$Tu(`4@I&h~k>IKT zgl1dwX}LkD&KL^%#vtG*?3~R9p&axd)Lc3o3m}<#eI}bM0@*j5|1)Toz(;WBAoz&q zN}r|SU{xqBpvQRoZEoxCA*?#?r!nvPlfmVs3kS8v(1;2QIVzijgGz;#gz7kA(5fa{ zXQz3QtO$!ys|k@Sj*-JO_J5lNM^PenHJ`g4Mc{Hf6FYWI%LI!+1*1PPW9=#+0820= zPF=kVXGktf9r(r(I7bkLD&oUmIF_XNI5!2|*ti@;5Hm3xht~elbo?JB+B4l-)y8Y3 zl9aM~Ld}1Dze8c>!RrVOzma%}H$>20AE6RQ4GgH*ZipX*Ak5^YN#i#lG9>cRdA-b&3xsp6=pZFs?g z++*CUt`tz40|MOB;5%p}*+2?4HRCT8eQJgw6`zyNBFNk-M4@7<#kW{WG)Wcenx?!( zDXFlEBw-6SHfjY++?(5TIgYDNPLEd-Cs*;y^MLgMtd{zE6NcE}u6@%*k%`7ofET*6 zKWc2^f3mX#n6VzSaUC?uyqhz8c(Kv?+DpzcGq-I8^Id%$*bc7l4UOnLgT6+>;~s5> ztvOBU)a$o)jFgS;0;w=;B79(NF&)rQ%__E*5t9pN`yMS6`el_ud6!Un!1bR#=UcNm_j!?q84O=e*RM zB#4NLtN*34Lg|43dL0^rj?_TwgFbtHDm%haD1q>TEG~+DhvLJ9KOuC#DWK}kAPtv{ z>;GM{2WoIZj!4Gk6Bb;*H{D8s2VVgC!d=A2aaA8RxDahlC`B=%;9rtFaw zV3onY)H-@20`;tyHcN+o;Tt`z2peKVT3+k20#uBM2F}}q`!zxN*r-!T&r*=#uK{f( zNdC<#hGgOJ`9&NTC*hoVEVp?q4<}(`t(c)ru!>1=|0rx}B5c?wY?#z3k)dKMo5hz(^V7)ju<6~n8%JX&l))yjL+*=Q34UM4Xla9jMRMD} zyd1{GTXU(A6BmPIqQc>F_h32rjm2qB7BmKWOo2q$z3IPH9Lh798>Lnbrd?@1jD^5T)gw|u+@ex>ZlJ1Aif78Nq7&te+?#brcZ#n z(eaJnoI5TGIKcJT%0A;TW9d(T#kS)Yy+c6?!2frF^P8bl^dXDy9~2$&*pbxRy}*hn~u}rFo>AZFm3_s#irjQYOC#pk`?&bG4^SQz*-p zLbQmh2>%l1(IaoDRMGAnQ}*mqqaBSTg_24k2~eFzo*hh@xv|rw7jD!A?0&V2%SMBi zMqv-Nxa~M!@eS)}YpSm<7FSe70!iuL_ge<1NxbooW)iuvmuzL z22$H-00*o|Dq}=X)*qkhi%pIpVvzUBNDn0vAP{k7FRH7vnxt}a7Tr0Dm|c>0UZvll z0RxTY;?nqBH-YI}aZ^)CG41e_Ps9bh`vcsI3?O zp%bqSpVrS-n)a>-TD^W<<1u9t=m%@{dkJK4%SU?ZE`tB(1t7@TKdYW0CncO)-bGUg zGAf{`dr24EXu)XLrEKfY3lhT}=lid#LtsjRcSIJ)BrkQL}@Fe&HHz1q`|lb7X}|Pjo}G$5FjS%PiLX3SGpy_ z!axE4az^Q_!R;mR!*a}E5*8YJ<<%}P44^`#yUIX(#+&YK&=$}Q=$cIvt|-YNWsLtW zU7uBPGphbF8XgzTP7UU#K@~AYf32W}*U-Scy{#Idm%6x6goMn2g+|LqB-*OWMHbZS@_&LOAjQ3J|9~d z%6k?o_*KNx?*z@#I*}u&HNAh5YBO*!5Sy6Ju7v8Q&bm+UP4qKC>b+CX)M2)&7dSe! 
z0X>2jBk}XNoRSl{BJ)|C-^58!62n`!krz)hVes)rGT!pj`F&c6p^~;B3o#59Ii@{L z$2!B*x{cW#Dy}ip=h#u>zXMzh)T)K8*?p^*v&%vBMbx0&S{)`SXUS-ILM%HqjGqrr zM-%(C!TH|80uv6l3YRi5{NRuOw-Qh=ATBzN+n}*+t**y8%D?*ZLaW5ArkM|xsK6~j zUu<{XCO#{C{_+LOyXkBnP!WyCSbAl*)HWon2g8-F_=50kY4Q@J`dcrC?$$J<`@xxD zJk#E=0GdVb$Y_|S=hoPHR{h#Ix(#AT2JXL--tfcYzO+;_LCSh4`ukJ)h9ew`*ic2l z>3;xiC@-1)RjYpminx!v4aD*KyxRnCl zP`l2irON^?RWnDt56QGi`|C9niJ{v`+$69yi5$ zJA%VB>{!W~(>bX0VM*%{gg)+e0j0!nGhZn~9VL1JK9mu`FVp+qi%DTZxfOkkH+Yz@ zzP?n3)aAxtd-U6jXr+s|XeRBHeo*WEyo-iTBi(U0~bQPXzjv78b{| zN@7QJe0aH4RiB)tm2{&ygepQp$(KNkAad^?8ZlWf{KSxS#+oU*4)%-)?)vr#pxvx| z1ZS+#pWn(v*)zwDZq~nvKH0bj|6%9P$~ejzQqhfmi(uH@c1HQRZU~(U%hCm!s8fG% zMwvm$-LqRT@DD|Reo33~)o#^v8+ZTYa+feN_^DiG5pZ;*)^>7WF?AbSB}2%(w5Zg1 zh#}SahG?XVUpB1(xP48YW39j zG2_>^2eX^y0XlMt%$|N7X(T7+;()>@QRIcbhmavSa#XS{FOeCu?EL$Ofw#Sw8d!uM z*^zH3O&##`eEO48_LuH++39@%2-9-a1_5Rqg|UV4ORn@b&yA?^*TFASJP6=icx?Gl zFIE?OEYSnXnF&yy70Le-`h(qu2=tfm)p1z~OBuud>1I9F+xdzFO#SuV67}>e>7zX9 z@h>OnTA<4E`m|KJ)Z~!L7F3I1P`kIoUe@A~=IZx5&tKck5`^v<%Ou2CoYLRZrIZX% zknDUh0Rx06anIUddSn^=d4lUWY}PuMx!^aFh{=V!RY ztW@g91W{fe@5}k*^`RKsgz54de}5<3CfJUhKtPd`&kJmTprZvd53_3xixWlZk5UkD z9&pqHRIvnkv=8mK#^+IRWf!*Bc_3(|$I1*`b{NVtW~RC~wU}!J?|B)Z#+(<7`Pe-5w33e6yn#M*zR=X76-KPy;w?)-oHzl2pF!M#VP;v+DML>_p)$znQ$l5X;HD=l3Q4`nI|Xv zmUFst_otp6Ms<4PqCx{cAIAvT7LI3X)EsuXkH>`!&HUaqfFKUsXqq{^uq)zexCRU|AapYP?z;x zZ|U+Rd>BZu*NSr*L@moQb>`npKb%}#J{(XQHP0R3V1*axpiT5+O(cJk)E*PX{aHmJ z|Kd&s8jl*Rj*8{O%9{CRmt+(d<-ftO@OE7%?f~szpH|(kr4IJ#m+s(BG*8u6rf$+V zfepGUJZyUg8INDrN5sZ~`@I0jgaBu5Ap5PO^i8hG8xu=4RLzd;@YNUGPs5l|}8Bl-{786kl zU<_@CwToQR7k9=8d;y{xQ+Bady51VK+Aj*C!snsSVL|*DPo4`4YiS0fg&$#XFEP%I zx4`43^m%3n5K(ATmeFln^Su1pY-WhUF5h|+RB(R^HL|)FWBjcRo?|3&rFcqyhD+K} zfp3BM0Wfx3tR{=9uhfVK_=a|mx{n9v`3+TQrJ_}OADD~+ zk{$>_Nci}RyYX@dv=2-07}1Dap_uDo;$@;OW{%)9P!`(ZF1|2b+aW)$0#33;c8Ct~Eu(s(8laD7(DP4Zo%7+2^7WVgQBHwiv200$j)4yGKqLbsCtJjg zRbFiCs#2lktZ05V1f5f{T4QE5Ra_-Q#(4q?Cs*`(+hl3-!2a>{-Z7w5i*Ck1=L2t9 zXp#A_p`L4#J$f*xZf@0P<0kRdOB}AM59Y-$+KH~*QotdIF&%W~)%E^SDoes*VDSbd zfm~)N;>*z5YJZoZ4Vpz2VoULsdHgW4r3en7>-|SaWaOpUvnL+W70c75$8Lf<; zC$vtLhDz@a?l2ryB{!FIB0bdm9<7r%VWLQN?O53>4wZg`^6{*vt^Spne!lYe?O-Rx zW2h)E^bCP>+D|K4F=X&)3_LRc^ZE;h^igCYud#a8V_SeG3vr?||6ZSh0%7#!R#W0? 
z{*WND0;F4?&n{`gw@*yRJMN3<#wuG(tg+FO^_sX_Jzp`mM_QCZ6aexZ{@S3Q=89-- zsMD9t-ZWPuDQZ_5J-p}MKU(IFOIgxZ_@ayg&*0@_{FJS+9h@IVlqu}|^W5%^c!0NS zziBW_GxyED*43coNCC1H$rx}ffnI>EU9?FsrZO6vEj_j9&?n9p78zERlnBTfnvh*O zw6z*DQ$iPZeaYfYT%LMBZxg(I^&~aWS*JC^AmPNY$HGUFvq*o{XMED|53E{ue-0!= z|Fv?R`;v23t$EY4f2L7r>wf0QxjIj83V;?J^0)|Ye0Sa9xV5@HUyvx~y`5(dTyJ*h8KIJV$)R|#K0ry`m zARSe!%(EO*0}uQLWY?K>b~ssQ=k(<4gM=l=p=7Aine^w&PN4~$E?+i}N=#ht26g92 z@h)(%OE8V2lT>q9;E>=>@@QYxBrYa`?%uC_vjxN4rr-47ezbo2{4Rvdn3$-8rQ~>{ zl&xin4U{=IZ(xGN-`!dvAo5^OT?>tw2|-fporXZeh1tVKKye|I2m683lL z>Rg7$3ZIxJQhzveQ@&qw#e%Y#sg7_r*n(T8xp=W2Hw)qpZ7PK~txO7kz@gNt1yB(V zKt=l$KAa)0dPVoo7sS|nER+7iBu3WZ)J}eEAF1Kpc)TO3idiQ1{wRoe50u@RAcTtm zLOCApdbM{AY7B|F$CTxJj^(iF^n2^6<;k*r)eK60|ES&&td0KLp=d_ZqEa-kXQD#s zBweOF`rPYB;Njxag<9aSdRvI)DQ$lYj2A6Sm;c7qsg}=>QIIs&3b7bLBG!K`*62p4 zk(R5BZEWEe{Uz%3>RpqE|t%bU4M@90|fP{M*6JN4G z>FuTP{F-BP~(otL63q+qIF%y8GE8zdILee&~XA2uoKO*^QGdx6*3Mt zq5upnC>xAvuU<%3cV&!XFk(=uWMO7mPd6KS>K>|%JK5+U=&ow^8~d(E6(kxFAJZDu zQYsBnb!a^EQ!PtpWbwWdb3#1C6Uht!F?ez?fCIUNLo6qkZq=GH5-L(&yMM5aC#wGQ zh>(-&Q`3M6US9en_OSyzPB7GAxi!F`Kw35*@FY<$Rt;#Y2A4@7;T}zjvDjGdz0qpM z)dMUb$`d|SdtlD82I93%~ zH^)Y2B0xM1stkf!g1!UZ%fxo0M08kFmXLMc`%;Vz$P2wm~!6G4{ z`S?(Jdn-KadNMRNUm>@a&>ZKUIn*7KI&n-Le7$}V#5{ObsG58S_rF~zr*iiN#>zpb znzwp=dV^k%WKI1|pk|---?%{VqhWXBN(imf>Z@l;LV;2BFA0T2sH~6eBr`(eRoUzb zD9Umg5^|CdozynY3}4LrmHz@q{VB64|Hb*gBjodhOsAo|(mE6yVNig>n(raz^HBKM zlEOl$dB5u8;YYi;`_Av|0AlT`#%foQG^hEZ|DtdVhsX8Av)!ukP%1z__jW0DIs*Xo z6{H@p?RC#M)X^GT#vN_+=#1~7Ef9F&vZg�QA+teoybca;wl0M+U#g zpjSX8VBogCfk_FE_!isJniKt>AfGuhdmv;kJfdy$lLrfq_zs^3v{fjQgeN5;8tgfR zO!rMyi+U)B(~~-lQa($a{)7X2_BaLic)afftccM@09c|7q@*@Iw)6qk_99$}078dd z2bC*3doZ>L0zkd&>z+xA!{r@-3*c@6876ZKzzaOtSq{vK=7Vv7%`H}*Qp3C2!UY%V zR##WKYQU;v1}Odi2$Y#or3L$Gdxh-ml!iB7tcibXbBI5U(KHKwd{l*C%%6@7!NwoZJH>LsT zFMR`Q16xDjcg@gx6^Rqiw3sgTyS}>(Lx14%Hc)+{}|iM z=ukPPj{RwT?{LxwScdDqZnvBOUlmXw?2QiHwfs9RI|Lb~@Dlb;Fw6 z9i`geYH?^av3IQ;NQG8xmUqwH0B0}jK(pd z1XJ_($UB_gE(3%F4qrrOFk zPEO|Ifq4JBTq17VG(OqHGks0;sOXSz+X2#ZofG5!VR&=9b|hCzN)@&`sGl~y>2`a& zg1blFHF_oo&<}Ft;6S^ym1IjZh2pjSpP<_~oto+{KY`jIdd8=2m^F<)K-8z+>iOhe ze~3Yoqh7*c>@vRDA-oDmkJIjLOqg-~(G9PU`tue7VeeKTfOu+}P@O6pTL*#C85q&l z?>KacntO$3c=90oSK?v{kex|?kULObA^$TgQ==EpXpU849rMTfUxERK{A_-DZ*Fb$ zuH2Ldk5jqqqDOV&#-Cy4sPU&@3zCJDfuOaQ8#wUO3M=z#58UNx_h5; zj(m!tc$7 zFa?By`e;ELh`DpNt0OV8xj-M%sqjdWN_xloL}M~BlcbUuAzmM}-#R@e6Gas-$ajKOi|P!5I5rZEnldu{cDg&v2zY z-f6AM#bQSn;}$Yv$rTxd&O0%0z|JIvl{@W*KPwzMXJaV#_e3brZPPPK<>k!Y-mpgkEWLwutWW~zi?+=qB**UB~jHRIQ8MrNb_PU!cJi>V3 zxZ|S%i<~5#JXs@gVnaBq5{=SG*?2D{=2?u>ueLqAs~BJlbh>u#c2v4=DUOkS!T0EQ zD=7R|TLWSw^lh+L_~vLV1`du99gTQ?;bW(fIZE7lSz9Nrt(DNpLFjTf9B9agp3Jpo z;J@B47t?aJ1QC#b3I5d@gWQt&@dM5GhOBVx-{8=Y&5khOcrb6ubY}1R^TQ#vLUuH_ zZ1TUZ`ZOwoESbwap%gb3=2BBsWBdT+uHIA^!?REoVAg8dJD?;~+jH~2Ppqm4XtnN% z4gI8*Aq5oKIkMuJ)c#K;i9`~B!2?_zlv#O zex@wHb36QQDz)YXFycq;?^kxRL|oN>Hgz9qGth)p+>#He`qNN2Yc*NZ!Y2cIAz#PQ zpdy~(PnAYbng99!={Tvmts!7C76pp)KPk%n8%+sTOPd0Rv( zSvM7h%FS&EV09Yhl>*{CtEH|e9ck~F^_!096b7rBP3OZEiTl%EgO z*@N$KIjMSyX5g)TN~xuQl&=%}uxB!E$NdK>(qCRavjP zg8fJ;Djv7UzwJN_w5Ks6!Wq#qB1qc&b*aqS67Zz}`zP=?<9`NBp62#B!zJdW1;y9B zm;_pl@Rl;zP@c=~ixsn2u7>m8W6h2*y5igC2X(|yC27V~h<3aRES%H9YkiCmNG^^J zw?*Ti@j!ciNcwZXUY6Y}oAqux%4f0gCw%X={@A!NOc}^O((zotJ8Jf4jsHmh;nMTL z&)0wlFFiQ5s(aeNZT(&V)4R6VrrZ3a9K54{L;MumZiIW5#)4|v? 
z^P7vx5f zC-!r6$UFXqki99ryqL3ljj-TaXJ~!Cau8_-CIMc(y7Kyws!?vty8zqczxlV-kSQx5T!cDpZ#OS zFve$4h-pmCr?gHq7u~Jm_v4RDwyDnD>3sU>HealcpHsoNaHPqk@k&X;mSWWGnFgaE zAZY17MTJyK4@!ZC`vL)Q?>9@&OJ?WZ=U&&fKdw5gI}}E^m%5TQs)v9XFn&HTZ&OIX z!v&9pbv(!NFX^>CO43X0TiKHH3xDlani^ zr>F}=Yg^j+T96)rIPjXOXA8*uZrPk~j}F>2H$B*a=U;R2|AWmJ|K@)E-k+6Vh4A3H znARSt_6wre7q#AU<;D<8P?Gf-@mrwUIl&32bWOEHV`o}H9x2N0K;3Zs0L7W7sNlKD zUG3d>1pG#*tfnmYk^#v^K)9%~TtaNjE?~-bbJD?kjT{$n*&JV3co_OwQ5o0PVlX_q z`#7&6x3u7jUtaZnJrZ%%B)r_ELUw%TD9Bm~wk%dk-!GfQNMr+RW+PZZ-qTo?uD6-~$=iIl%i6HGbMSTYj>v?)}TG=(`ue(z~sw~a5p?%E}E z?L}U<%xcUuGZt@|dxypcLE@!7LI-~0g+00^F@dAsw1RCy9gbU-$iMzc-2bK_d0&kN zU!!amTM$}W5-L9lOv-;5RaJ=MefI~zHV>b*q{WZ}rej9N&=d=b9?fan5Qd_jW;$K; z17N&))iELXm#+8r&4q30cpK~13N$z`-db@a+`wYSx(UwxSvas@C@+a9rxoyW}FF1V_fuj-U((=g) zzpYG%XCfA*UM`e~=WX?KMOq}bAAhx%#w0$ueSG^ zo?V{05bq%H;8K1o0fYrx#~B|VAHjiEOL>S$LkjBZSN~a{#wfaxW6?)#s6c}M7rpFf z4-}=8fuCI+o-wt1v{#&;-iMBtZD%^wd(zf>qAlD-(O`KN7DTL~;Ek36gk;bPQd+rX z1;MZ?puP)2~VLPZ$R>C1s1x|>^J!l z#1J$&?Od8&9bO(?iYtQer1e-G%RjPcn^-SUeqyR&u{*!XAooTD#f9Jr4K=Jh5Opm{%~pu#DH&GK2sh zcV|b01o#(Vf3rqglL+kxvLD(VjmhOr_ci$f^{<{r$x1Zf*_TLPr029BXym$S1Px(e zIGbp$3G{IIB(rS@9?ek7R%Q&EZc_%H`I;B#FPd3lqu_gq4&-I1)mAaT22)F97&S6U z^(Gk7h}@EBY#H^X?`MeWI<2wcqg0M58mZN%qVpS43qum<6@Ya9=lP zt{j&3+-~B^1$xc49#>QsmeW#mpNC0gypMPivfWWck6TX2**^z|B(FBw%AT}o9!%3k zL|Ay;48zUz(s#Q@fK^B~ZwR>h`F(jY*>p%S?cxKg=?Ur#@57?WMv^5(I?LPg%N6H# zNdK4%lM7@o)L{2O{{%Pk@6Tg6r(!#amToXiZ?7tgz4;4q$J}nZBIh1^DF7ci31UiO(eh$XpwlAk7RmV-1@zBw+aH{1 zS7fWNIO)7Z1!ZV6$Q*aZ^9seOIC*riO-oo=m$LYb__VQgod-wUyqb)gWAB{WQ+ypR z|BS9x{f!!XWgxjI9C62*_4)faYy3X0&2v$h?0x7_3F%j#EZ_MREb{8rh;8wwN)@p( z^xLX%-<)a|FME&j5XJ9P^(+jDLwP}C6#8VH1f!vDF9~x`-?U0^D#$t*NM^fjW5r?* zs=sEg?~>Q;`C%kIIX1fo+sdDE(E{0gsY1i>;F$&EgZ7?AH3c3-ad6~6rHq#*Qm;A6 zR&1G7(+!K*`<%VgN>9<|9xRHi?#5Iu9^On(SrTv#$e>&^m2AT+vdPZOetKFc>4qmH z6JNMC^nGN$DcM@m!zt^ApZ%ZKt}CjEuI(Zy)iR5~aar3fSn(vccLkO0yI z0@6hxii9pI0)jM=5~Z3zq(lM+LKUe30s$kTH>GGOUtV?n|J8T*or_t|Jaf)|o_%Ka zUTbb<>YOBhvfL^Unwz$hoK-*A6xq6t%ZT)^4;}qj?``+m8Yodzb5EDtE>W`Et={K7 z?LTEU#qe`)Ew7t#7R#e!&LVDI&pP?N-!LZQQlgl+d?3Vm8b)j}Ea+@m@l%?wE|cqU zQ6jl~mR$HKJ!i`JQQR1?JdnLLXUq(v0=a;38Yb_BXnnLUy~tl~0Ufj-@oSS~19wv8 z80e#&P9Qh)2!kr+Au!G2w69zwefy`w+5m~amC@u;b~^S_%qJ@qd(l&=Dz+j>oYwi! zmYY}&vRm-uvN6!DTgpGLLv=MZ?H7%H^gWr&^7)v%l+_dWE{T(27W8hq*pNY5q$7t{ zD(XL2!tMfg&{(3}mwmBl?0$H=DloQvw-OW9tWst4(PA*y8u1?Q1CH-fLbfud%sr;? 
zMIoz~dRxzAO)O;DZQR+Jlq;Is){F2x@d$mTif;+qWJlKr(Ig zgM8VdADFbO#^o_2ivhZ3Q#Px#&c#W=5TaZ7qWkSZB4aK}D`;10vSv~H%SLayc80l4 z7S>czqA~qpc*v5&!z%{z5xOjTv-)L0)Y>YiH{hS7>Vtf^C^!yyJ@~?f$k3VixJ3eC zW(F!zK_H|wcpg|)_V^u4X}xTfg(ubC2e@ z*nK*3T~4pSGziLmen#v*_efj584MqZ33UPsO&&Fdy) z*gn?QmYtCYEyAxP^OG;7xt{P1UCO>TVr-2N#P)bs;n2aa916y;R$kVnb&JOPQ5<*o zE@~d_85rBt52U7yz`j;i{hKt@tw{vPmwes(jG8gz%J)?z&{*8 z(Z^E~&yjRcj)N$^c8A+JwlIqvrov>+q9c$MZeK=ZlUJ;(!kF2lVE zAzzBue`#1?S!53!j*XC_JdRk$Wq9Sq%SjLyp9Ba<=N#tJj9J|w5d$tvORqUePC@rj zxnY%yqCK_R?X3c-EPl2_Q`~jB+`DhElcT8an6*rS>8-A5)vqOy=A1fu zk<`xD^?v-RrP|-G+;!N_XF32vUAQq=iA)|BARoIGyGT5*L zmxgh!qFRSXN8GX&IF0Td9;=t7Crvo0G5*Dyh_kV{$oKRoM0>UpTJg0|VgJ=|`0jkp z$|jpphPkhJz7)Eo)A=9zyP{9U5PdRVC&XI3V2M+ZYuF}hsv^hl{nzd$Tiyb^&^BLi z*iW9zCnXcN_}lTFQlj~ajN}T>-9FyE#rI+x-LP}0B@M(upx+7u2Aoh(l#Y)ELNztHD|8b)igG-3s z$*jceTBZFor_aXcoIGAA(rq{Bw!gOV(a$+;V!SiM@hXdV*T0w4--xc2h2k!}81}zj z&%=CsXzC{Be5Lx2Uvj0JS&cXv{2FU2+Fl`xI=&Y`2YchwUGW_zc+%}xB<9iNl}Lps z2y2S%5lW9BoIzNNDt6vzumaY~CkrJ%>-VfJt3$h&t<)MPiU~%%g836x3QT~4{Z=PT zlh14p;1-KY)40ZUdR)?E<3~dX(zHA7zBrI4iwvqPSyqE#3HJ(Gz+>LtA@q zK~zCBI#JU@X)335vzNGSs0|GYEZpsX7q*JOd+_gtRnwI1XW~)=wmOc+IRj&w&QITA zANk$1xd@6lZRX{M+ihXajVYhkPA`&O{0g?E&!SP!A7^)&_xkOYe zRcN+v^1kvLtvGR^RZcroCD`KTi-u=r#}2<5n40$ z@&^7g@+=hh3@oC1#8C;7fiYwyh=&D`RDrCa50-%}y5|QrhuehT>OV7R%Pe&Oh#+m< zdUW)z4i#g*aB0C*%9r-AzZx;Ss%to`4(A<_9V|xdkSv#emoT!It4j)LGN=%jq$-au zW-4uAJ7!h+^LCv0^67wa?y0TZAPa=`n=McKt&Ba>8_ZQKD5-&guKVxG#jxt_f!kd_Un!TF#NNGROtV_-Z5z!wgK0TL+R zAsZc~J_s&86r9!$*;ut4sj-68=)oj-)4(vA#ZGgdJ#X3+@VzqtL(zmu3kI7S^QAH1 zZn&U@a6S>P;K6GTfS;;>f=Z6Eb`|)6w~wNCW-T1mIfwlLWRGZB zh|-!2wW-;UY5=LZFou0r&UFW26UzMYRvyN3nI-Qq0PghR^q%4xn@tnAn`^Sk; zP{UF5<_m*rk^DpcY$>G+mNw;VQ-}TH3B~S~LsB*qlC?aCb%nu~uMKTrC?V907)D`* z`zu%xFpPt=WMAs-WK&XD?hV&Eu1HcxsWq2j79AdAbuOuVw=cOCUHM?SlBF(pq_DT? 
z^KZs7vrtg>AI9|GjJuhPy#Qpd=n&O4MEe$(>yP`|#;}k>jA*%+^p4v;=xj{ z1GrS!cadR6Em4ri$@Qh*EQX#mJF`}@8Z_gu`zm8tTi>@>KdwvM96mJOF(WG9*o)-w#zIg)jo;3-(pw7Ib zS0n<&1M09VY*#+eHlD5hqFv)e2a7%uOiP7b8(N43I^s_u8={%uU?zCqZ*ce*HBQ&O z{|6s4a=F~TwH4s3fWesa8amqz`CoSWsMaLDW5WkE3lJeI$5TcM6}{kQEOOStTSq6@ z4oeAgtg@$+Eo5RfqW#%)B~hmGXCaVlhgvp0KWsKyR`(vS5=$_244TuYFr$Ujsy)Xs zp#bvP`!3>iiyU@O^FtYl6dB0l;{>BM62=E1#K(vBxd$v;aAsBxEL)&k2RC3>AUoUp zY1p>e?pQ2u*5alH#9#d*nc2_ms7QY{2sm zyTzcf`$ivRM3BQGe+|n-PBcHuFy8PPm1{kiDN{ruK^ob89{G3iJPqy??J#AHmxdg$ zf)taOWvj&0+F(6_l-o5%`X-v5afzyJFmfP8Sm11z6x(N(o Date: Sat, 29 Feb 2020 18:07:26 +0100 Subject: [PATCH 248/269] TST remove futile arguments --- sklearn/linear_model/_glm/tests/test_glm.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 1d1d0c55ceeae..a0e9b3830703d 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -42,7 +42,7 @@ def test_sample_weights_validation(): X = [[1]] y = [1] weights = 0 - glm = GeneralizedLinearRegressor(fit_intercept=False) + glm = GeneralizedLinearRegressor() # Positive weights are accepted glm.fit(X, y, sample_weight=1) @@ -71,8 +71,7 @@ def test_glm_family_argument(name, instance): glm = GeneralizedLinearRegressor(family=name, alpha=0).fit(X, y) assert isinstance(glm._family_instance, instance.__class__) - glm = GeneralizedLinearRegressor(family='not a family', - fit_intercept=False) + glm = GeneralizedLinearRegressor(family='not a family') with pytest.raises(ValueError, match="family must be"): glm.fit(X, y) From 7498f3eb15b789c4f1266459c7b85d075af0c7c4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 29 Feb 2020 18:08:57 +0100 Subject: [PATCH 249/269] TST increase rtol --- sklearn/linear_model/_glm/tests/test_glm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index a0e9b3830703d..99fe5e8bc420e 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -197,12 +197,12 @@ def test_glm_sample_weight_consistentcy(fit_intercept, alpha, family): # sample_weight=np.ones(..) 
should be equivalent to sample_weight=None sample_weight = np.ones(y.shape) glm.fit(X, y, sample_weight=sample_weight) - assert_allclose(glm.coef_, coef, rtol=1e-6) + assert_allclose(glm.coef_, coef, rtol=1e-12) # sample_weight are normalized to 1 so, scaling them has no effect sample_weight = 2*np.ones(y.shape) glm.fit(X, y, sample_weight=sample_weight) - assert_allclose(glm.coef_, coef, rtol=1e-6) + assert_allclose(glm.coef_, coef, rtol=1e-12) # setting one element of sample_weight to 0 is equivalent to removing # the correspoding sample @@ -211,7 +211,7 @@ def test_glm_sample_weight_consistentcy(fit_intercept, alpha, family): glm.fit(X, y, sample_weight=sample_weight) coef1 = glm.coef_.copy() glm.fit(X[:-1], y[:-1]) - assert_allclose(glm.coef_, coef1, rtol=1e-6) + assert_allclose(glm.coef_, coef1, rtol=1e-12) # check that multiplying sample_weight by 2 is equivalent # to repeating correspoding samples twice From 90b1213673abff49672e3f0cfbdc987002b48c08 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 29 Feb 2020 18:22:08 +0100 Subject: [PATCH 250/269] TST add fit_intercept to test_glm_identity_regression --- sklearn/linear_model/_glm/tests/test_glm.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index 99fe5e8bc420e..f75f9f8a180ae 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -167,15 +167,21 @@ def test_glm_warm_start_argument(warm_start): glm.fit(X, y) -def test_glm_identity_regression(): +@pytest.mark.parametrize('fit_intercept', [False, True]) +def test_glm_identity_regression(fit_intercept): """Test GLM regression with identity link on a simple dataset.""" coef = [1., 2.] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.dot(X, coef) glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity', - fit_intercept=False) - glm.fit(X, y) - assert_allclose(glm.coef_, coef, rtol=1e-6) + fit_intercept=fit_intercept, tol=1e-12) + if fit_intercept: + glm.fit(X[:, 1:], y) + assert_allclose(glm.coef_, coef[1:], rtol=1e-10) + assert_allclose(glm.intercept_, coef[0], rtol=1e-10) + else: + glm.fit(X, y) + assert_allclose(glm.coef_, coef, rtol=1e-12) @pytest.mark.parametrize('fit_intercept', [False, True]) From 697bda293e6782cb68dce9bbddc0efe5d9066197 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 29 Feb 2020 18:47:42 +0100 Subject: [PATCH 251/269] TST ignore one specific ConvergenceWarning --- sklearn/linear_model/_glm/tests/test_glm.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index f75f9f8a180ae..b1583e2f3a242 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -5,6 +5,7 @@ import numpy as np from numpy.testing import assert_allclose import pytest +import warnings from sklearn.datasets import make_regression from sklearn.linear_model._glm import GeneralizedLinearRegressor @@ -272,7 +273,11 @@ def test_warm_start(fit_intercept): fit_intercept=fit_intercept, max_iter=1 ) - glm2.fit(X, y) + # As we intentionally set max_iter=1, L-BFGS-B will issue a + # ConvergenceWarning which we here simply ignore. 
+ with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=ConvergenceWarning) + glm2.fit(X, y) assert glm1.score(X, y) > glm2.score(X, y) glm2.set_params(max_iter=1000) glm2.fit(X, y) From 578408c855be085e45ac1458fca5f831c5b6c768 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 29 Feb 2020 18:53:38 +0100 Subject: [PATCH 252/269] TST add fit_intercept to test_glm_log_regression --- sklearn/linear_model/_glm/tests/test_glm.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py index b1583e2f3a242..ece8f09c76acd 100644 --- a/sklearn/linear_model/_glm/tests/test_glm.py +++ b/sklearn/linear_model/_glm/tests/test_glm.py @@ -237,21 +237,27 @@ def test_glm_sample_weight_consistentcy(fit_intercept, alpha, family): assert_allclose(glm1.coef_, glm2.coef_) +@pytest.mark.parametrize('fit_intercept', [True, False]) @pytest.mark.parametrize( 'family', [NormalDistribution(), PoissonDistribution(), GammaDistribution(), InverseGaussianDistribution(), TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)]) -def test_glm_log_regression(family): +def test_glm_log_regression(fit_intercept, family): """Test GLM regression with log link on a simple dataset.""" coef = [0.2, -0.1] X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T y = np.exp(np.dot(X, coef)) glm = GeneralizedLinearRegressor( - alpha=0, family=family, link='log', fit_intercept=False, - tol=1e-6) - res = glm.fit(X, y) - assert_allclose(res.coef_, coef, rtol=5e-6) + alpha=0, family=family, link='log', + fit_intercept=fit_intercept, tol=1e-7) + if fit_intercept: + res = glm.fit(X[:, 1:], y) + assert_allclose(res.coef_, coef[1:], rtol=1e-6) + assert_allclose(res.intercept_, coef[0], rtol=1e-6) + else: + res = glm.fit(X, y) + assert_allclose(res.coef_, coef, rtol=2e-6) @pytest.mark.parametrize('fit_intercept', [True, False]) From 266891012997c24e560119ae7d024477def5c60b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 29 Feb 2020 20:48:18 +0100 Subject: [PATCH 253/269] EXA comment on penalty strenght GLM vs Ridge --- .../plot_poisson_regression_non_normal_loss.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index d923afbc70891..d47c1579c70da 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -113,7 +113,7 @@ def load_mtpl2(n_samples=100000): # events occurring with a constant rate in a given time interval # (``Exposure``, in units of years). Here we model the frequency # ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution, -# and use ``Exposure`` as `sample_weight`. +# and use ``Exposure`` as ``sample_weight``. df["Frequency"] = df["ClaimNb"] / df["Exposure"] @@ -201,7 +201,10 @@ def score_estimator(estimator, df_test): ############################################################################## # -# Next we fit the Poisson regressor on the target variable, +# Next we fit the Poisson regressor on the target variable. We set the +# regularization strength ``alpha`` to 1 over number of samples in oder to +# mimic the Ridge regressor whose L2 penalty term scales differently with the +# number of samples. 
poisson = make_pipeline( linear_model_preprocessor, @@ -302,8 +305,8 @@ def score_estimator(estimator, df_test): # ``Ridge`` and ``RandomForestRegressor`` estimators. # # To ensure that estimators yield reasonable predictions for different -# policyholder types, we can bin test samples according to `y_pred` returned -# by each model. Then for each bin, we compare the mean predicted `y_pred`, +# policyholder types, we can bin test samples according to ``y_pred`` returned +# by each model. Then for each bin, we compare the mean predicted ``y_pred``, # with the mean observed target: From d1c3dc9405c0b19b778d5a4a56058dbfd994db4f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 29 Feb 2020 20:56:20 +0100 Subject: [PATCH 254/269] EXA fix nitpicks --- .../plot_poisson_regression_non_normal_loss.py | 16 +--------------- .../plot_tweedie_regression_insurance_claims.py | 11 +---------- 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index d47c1579c70da..78aa0a5f5090c 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -50,7 +50,7 @@ def load_mtpl2(n_samples=100000): Parameters ---------- n_samples: int or None, default=100000 - number of samples to select (for faster run time). If None, the full + Number of samples to select (for faster run time). If None, the full dataset with 678013 samples is returned. """ @@ -64,9 +64,7 @@ def load_mtpl2(n_samples=100000): return df.iloc[:n_samples] return df - ############################################################################## -# # Let's load the motor claim dataset. We ignore the severity data for this # study for the sake of simplicitly. # @@ -79,7 +77,6 @@ def load_mtpl2(n_samples=100000): df["Exposure"] = df["Exposure"].clip(upper=1) ############################################################################## -# # The remaining columns can be used to predict the frequency of claim events. # Those columns are very heterogeneous with a mix of categorical and numeric # variables with different scales, possibly very unevenly distributed. @@ -107,7 +104,6 @@ def load_mtpl2(n_samples=100000): ) ############################################################################## -# # The number of claims (``ClaimNb``) is a positive integer that can be modeled # as a Poisson distribution. It is then assumed to be the number of discrete # events occurring with a constant rate in a given time interval @@ -129,7 +125,6 @@ def load_mtpl2(n_samples=100000): df["Exposure"].sum())) ############################################################################## -# # It is worth noting that 92 % of policyholders have zero claims, and if we # were to convert this problem into a binary classification task, it would be # significantly imbalanced. @@ -179,7 +174,6 @@ def score_estimator(estimator, df_test): score_estimator(dummy, df_test) ############################################################################## -# # We start by modeling the target variable with the least squares linear # regression model, @@ -188,7 +182,6 @@ def score_estimator(estimator, df_test): ridge__sample_weight=df_train["Exposure"]) ############################################################################## -# # The Poisson deviance cannot be computed on non-positive values predicted by # the model. 
For models that do return a few non-positive predictions # (e.g. :class:`linear_model.Ridge`) we ignore the corresponding samples, @@ -200,7 +193,6 @@ def score_estimator(estimator, df_test): score_estimator(ridge, df_test) ############################################################################## -# # Next we fit the Poisson regressor on the target variable. We set the # regularization strength ``alpha`` to 1 over number of samples in oder to # mimic the Ridge regressor whose L2 penalty term scales differently with the @@ -217,7 +209,6 @@ def score_estimator(estimator, df_test): score_estimator(poisson, df_test) ############################################################################## -# # Finally, we will consider a non-linear model, namely a random forest. Random # forests do not require the categorical data to be one-hot encoded: instead, # we can encode each category label with an arbitrary integer using @@ -250,7 +241,6 @@ def score_estimator(estimator, df_test): ############################################################################## -# # Like the Ridge regression above, the random forest model minimizes the # conditional squared error, too. However, because of a higher predictive # power, it also results in a smaller Poisson deviance than the Poisson @@ -292,7 +282,6 @@ def score_estimator(estimator, df_test): plt.tight_layout() ############################################################################## -# # The experimental data presents a long tail distribution for ``y``. In all # models we predict a mean expected value, so we will have necessarily fewer # extreme values. Additionally, the normal distribution used in ``Ridge`` and @@ -374,13 +363,11 @@ def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None, title=model[-1].__class__.__name__, xlabel='Fraction of samples sorted by y_pred', ylabel='Mean Frequency (y_pred)' - ) axi.legend() plt.tight_layout() ############################################################################## -# # The ``Ridge`` regression model can predict very low expected frequencies # that do not match the data. It can therefore severly under-estimate the risk # for some policyholders. 
@@ -444,7 +431,6 @@ def _cumulated_claims(y_true, y_pred, exposure): ax.legend(loc="lower right") ############################################################################## -# # This plot reveals that the random forest model is slightly better at ranking # policyholders by risk profiles even if the absolute value of the predicted # expected frequencies are less well calibrated than for the linear Poisson diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 98d515a4f9418..4a301f5fb43d2 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -99,7 +99,7 @@ def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, column name of df with the values of weights or exposure observed : str a column name of df with the observed target - predicted : frame + predicted : DataFrame a dataframe, with the same index as df, with the predicted target fill_legend : bool, default=False whether to show fill_between legend @@ -191,7 +191,6 @@ def score_estimator( ############################################################################## -# # Loading datasets, basic feature extraction and target definitions # ----------------------------------------------------------------- # @@ -278,7 +277,6 @@ def score_estimator( print(scores) ############################################################################## -# # We can visually compare observed and predicted values, aggregated by the # drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance # bonus/malus (``BonusMalus``). @@ -335,7 +333,6 @@ def score_estimator( ############################################################################## -# # According to the observed data, the frequency of accidents is higher for # drivers younger than 30 years old, and is positively correlated with the # `BonusMalus` variable. Our model is able to mostly correctly model this @@ -376,7 +373,6 @@ def score_estimator( print(scores) ############################################################################## -# # Here, the scores for the test data call for caution as they are # significantly worse than for the training data indicating an overfit despite # the strong regularization. @@ -394,7 +390,6 @@ def score_estimator( ############################################################################## -# # We can visually compare observed and predicted values, aggregated for # the drivers age (``DrivAge``). @@ -425,7 +420,6 @@ def score_estimator( plt.tight_layout() ############################################################################## -# # Overall, the drivers age (``DrivAge``) has a weak impact on the claim # severity, both in observed and predicted data. # @@ -461,7 +455,6 @@ def score_estimator( ############################################################################## -# # Pure Premium Modeling Using a Single Compound Poisson Gamma Model # ----------------------------------------------------------------- # Instead of taking the product of two independently fit models for frequency @@ -492,7 +485,6 @@ def score_estimator( print(scores) ############################################################################## -# # In this example, both modeling approaches yield comparable performance # metrics. 
# @@ -523,7 +515,6 @@ def score_estimator( print(pd.DataFrame(res).set_index("subset").T) ############################################################################## -# # Finally, we can compare the two models using a plot of cumulated claims: for # each model, the policyholders are ranked from safest to riskiest and the # fraction of observed total cumulated claims is plotted on the y axis. This From a9686f6b7b6edb6ba89f4e0acac99f1a536c0457 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 29 Feb 2020 20:57:46 +0100 Subject: [PATCH 255/269] EXA remove empty line --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 4a301f5fb43d2..75f6b1f3fb7a1 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -64,7 +64,6 @@ def load_mtpl2(n_samples=100000): number of samples to select (for faster run time). Full dataset has 678013 samples. """ - # freMTPL2freq dataset from https://www.openml.org/d/41214 df_freq = fetch_openml(data_id=41214, as_frame=True)['data'] df_freq['IDpol'] = df_freq['IDpol'].astype(np.int) From 04e7aca7ef67f75f15c8221e265fe3b4a66cb542 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sat, 29 Feb 2020 21:11:36 +0100 Subject: [PATCH 256/269] EXA add blank line after function definition E305 --- examples/linear_model/plot_poisson_regression_non_normal_loss.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 78aa0a5f5090c..558269fe2d638 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -64,6 +64,7 @@ def load_mtpl2(n_samples=100000): return df.iloc[:n_samples] return df + ############################################################################## # Let's load the motor claim dataset. We ignore the severity data for this # study for the sake of simplicitly. From 21a739c6bd5ea072302020a72546214bd26f3db3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 29 Feb 2020 16:53:29 -0500 Subject: [PATCH 257/269] Gamma -> Poisson --- doc/modules/linear_model.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst index 28b220110fc49..fc5f254035a53 100644 --- a/doc/modules/linear_model.rst +++ b/doc/modules/linear_model.rst @@ -970,7 +970,7 @@ in the following figure, :align: center :scale: 100% - PDF of a random variable Y following Gamma, Tweedie (power=1.5) and Gamma + PDF of a random variable Y following Poisson, Tweedie (power=1.5) and Gamma distributions with different mean values (:math:`\mu`). 
Observe the point mass at :math:`Y=0` for the Poisson distribution and the Tweedie (power=1.5) distribution, but not for the Gamma distribution which has a strictly From 79ada1e8072f4218ab9c78c52492485f5bf2e8ce Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 29 Feb 2020 17:38:23 -0500 Subject: [PATCH 258/269] Used X @ coef as suggested by Roman --- sklearn/linear_model/_glm/glm.py | 34 ++++++++++++++++---------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index a8df8931961db..90c87726a369d 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -74,7 +74,7 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): fit_intercept : bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be - added to the linear predictor (X*coef+intercept). + added to the linear predictor (X @ coef + intercept). family : {'normal', 'poisson', 'gamma', 'inverse-gaussian'} \ or an ExponentialDispersionModel instance, default='normal' @@ -84,8 +84,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): link : {'auto', 'identity', 'log'} or an instance of class BaseLink, \ default='auto' The link function of the GLM, i.e. mapping from linear predictor - `Xw` to prediction `y_pred`. Option 'auto' sets the link depending - on the chosen family as follows: + `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets + the link depending on the chosen family as follows: - 'identity' for Normal distribution - 'log' for Poisson, Gamma and Inverse Gaussian distributions @@ -115,8 +115,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Attributes ---------- coef_ : array of shape (n_features,) - Estimated coefficients for the linear predictor (X*coef_+intercept_) in - the GLM. + Estimated coefficients for the linear predictor (X @ coef_ + + intercept_) in the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. @@ -296,7 +296,7 @@ def func(coef, X, y, weights, alpha, family, link): return self def _linear_predictor(self, X): - """Compute the linear_predictor = X*coef_ + intercept_. + """Compute the linear_predictor = X @ coef_ + intercept_. Parameters ---------- @@ -401,7 +401,7 @@ class PoissonRegressor(GeneralizedLinearRegressor): fit_intercept : bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be - added to the linear predictor (X*coef+intercept). + added to the linear predictor (X @ coef + intercept). max_iter : int, default=100 The maximal number of iterations for the solver. @@ -422,8 +422,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): Attributes ---------- coef_ : array of shape (n_features,) - Estimated coefficients for the linear predictor (X*coef_+intercept_) in - the GLM. + Estimated coefficients for the linear predictor (X @ coef_ + + intercept_) in the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. @@ -467,7 +467,7 @@ class GammaRegressor(GeneralizedLinearRegressor): fit_intercept : bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be - added to the linear predictor (X*coef+intercept). + added to the linear predictor (X @ coef + intercept). max_iter : int, default=100 The maximal number of iterations for the solver. 
@@ -488,8 +488,8 @@ class GammaRegressor(GeneralizedLinearRegressor): Attributes ---------- coef_ : array of shape (n_features,) - Estimated coefficients for the linear predictor (X*coef_+intercept_) in - the GLM. + Estimated coefficients for the linear predictor (X * coef_ + + intercept_) in the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. @@ -556,15 +556,15 @@ class TweedieRegressor(GeneralizedLinearRegressor): link : {'auto', 'identity', 'log'}, default='auto' The link function of the GLM, i.e. mapping from linear predictor - `Xw` to prediction `y_pred`. Option 'auto' sets the link depending - on the chosen family as follows: + `X @ coeff + intercept` to prediction `y_pred`. Option 'auto' sets + the link depending on the chosen family as follows: - 'identity' for Normal distribution - 'log' for Poisson, Gamma and Inverse Gaussian distributions fit_intercept : bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be - added to the linear predictor (X*coef+intercept). + added to the linear predictor (X @ coef + intercept). max_iter : int, default=100 The maximal number of iterations for the solver. @@ -585,8 +585,8 @@ class TweedieRegressor(GeneralizedLinearRegressor): Attributes ---------- coef_ : array of shape (n_features,) - Estimated coefficients for the linear predictor (X*coef_+intercept_) - in the GLM. + Estimated coefficients for the linear predictor (X @ coef_ + + intercept_) in the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. From 39eeb44d20afdb8b0c89cd87a537c56383879fe0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 29 Feb 2020 17:40:41 -0500 Subject: [PATCH 259/269] minimal addition to clearly separate links in example --- .../linear_model/plot_poisson_regression_non_normal_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index 558269fe2d638..c447204a02eab 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -5,7 +5,7 @@ This example illustrates the use of log-linear Poisson regression on the `French Motor Third-Party Liability Claims dataset -`_ [1]_ and compares +`_ from [1]_ and compares it with models learned with least squared error. The goal is to predict the expected number of insurance claims (or frequency) following car accidents for a policyholder given historical data over a population of policyholders. 
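A note on the 'log' link referenced in the docstrings just updated: it means that predictions are the exponentiated linear predictor. The minimal sketch below reuses the toy data of ``test_glm_log_regression`` from the tests earlier in this series; the ``sklearn.linear_model._glm`` import path mirrors the in-tree layout used by these patches, and the data values are only illustrative.

import numpy as np
from sklearn.linear_model._glm import GeneralizedLinearRegressor

# Toy data as in test_glm_log_regression: y is exactly log-linear in X.
coef = [0.2, -0.1]
X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T
y = np.exp(np.dot(X, coef))

glm = GeneralizedLinearRegressor(alpha=0, family='poisson', link='log',
                                 fit_intercept=False, tol=1e-7)
glm.fit(X, y)

# With the 'log' link, predict(X) is exp(X @ coef_ + intercept_);
# here intercept_ is 0.0 because fit_intercept=False.
lin_pred = X @ glm.coef_ + glm.intercept_
assert np.allclose(glm.predict(X), np.exp(lin_pred))

For the 'identity' link the same relationship holds without the exponential, which is what ``test_glm_identity_regression`` checks.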
From e3cf69dd221e686570ff1d55fb5b576b52f4acd3 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 29 Feb 2020 18:05:53 -0500 Subject: [PATCH 260/269] Added a few definitions from the insurance jargon --- .../plot_poisson_regression_non_normal_loss.py | 16 ++++++++++++---- .../plot_tweedie_regression_insurance_claims.py | 17 ++++++++++++----- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py index c447204a02eab..ee863dd4198ba 100644 --- a/examples/linear_model/plot_poisson_regression_non_normal_loss.py +++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py @@ -6,10 +6,18 @@ This example illustrates the use of log-linear Poisson regression on the `French Motor Third-Party Liability Claims dataset `_ from [1]_ and compares -it with models learned with least squared error. The goal is to predict the -expected number of insurance claims (or frequency) following car accidents for -a policyholder given historical data over a population of policyholders. -Available features include driver age, vehicle age, vehicle power, etc. +it with models learned with least squared error. In this dataset, each sample +corresponds to an insurance policy, i.e. a contract within an insurance +company and an individual (policiholder). Available features include driver +age, vehicle age, vehicle power, etc. + +A few definitions: a *claim* is the request made by a policyholder to the +insurer to compensate for a loss covered by the insurance. The *exposure* is +the duration of the insurance coverage of a given policy, in years. + +Our goal is to predict the expected number of insurance claims (or frequency) +following car accidents for a policyholder given the historical data over a +population of policyholders. .. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor Third-Party Liability Claims (November 8, 2018). diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 75f6b1f3fb7a1..6cf7e8c6ae558 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -7,11 +7,18 @@ the `French Motor Third-Party Liability Claims dataset `_, and is inspired by an R tutorial [1]_. -Insurance claims data consist of the number of claims and the total claim -amount, together with policyholder features such as driver age, vehicle age, -vehicle power, etc. Often, the final goal is to predict the expected value, -i.e. the mean, of the total claim amount per exposure unit also referred to as -the pure premium. +In this dataset, each sample corresponds to an insurance policy, i.e. a +contract within an insurance company and an individual (policyholder). +Available features include driver age, vehicle age, vehicle power, etc. + +A few definitions: a *claim* is the request made by a policyholder to the +insurer to compensate for a loss covered by the insurance. The *claim amount* +is the amount of money that the insurer must pay. The *exposure* is the +duration of the insurance coverage of a given policy, in years. + +Here our goal goal is to predict the expected +value, i.e. the mean, of the total claim amount per exposure unit also +referred to as the pure premium. 
There are several possibilities to do that, two of which are: From 56aa0d78cf7df2cf3e74ad626dbd0cda68b116bd Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 29 Feb 2020 18:30:02 -0500 Subject: [PATCH 261/269] minor comment --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 6cf7e8c6ae558..61faf7c2225fb 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -354,7 +354,8 @@ def score_estimator( # # - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support # on :math:`(0, \infty)`, not :math:`[0, \infty)`. -# - We use ``ClaimNb`` as `sample_weight`. +# - We use ``ClaimNb`` as `sample_weight` to account for policies that contain +# more than one claim. mask_train = df_train["ClaimAmount"] > 0 mask_test = df_test["ClaimAmount"] > 0 From 27a344c04fefed46e034fe21dc3016475ad4e204 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 29 Feb 2020 19:07:23 -0500 Subject: [PATCH 262/269] maybe fixed doc --- sklearn/linear_model/_glm/glm.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 90c87726a369d..c927114544d80 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -115,8 +115,8 @@ class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): Attributes ---------- coef_ : array of shape (n_features,) - Estimated coefficients for the linear predictor (X @ coef_ + - intercept_) in the GLM. + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. @@ -296,7 +296,7 @@ def func(coef, X, y, weights, alpha, family, link): return self def _linear_predictor(self, X): - """Compute the linear_predictor = X @ coef_ + intercept_. + """Compute the linear_predictor = `X @ coef_ + intercept_`. Parameters ---------- @@ -422,8 +422,8 @@ class PoissonRegressor(GeneralizedLinearRegressor): Attributes ---------- coef_ : array of shape (n_features,) - Estimated coefficients for the linear predictor (X @ coef_ + - intercept_) in the GLM. + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. @@ -585,8 +585,8 @@ class TweedieRegressor(GeneralizedLinearRegressor): Attributes ---------- coef_ : array of shape (n_features,) - Estimated coefficients for the linear predictor (X @ coef_ + - intercept_) in the GLM. + Estimated coefficients for the linear predictor (`X @ coef_ + + intercept_`) in the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. 
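The docstrings touched above consistently describe the linear predictor as ``X @ coef_ + intercept_``; ``predict`` then passes it through the inverse link function. A rough sketch of that mapping for an already fitted estimator ``glm`` follows; the ``_link_instance`` attribute name is an assumption here, chosen by analogy with the ``_family_instance`` attribute exercised in the tests.

import numpy as np

def predict_by_hand(glm, X):
    """Recompute GLM predictions from the documented quantities."""
    # The linear predictor exactly as documented: X @ coef_ + intercept_.
    lin_pred = np.asarray(X) @ glm.coef_ + glm.intercept_
    # The inverse link maps the linear predictor to y_pred, e.g. np.exp
    # for the 'log' link and the identity for the 'identity' link.
    return glm._link_instance.inverse(lin_pred)

With the 'identity' link this reduces to ordinary linear-regression-style predictions; with the 'log' link it applies ``np.exp``.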
From e817b2c82ff0e8e1484a59f339a7e148f76ecc04 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 29 Feb 2020 19:21:48 -0500 Subject: [PATCH 263/269] forgot these --- sklearn/linear_model/_glm/glm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index c927114544d80..46773fd4f90c1 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -488,8 +488,8 @@ class GammaRegressor(GeneralizedLinearRegressor): Attributes ---------- coef_ : array of shape (n_features,) - Estimated coefficients for the linear predictor (X * coef_ + - intercept_) in the GLM. + Estimated coefficients for the linear predictor (`X * coef_ + + intercept_`) in the GLM. intercept_ : float Intercept (a.k.a. bias) added to linear predictor. From 0fdc518a186ef0b483ad28ad8fe428b4122d8cc4 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Sun, 1 Mar 2020 10:39:01 +0100 Subject: [PATCH 264/269] Update comment about read-only family attribute --- sklearn/linear_model/_glm/glm.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 46773fd4f90c1..4a44e4a1baa58 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -440,10 +440,7 @@ def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, @property def family(self): - # We use a property with a setter, since the GLM solver relies - # on self.family attribute, but we can't set it in __init__ according - # to scikit-learn API constraints. This attribute is made read-only - # to disallow changing distribution to other than Poisson. + # Make this attribute read-only to avoid mis-uses e.g. in GridSearch. return "poisson" @family.setter @@ -506,10 +503,7 @@ def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100, @property def family(self): - # We use a property with a setter, since the GLM solver relies - # on self.family attribute, but we can't set it in __init__ according - # to scikit-learn API constraints. This attribute is made read-only - # to disallow changing distribution to other than Gamma. + # Make this attribute read-only to avoid mis-uses e.g. in GridSearch. 
        return "gamma"

    @family.setter

From 6d4ecb29a84986540da4748cbdd724b419c30c5b Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Sun, 1 Mar 2020 10:46:48 +0100
Subject: [PATCH 265/269] Update figures to illustrate that they are not defined for Y<0

---
 .../poisson_gamma_tweedie_distributions.png | Bin 38430 -> 63830 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/doc/modules/glm_data/poisson_gamma_tweedie_distributions.png b/doc/modules/glm_data/poisson_gamma_tweedie_distributions.png
index cfc8fef2ae40c2e422e939ff067b53b94dd08f8f..3b95b724a662389f0547e06049e65c729f1968e8 100644
GIT binary patch
literal 63830
[binary PNG image data omitted]
zGDjuHd346<&9@AQ8A8+&pY);x-QnON6jyxydY`E$`qY|e?~W%P?sZ4nJbVJ~nA!Z6 zp1K6j!N<8f{5U_Ay(0C~B`8w$8wia&4UQ-gcmpB;_7q|P>+(t-(5K7y*aMoZh()8{ z@R`V}d2XLc%QdZu^8k-mQzaIIL@`WfySxr8E3H0R(j!B|+Hl_+%qim<3q^YeODW?c zwX%F}OCwXGWB1D42BrW@$K%smxaLxZyYNy?kRRDN!fO+rA`n0iA`{y#A&3;z_Ymn_*bzogtfCY~D(Djf!)%ZVVWGxQj-+4W+3VbAa$ z#L$-6bkh|3mNIQocs&&KW;VA619BO-ORyinbHJUKsg@YEWN2QW)B3T*7pz`BW@KUy zxe_Xr-$?f@+ARD><6-i!mG0{d&Y~tg6D~<^%6y_VVXCADU0I)KAjKH90{_`S3&Wm1 z58b$(`Q!aK|M9kfKICC$sdrd#Nk-G24h|MzlN5NY-I%rUzi&Jon~Rg>>xiwFBn@ps zPA@XEAyoPPyHn@zl9xe7>?CZV&lKD31>3{Qk7L43*%t}HV!~qXZ^cg3phZfdwznY2 zpAX0QT6S#r&ArMsd0EUw=MhQ6e2`Hh=pN`-?+{tHW9V?H(S>IX#*iyXZmxt-g?0h)@=p-{bn$%~I9bL^h5iJL=5<76QMA z5J4KRl`icd%hIsbkcMMZv=z4%9i1j5rxCfdq6I|!$>L96n!R+0pWb>h(?)n>v^IM! z!JD9jev{xsA(K!gHtRaME{yUYhn`qpYj@!Y1#Q(pRsu4j(7J93=P&P@SVJ}WybZU- zjMl32;vP|nS8#@?aitM9yukZS zE6*d2yp<;uN<`2K{pNXV0xOS2^>FFhH-vu(o=Law2?=41SI_LD95pfq56_nYmq)6C z6=g2bqX9=X8IGi$B@UZ}bhB6sy|FEo{5qUvs5~>ZX3kOTsi@DHcxtMC14Zxi#$6sR ztnMfP2cR|+;MYnIAs0`K{~pV!Qw)}%QusJ3n0_|RF3OhF#P2N?T-?_8evtwtmf~5( ziz0gIC7}M`$OY1RYPgN)M8+rNB(`lQsD#lDRL$9sO_2w%y1hDGC!Ldn^=iWU6*>5wtC!3h!Hr>7|S16X{16>UIGiF z<~S4p@S}2?+F`G}#0*!q{Nv+lYSWG5k^W>1=~B`d7CMh8CLXzcCpl`!Hsym`al6YY zK!UC(oJBjK@zBU(~k(zlC) z9%;Phz-}|{3sFKbgEWar+rYm|`n_~!8TzC8>1mhrRqi9}KO*F#HB$$!1hq#a0&F(X zL|*KU*%2uY!P10b=qt-+iHWX zPXw-p^F*KT=W0!*x8M1IS$Z0+-3i{g(Q7U8gCdR`{TRz>8m0;wMcAh8uf0s5eqSq(aRMF6fzf96Ngs z2{dzDu|x3_C7GC=fr*ta`6!@m1#6>F&C)=mt$;J@TW>AXrClTQ`la1$M?H#%)dyLz z(hY?AAjCtc1GTCw*@bAGLKHu!(5e*1&4VULWk-zSK5pKWd!Dkv(`@K=q>Gwb_@R`t zFLgt=if6^g7AI-~6MSM^4CA+8Rsv$&r$}Mq)UxjR2 zUmsBiHL)CV5DD2F+wutRDn|Q7Z2C?{?2BZ%DN_g%n&a?W){2rPjF9__T?4(Fgu6AkN|$br!;LC`+w<& zwQ0mNXg!Oiwv0_^79V)H0TC-l0|R!fB|dLjNvS4~KFlR+LhM#ikw*An`yRmQj>aX@ z4!3>pCVEvwbT9anaTi#OQtHbef;`z@=orEqqBTq6IbGWNQ3h?u8)*KGXCbV`A96z- z{NILcJqZn*jn$#P>}2UQ+mWp7q_cUlGkQG%|FXGr*qo>~b0rbRGpP}tN~M{WSZpuJ z2d=+0(q&r_I<^XyCB~q4)yi#PtgB1-;TZ?wZ8PPr@v&&2OJGZ^K5NpbnUOCCopATR4`-F}oo0^;$+16dq zCx`|vvYs!3ixAEb*S7i@FCKGu&Z=%JZf8#K`+vL1e%{nM z?9Gs@3S)mgTp>sc__yE`l$1bgb0NfyBXu&}@n{%+k*APao{_&Vr6|g7Z%l$dU#X@a zK+f{Yi@?jj2QrHqO(=TEMZt4#6Q_(IibvkteX42{MkU@;MT^I;myW52jV&n4iyXTg z-jg?+o(`v20=$<5;&Q}m%QZKcMCgOV0XEy6HhAfpPq`7=dnZk-4F~^EfBNx{V}CS_ zUgr}%F`eKDZ%V=>dvj>>IiJIzBkgbc%;Nm;%4C)S%fZ5$nRIP8sjPDRkGI#m0eUtl ztsw%PqYA>VeGr{3T?jXb)ScA+5+@Qeu(6XmbuE2Pph*p_)~c&E^gi;DJ4KMqf3z;I z>YF^rllzGgz@28QR!AZiYKg6O^~caN z@cAZLC8b+!OYf)QFr!D^qH5!I7AXD{cq`U-5ixZ?>Jk;$7wBf z-HKE~i_^cc4a<%M7NqXyJ@V4ZPVD(HnBni=sNZbA-}!y;P0YsZq_DQ2!ZHn)Zs^PLN4svRW^hE5Cx5gy8P(?)KrK zj(Z#Ioc4acGtHOJi2P?|b8#10Zf8h1`fT^T)GwXB#Sxi|*fS&O0avkT8gmz;51Nup zUu-Dc(We6UouRua=AY8gKX)(Nz26uU8jfT3F$MT9X){P4<6@FAA>8k8Hse3lG4Y$Y z7l|7+tgpD3+RYxo-n3kA^hROf;J`3kcXYkZC~zf=zTaZsdbdecF5t+Y;>Z!J<80GL zC#z(}-Vq)enHOgJW^H*b&^36~^18eCFre-qQB_BHaPV`6n2OFX0pX%|VdqybnF>Wz zbqnwgEl~YMW8(HI(<aQ&gKiGqVGsylta8jGd|ePJEd)EV0Po6QHMRs z*K!+E{3W%ek>%&hJHfzy>Yy(eIj)xbT1Sl(SuAx1_b2ph*iLOP2qlB3CQ&YzPIY3M z_JmQ^?%^%HH&9F^3v8sMS4^MMKS7wO(y^PHE0b^uIY}Y1(-tZhGCbp$+1VmLhxcwF=>^iPMASFcn+j3&>j~FQB_=tI{noh0rlQiyxy}~!{wW!3XKHM z^#uNnhSSAH3C6LCal(q9FfDIN^9I^N|8{;Juo#^CSr&IT*FO%2qGog9Z1+O2fy71*f_j%_BuHo>ihrTSLR%kU2)%g6+=Hi-+bW^qlk_be;=R4l@cK+tiPxda}(n)r6HMtNmSCu z48Ap+xqF5`^cfZXj5#gf>^Bdbb(?ap)wK4}_1+^9+nssH+4DU=`72@7FNkQRKGAh9 zZNt_gwua78|BjA-f(c3be#(diAgP9-eq7(F&Y&&hXaC4F&zao6M8!($j@i{2vSxVA zz0o|l!FT&G0ZqT0KHds7b4s+Dz)1d)w6zJV^aVDcL#W!8ZM=0n9Dz(k-+|oGWatwC zPQPgXp z>J zs|2+x4(z+op`r1?A-Ctq+tTxU80`InmcBVo?CfH`y=8OX$OjW&X+X@h8cB5&*tB$V znj#gHlQp4Y*U*?UGWp`g7O~$Y{+q`W-_ng+<;M@@_f(3PM=tKj>FKU0gn??>!~%xB zt2lTe9g~x_J`BQh-&@4={aUq;;G_zzg`n&>SE1?OC`ft18(Ab@D?HtxLC8bSLKFgj 
zj=1I}=R4^SZIw;7K6>#HzR-Yb{)zTCEoo;&4(V5us81yxe&-ts&X1dQQdONXO%%?A zZ|X-rba>+m+7P*?==Sma#;a5eQ*twV5_$BvX=`xmD;YZGCW>7fB72%oC2o$1`90j< zu~&2r({?>%>31k7mOYrSL_r$Rsf4`q%$b3(4ztb7D zJ#z%TR$l~{>%Eh4!;cqyLzf;1dC|dJ z?-)=U&^Vze<$a&DPUf9yHA(fmL=-YzsHO~Vp;4uZQS1vb(FhqXpZv=%u2xY^G7BMd zNQfaSTF9EIT-Ov^#C$RR2P~|@#k!18ou=QnTX@CY-Ep=HVt2LH#6CMDaovh~Lxf?Y zxAv~6gn<^_W?U4heoNr1rt`f@;}AvUA%3PkCeS)VDkMkXDk4i(g@fs5 zP-(YVwLIGbMBhR7#q#!W)zq{61^U&ku3cS^4|pxn>;*f-GO$?Gt&cH>ZTxbaMZczQ zo<+cU?3k{1Th2eK)>&mMw>R{4-%|65|fr3Igsw@ z^<8gIRq@<9ANlGtZSjjxx6it3zmX>MXC6TY{q4Lg(ErzRda;r@iw%FU@JH#1-dkp$ z=ZB;YHRP37lLO{lbb}vC5>X%hu#xl#ZtbSk+SU={WLF!Ul3_jzyUB+D5jbfPSBlR~ z4IW@0O9{7@MshQD(0T(B=}wfSn-R8ZxseO5TI7EF@bY+F=SUO<;2*9~+b>0vUu)mX zPX1nOO6VLFEO6apsSXW|ec*#dZnwP;x|K@~+-dhyCR^y(AtI${qE89LsXJ}@)V3kt z^$Zs173Semv_s?Li8(oEDJ-H<3l@1i?7u?OnrP%DGOaF5EF2waSDKT~hf}z(2k7uc zM9$JYA-$FIS6OX{VevH0gzCxucbLokt7~b#dF<$YYY2(3gSkiK(OSiAufzu_D1GPA ze|&8B6!Zg8zZTSDotsl_U5qRAKhMVN(Sy+|#yHfLtB$R%3Z*g+3c4 zyWT*Cdft(*T#EnCFFi;MOZ)OIeJTpCfNw{F$mj(xo&^prxg6+y4yt?!>FI5`2^ zz3DJtH^(z@Aa_(c|0tI!xC`#6-jE>Xb4Q)zN~PV2FnF%jh99HjjP)5h8)~r7q%H1i z<3)IWNDLyAY=4v#X%?z~RLZ8%tW+m8K6-|>YRil#gnc0;y$vS{IhmrpmCcBxrRgg@ zLmbC!!Y%*2^BahWV4Zo|(TY}Xp(@NL;%aqhD03iiw8_jVzu%7}NVYx@kr$q=%dKAdr~2{e;MzFU*hV;A<7r%OWdwwIU=xR|~4 z2i2V3j4$ey)UuRE^9M{$KRQhhxjcI!J3NNehcMP6e~81cDVzx#r?-K9+vHYt`OHJDRx0`+~^ehP6zLq`_;j z6vnibh1#7Lm)+#!()i%=TDcW9bp}8)qL{BS<|^r+zo?IwdzxvCG8)K@tW7&spj($a z3btr{^B+a^6jS_y`HA$?BKM;lL@DvVhy))wBQVverHF{XP~^oa53wsi3I;+JmS-Pm zo((9~Jv9YJBa4lVQ*Y0uBVTTyEvl}jJ%4rkuM{V}0cX{Rai#!hc=TFdNy20eD&XMn zUhC}AZn?BL#&?Em6Fj*=9aNJljD;y#wJZ}YD$Vy?;E+-+$^SO}!J@K~MyD!xg59#u z?VJ`^tJ`P9>M>uo3m<GVdop+grC%I#6+uDXKww#XVxnm5T9KvX62dHXE-$jzd zm1;}&hyFqy9m+N6smEziaB;t{_;Bj;VBFbMYx6+kde*Fq(d@G}jQa+lKF^eVNxIKAq3X_1NJ zwTUW;PWS%hG`2EsIgLF=7$zRZG0sRhDQhCYYyhc)G^$_(BqaOK+vVpiYlvs!8&fDg zC5G@|iYC=A_9N9aQd;9psRg7teIBlttivnSPYDD` zGtM@~N!=3v{OyV>)fGJh-||@V0%3f17T4Vy`*e$SD}x{PQvB~$hYKYqBO^xy&*rri z0`@`2a5}eigggiWF%$IKw`>hxdVt^*L3(mvupTeja@)2Qay$E{3pDZaw!WxC{5L`f z(RttH6t?!1q>u?bj!^`gx@Avl#M)v%+1Wi%57F-lsjaiLT zIFuTE1gHD^m8q$fZubf@vD2YpSNWak1vYxtNdS-QZ8+9MpqMY1(Hs9sWk5vN8c-VJXNEN9kRx@sSq_l4=OdgsybPOW_&WB3{lh zcifzyi>CBZKy`M$9OS&)e0OR%5LGOp^+{Khl#J{)F$&F72JmCRw@g5dxkRZ1IuS+S zExX%Jf(uO{LzcWkMl6yoTm8|a`g0w7n$wTox!5m%S4&B5wO9AhF7&1a4XxaWMw;Gy z?Qv$hzn=V4MEiIyeZJxz)@XPKk=WFW?yw>IM%#fIHtKqX4-v>()3jyh))RGn8K-B@{VQWkf4awXe>i$wjy`TgVZWSGs)V3^sdXkXww(QW zQ4h0Xu7uFMiRnotBb1bzOMWQsJ+Ivk2p=Er3w~jDA;eHr+UQfxv7noHE-h#H!3gNn zNeZa;Oz{7+ZfiQmLzwkJc}YTO@K~){*%kd*rN&$Kx7Gp)iB_j2IOZ1}ZCI3eQ88b` zC_cLH$!7npE*HuQ0+t*H^`TOH*SrCh^Gz{jr2-|j#Fy1Fb+Q+Hu$EY4*~D#@scl+r zA7tz`UH|-NGV<#lsiOmq@QNhXSsELLZ)5h`0ZdGqmFA$z;gmu-sceTA?+-U81*=at z9Y&p&y9%B?;T@)n>U<%n-TD>Qu@V%YcckfMHf_&*3|G#)Ai^UE2fGYO^G$t^^6kW7 zT$xLJxCD7e{6icB}gUgnn@;!F$;(4J)ni~6HqSrYMAjQKX2!gt0i zqA)WEM$rld%c;CjK#QTdZijz);ezxj;DNPnR-0kp2_s84iz7nvQP5+DWM*pVX6?a9 z7%GrI?)_?~$_**kemg&JYK>BpR>{evL5+NZ&b+F1My*XrQ+pE$`8G%S+Y#sW11?-u#^1k#Aa`--}=n z*Hc8K;pm1U7S5H-d;VoMYWHa7!7tL#oXXTaO3btAg;4N~C`WO?-1?WUF)fDcXWf9R=Ta z+7}XHcVdDDk^P5;D7Em|dukr-$_Ora;T z7<3V1naSE-&@PHQJx=`4K2z9F0m`CoI^Mz}I3Ha(W5@c!#HN~k3>dq-_u*QQZI>dL zHDo8S*SD7 zE}r&IxZ}Tte_0SeGe|%q*yS#l8qpx2X=%1gg+V5N!pPEJkk+YI>lU>~{VmQF0mdw8I_)<2DGZk$V`$RO}(;3%Vm4(|JY7`9^g;Y z;(k?nHLyKgaX5=M=!PncTeXFRf+sqX9K)*E8p^W3 zC5Mhq0^&ogZ^GxxeGP??~0^$Q(78UAU#P{Tt1NK)B?CsjTvQ^8>6K>euKUatA4g zIlQK*quDw~fhWG+m`Hp2J$zgTr1VSA7z5odvz;l_9||=(rrkI`;oWXiP*srViFyD3 z@$`-1ku_brgV?rhn-klZ*yhCcgcIAw#I|kQ6WjK;^IY#azxrqI?p?LktyN`It^w-K z>o4ecc6(6evlVzv6Hy{eB%VBD0JVHPc68+}RQ72?t)RY6gZ_eE(dGnQO${sxLgPy-RylTPDes 
zBWRhgyD?VoYT;bCB3oNz)5kv$o_Dbue@YhoU1=Pw@!_X=z8Bhh{?dPZ5fC>5N#Bev zOGOSMkliWf)53l_QHkzt{M5gf-el)Ly(-4M?7~D}?+uyI<_tsQPmuk~K?q1Jk3U|K z`O!1Q0#}Pc2?K~p#VR&zoS z8Z729YrOtO?`75XUqtTz-Z@g<^bmVd$5*}IlMtyZY;S{Sn{G9fFT{$=4vc{gkeqdP zrFss84--?Sa5%3h&h4-jxLqUd27|BIP;mJB18Y&!Mft$(B8~L2AMD^FupIVYR-S6y zMzs7HAECPRuOsKl>*961+=h-!&15G%iE(P&XQu9>f6=@-Y_v9e})B`M;) z{TL7Gy2O{x3g`^p`9o#2s5@PdFOPu~Mn?)Nq)+Jf@=o&9{?N3LM}DPL6w_`33%qa$ z6B9MK!>Cd8tl7sU1s>J%KIiLQF|^8eIENI@%6hK|U&ODsR!hk30+Tg=MltXh(%Y*& z7~ktX^k{z|fQ+TGbUR-gttHYFN(@2ru-kNPl{d<}oP=d7D<)8y*}nA^ytQ3I#WD_Z|vgV=IhF^^XyWzI@|s$6AvYA~7`hdakAwALsRX_aKn#}J1u{?R>JkjSZUv{mD`5@Wx*%P`jmGMt z8mlWRo+@5GY60WUKNx3}vurh8EkkEs8pQ~(v?8*9&F?O4&KEt-&ko|&*3(5raZjBe zh`hfL?34;)TMfyBW@b{^e&)jwL#fsOK8>~N;JAc+BL~!i^&p?x}jILbOE zWOWijFk$YOYZ`vc{Ar>GoaQR|wVxGz7LQdP>}(hzS?@f{C+zZxNk88P!b0ZQ>1@UB zrRak)Ga^rubbh&U7Z?|K{(Al1!Sh6FJirv>2D{_(v3XTp{9{FjG&@=%d1f5hHw#|u zr&anFc{PONJ2FuRY;maI4BK_IF-we4f}8*c@O`JW!R{+?^(o7*yjb@?h{QRPyd&iV z`MPcM@NMzk(|RF;r!gG?cmrHjE1sawf~D?mL-}PVHN02-?ZHX?Vz>NVMqZp0M9kEl zgeh-3NIak6o?V_>aD_C#b)&4#j@b5suL2%dbmJbEAV%+H&B4_gx-0h~U>>ysZiS;x ze`hL!Xa1J*i(q|D_$q5;UZ|MmQcbYf(Ow`>j#>B|qk}f-1=mZ-A-;*^BNmghTn=6SN6Xg1;K#LDa>CI$HXbrdcr_8|&kL&&sj%H{)Sp&xte-AxyXXz27 z+>w;z15ir&Qg=%y-1;Vi*Ty@R*0M@YFRE_)NCZ58OX(MGW8)z$t*Q(z)F{?cbI(%> zSiU!u(GIVeL4{C(3Mj9? zcUEl!b~k){tQ}b>@qMI?qB2Q3?$AJk;SJ|bEV~pDW4I>v&Z_K?@j|mUaJFw}67|(@+bZXu?D^3$^4>y*-^QA}KZ^`Ud3>GEp87U67z> zyk|hiZ2Bv&o93)DKHE;9ZFNgKb%{m5bgG))hQAWeHQ_z&`NnuDN!UnqZe$2Jk`OT_ z48filn5Qz8mrUSUylw{Hm>gH_@1rY>uX^I9%bDLy6Swz!v6tE><-HTuQD#zTq>Qr2@|sk4`YCD z%-37HlYouCrh9t{FfWD}trZs~%yHPwwaj!k2kU6nBuF@$X%Rc8%M@uW-GAp}J5A{! z>D#6Av+#>`>9Ti^>r+`Zg!Ad_9-)8y=yH?`Nsvl8+#U<;5oVEu<-hv~mFS4GpfSGm zHXD=4u>6V!s$J@ef~IFRhn@Lac@*sR;TMvD%4edYZ)-CdkEXQ5ZAk&96aDg*NL7%NQ{#a zB9fd&tdEBHB3iYzTxE*tI)SY$Sho=8da(X9*G_NH2&(d zYOZA4IJjONS91$flSof)`D7p8Ft?Q8~qI3;TAXA)}TQtBC$nV}%;5=$Do;-*8KWj&BXp!Ia!x<+GjV zCM;NtC@AuYDH7^Bc*f0c@q2p)6d&oqIe!rhB`AYtkirWzK7$LAQjCf5$+=3s3M}kdBrw`xtR6udNwtRKlr*25|3Wzf0 ze$|M~mYeNP9<|IO*9D;Q?xyS9VX3Cs*j^iZ6J{-YmZ^ds32~-bNSq3(Fsh3?qEL?n z2|7EWRSMMX2gu+)d|jzR5wX`W5|ht6J_gLt`kL9}-OuF4rw2|+Ye7m1x6*Y@m>bN2 zHsqV8c(+iX;_G|dI`2DYXp-+^JqSfhRv%8|{=r+HHZ=y|A~(?e@**P1d{)`k{2tq1 zsxaB%#)lBMJ%x7hguj*sEqPiqXZ`l{?>5WU46yNBJ`rAwa2dOOU*oH~>A2W`KDDYzq$RgJ(+Fn|q&B|U=V;?X_EKe| zbT~+zfgnZF+Mg6U*Lot-F`gZ*N6*mas=c0(^yd74E>e_Zv>D{cw7F+t&P?_z=&31` zc@;C7m_YIHAa31h?I_|spU&HTNBr@gBNVR_NpNQmQViYEllzmNg;-2)xd`50P$A$a zA(0|<^^M=$D2{Rp<7#&#Qt0o*f`V5ylVELH)XIO(C<{GZ;%HzxhVoLT`B5{?_p%^g zgFL^xu@l(im!*A~eNZB+6-$c_p=122ZO`;v$Kq&iemFZjT~v@KCpW03bpvWO|6?(& zauvgufgoJ`=;e@RoUzv7hxLWOgB|0M=>(BWmDu_2Z@>(?wYL|vu+}ckJqKPoo-P%Z zm(!o>Gp#3HnoQ;^&aqXyP}0eUN`M0VNcnIv8cgr6U_e?34?OtZeVmm+welMzcEc+v z*5TmmR5(uPEI2+v{ofT%Rh5~w3Qo)5GUK=TJ5QfykBNdT!`H|wp4nGS9h)2b@+`|& zWF}3`(?Xek=VO$x$#hc79Z`FiGeLA@p%s77e6I)GM6VYEUgxr6OcmPUA;e;J;I*eR zVnxvKwI%})5T*;_0&(n&4ZT@dhxWjyLyTAzCu+|?5f8kwaJo?)6d*#oCYQHdYFqG> z=z4b)aY=pG4FvYOTO*B}aFrclfmymwy=@|scxF4>3a(9ykXOfOlEty;nw>-Pl{ubH z*VNy3zN;fw2k|m`xEA$t!`xwMn3B`@YegDwcBtK-{QA0u_=5o+E% zi9>bY*n-I!a(42dZGyH9!YXz0Sg4S!6}juS7N?AN^R&5P0`9vb2^ET?%~-_C%c3>Z z38&Qv>zUqY6iN}t;>F`l3SF3)K?*RDm`R6>3SXwn(Hr?Td8%}x1WNn=&g!?_XqWb? 
zX@uoeAqeSzH4Y5KfP#|YM@P}4pb<_>i{K;2HZR6=!yIx5BP7q{xe>)>JGf7m9_|NYOyTDZ@#b}@F1UgjsUnj99)Ar!bX-9O=#LX~_^?LR; z*vJ^foqJrj#3qrENN>v(gEc$_?Rl%)dv^fIq&K{fHy@Iel-9mi_5`ilxov|-93mfaG=x(lQ#1j@-<5iijS^DR6+VGQrteQa zuL8q?(7P9_V(%SZLs(6EyzVdyd18dN@{m5dL{4Z5T_vK9V7LHjuBLUz{sK_f&=k3` zvITGzZfMfDR`<)Bm=hYP@V-iO(7dWIV=N4I)HK)->Wlh!X2}iEUe(j_?hVfC{7^p| zqq(^kDYuR*S*pysT)a2!c#b%9e|rg7>Ck*|-s7`rulBuUY!($;&!)0f3l3RXRb@QV zG$3acuuz2oWY50w;zw4Z=zWWmQKe>Z&nh-h$IOD@#mxg&X z|Kv$wC(QWHZRVW?mt7AN(VV{c{Cv5ojp%uEz#7zGu_~ZQ0Rr6o-Q!^XdN_$|(-{{&HF%EZg zcJ#aeh1~=_br@9<(Kgt7*Awe1+Al&HV8-g|7*1S{!^4NMaUb`|ND0Sq0w?E5J-xWa z#_M+te}J9_7cNT7pYa0eTP-udXMa0StHCnGFgqD6A`s(=@hfH`;`CRQvg(p-#u(C@ z5_WVY1_L;Vrfx;@*8X!E&Rb2u=zFjI^f`ihKZ##^B?)RPFJu_#t!e9JaOA_z_K2 zTc`hH;v))xv>2TZJjkiC|K=Mi4GOAC%#o?XX9mFsT{imxdQ1VsCt&mA%afmTDcCZNL`(~!`eUx8P~g(&8! zir9(MK9Tm8wpOp>vIEwSPA6}Fh4lH(J&%xr*4A1Zn(xFZ>l;Lmb^>y%;U(qdcG)CR za&Drb*`rzN0x0?YOKcXJqKoGo&6tzntihDZ=uLdayc2_I58RiXUjbF|}mvi!! zE%S0rRE-tK0Iqjnm_eU*2qk`RH=4zgEl?~fl`Vq<5rFu|?SH59^x?VSFUD3v>R%n- z$?1%={w=RhGWB?X*w%UE_4pfEcbBO%r}Mkw1OGLlj}HNW6qHb~B)6vXQ1E!efM*;Z ziUB01fSJ+K!g17MGN5l@iPKA(noLO()etcxrQE1gRszy^>%eJsqdZe}EqEv1AAJDa zc%^bEk9^5v*N9Q!2rJ9@igD-mCAq$dQtPhxqh~$?5R@}*?|6jTgRbw2E0+O@akKJu zqGQSHj%e-y&0jeTXeMg6rvmEUd~O~OvDj4++n2wfqVS6@AMHpiSQ7)l^?B_{_Rpw~ z^hJg9BeUbFPIfoj6v_751;&$`&sjdWQTj17Y0(zx6dkG3b1#40vfHfHMiC_CS*@Tj z>*^Zm=%f#iWgjq48tk7sUbsw^^U~mdD@K>TLna4L&UHIs*k^J8^MD(0T^0}K!(B?M ziaJwR4V{QEu|3tfA2{!y7=!bn_X%>rkws4X`rt_)6dP+VAe9tv6t>Xvq@EV7M`f0k zAY!MA)^9qoW*9KHffv~NV(nL|KpelJL$S7g^7dO6z=homKojQmsJ&O2wc{wE>W+b> zFhc6UTMR<{6Lod4r@Rm}z+5dw%4%e9ZHrXB5big`uYbcLsWCx#HQqQ)N=B5Nvc=?s zm%7zUecY>~;o20C`mLfiCK?W{6~UGM{~ZCZm=EM}UgXT>dv#Qmm0H^3 zV`HCcxXF@yot@pT)w1LSzNSn>VInQD7LWK)T%;Q8#5P&vdP;sDj zkkIv0oKAXRVJs0K53@5J#xiOBf7;vCaFGU=f8#`onoN}r=TYB1k4ZBn(Gq*CU$7Hb zfQ(t)Ej14H8${y@UTmJ=GbK(6@gKOKJMlrihm6Ohxpm&WJgz(~ z)P*Jl&CMVI`u`_pwf5v`{<=D}-TJ_KlPn~J)%(A zqCD3J8>4&Y2lUHR?A~4L!spQ8BAGkntaU9@;_COXypfm#4ZguTwArR~rxe_2?j*xkIKPVqkQsRMsoSw~ZY>-)cM?%ivLpTop`!=0@!?rp z;jE-(jWRJ)jt*sW5C?3}7F}#8P_aU4>gWLGg}kdr&h5SE^pI5e3KF8**xxXKH9WFa z8Qgc7c(!o+14MszU_Mo~B4IcE@)S<)1MY+@x|Td7{v)&dFFTQoL+@Si1*7Fcnf-*) z;4gX3OY%uPq0Pou%}+0C2pgE_6jdV7zJp1EOo$)dDg3l}InzdT=Paddnl{E5Ye zmg`dddxI(A;EOnT(&vDa+${d`13>t+f z-}$|)L}+~J=(Lx3te7o{7iw^NMEB!ra20#=l*^pg*=ihpFf)g}Tx zrdiiystr`El_u9ZyXwR3o#}xAMR#7Xv%8bRo*PC+GeZLk2&fiI{*<^lO3BgD)!U6B z=z>Oc#g||oIs^a+t}CnD&S&s!)*|XxtDknJkAypIoqCp&W8I7(%7@#mON`?QI3F^0 zg{)}G^d?Zptb91Q^bGp3!C9*@yTbZi`O$E-hcvZu@Ik+YvnDNGing+2{Ij#2v(*-j z9v;D#IZ{*_ek1lxr`4PyC5f)}{?2*Ao^3Waw9A3dSEjQd?4boTdnyB;HSc|#lz@fo zf_g7Y@56`h4JP%YwXce{^b5ZS5b=^^emRn@d!6E&@q3w&rF7B~zdXD^{A~Bw?BI;l zg7RsDAp9d>4bHn+Tq0GByasiLyJp6)`P-*AGnv<_tsy32k?k9SK+l?u1LnG`Bdn%| z8w3*Mv22FC%kl56fQMmt7!r?Ou&~~X5I6>mrmOB?j5~Lr*Ap=^mvRP(2;(6GJGGbT_aehs zu!Q(!ey?Wtsj|bAR;b#i)zsU#wy_&EW671162iunF~4<^tQ5;@#}hibGaH#KL zm6Iu8ch?B}S5H@M4v)AROZ9pK$jpb!Ri^DoHD7jRWtTc%k(}u}ekiD6aHcgTTiguv zMR)unJc35gJzm2czuZ!T^F%7{f8$mUn}Y?qJCquBbs_*9`$A&)Y`oL}VR^r1GPW09^_eDj8_<_{%6#wofj@eeyXz z>L8}!A2nc;3+-G)wP>>c=2E@|^tuMaxg!cFc=k|L0f7g{9wj#Guoe%Ox%F1S?FK@5 znPu~}&#S~Fdnh;knbc$W3d81?vK@sBhoz!|s&&PAXSHcvM;jD*&;AKLVE_xl*kblC zi>3Lt^KBWsHA#uGh$GEAvG&MR{g?e$+zc+JztaTIEe~EsCN+4~zr`3eRkf^G%s9lmKZcwRlqag{(Ohgqe z2FQ@kua)c!k=m27q;<((kcBE48uWflsq?!Lm` zzGpfs)3&YV z_CoQpaLdcp9~h0_upTPcZsr65VfM2J?xaPMms3RW8-5tco#FwVhMa{oekxv`5Ax~v zz3kc1!A6w#`5P7Ox>m__w#6!%%4&lPEqi=oZbWDf3scz4Ukat4eSaw&AtB?9O_fm4 zL>1JQv|n7+HDYhYzL>~hZKBqbx&ycBS%B+P>Z{;+Ig!(1Tc~m5&{megnmUQ&;5k!e zUOYu-Y=?{5t5%Zjq}k?_!5GgLHoaf+Lb!r1r)y;eIHveRTR1pnDf4kZ$`C#Y2l 
zoJm@D>3$2t0@_;nS_}jdjA*a+sMLt0nwJ|Z(A1f_48JIO7EO)hsKBbLKMiGM*ro7C zVdIaphN`h!>X{FO=RA(7Upt=>*BF^aP_`&jn3BA=^!rHv-EJEfg zI5wHU8lxIJUJ#9M=$GO+h#YQJ^GoCS#0u=zLPBU|d069qs%Rt$v`ENl)xL{IxI1PW+b~Hkmkl^`> z*#5RXH|Uk>Tw-J65-D)!A=an5#@Pz;$?R3lfvyf?aWv)w#(prnjtC&RZ<_V~(`Q=v zX1YQOh!dTaJBQ!=E$Ih6gE1?ZH`t}$!A z*1Kmb#>WOto}B8VNyIKL?BlFKFapc@)f|g|`$N8|l13Me??imJ{Y7<$!W0_*w7$n#+reyV>hgT%knpuRo&K#Qm& zRO2r21Kn0Ipy8v#mUlq?Y;3KcsMGKKLbcWK)51z~oUPgSnl$KMWnlD^&C*I&%H~i`#FdjLFsMHxs_nI_XnA+zZ z3ma2qf$=W^I{}b>a67Uy7<4AOUxYi~{>akz46XzNRpantI;)UPrt~&P)4;}ilS{LJ zlM_dyJrzz?_Asw9NzD3ac4lg`$?tst7Agv_-4egK^hCS99t8jg?QM8!%0NiV?Nah~NZpB@Jo) zriWkgiU4WTAcgSvbt_d(ZnOxT^{A!faF1Wh;HKnoh0yloN#XyH-k;>pyS(q9G=W7s zz^eb`e*~0)lx;|lPzsz1fA=iVo@w*{NF}ARo#dG5Sx?LjP>I?shj)z=b%+ul-ch8n z_g$F(Acvw1yjqN?L~(wz97yFkyXQ-AKy-@#j760Tup?a~|EhJQmV-VzT8S%F z|EccehT3ZA?7(%#QZy^OV{$amw~vf@#Eg{CX5&%0_g@@N7tx4`#@%r+SIh+lf3o1; z5F$`U3V|dDKxK_+)CGZ15%B$DJNzJ}ghGA54O3C1ExShc-@XmK5r5GZ6nwHTu4hgB z4sO6|1W^KahOb`Ng3R0;ABeodz~ioZe_G5 z+CU|Xj53}OmIjNF{hwiJ8^)CIZ5Xⅆ$u-kSc6_;XPr*1myn71D^qjJz?9$ja%mv z8S(?!gL|Y6o7;>HKw)TtTDjH)~;P>ma(|p`thpb3v|V>(6(QdArl` zvy^!|v{_9VQ|&%3@6Ky(z`z~jXUyt1MPr_JAp+t8LV0bCb<$n}@)TA7rP|&YP|#w1 z4iSJ@rM&AS#LD5}b;Z$$$+)z^LKnG0$Q_;~D8LiHR+wlTi~fU|37h@J;$kH6LCR8Z ztubrD%hG=^!RWkMiNUZ@J7G70L-TGu3JouPe&V6r zbY`xB5-`@|*;m@TFMfKP?#xF3=HjK>(!r;JaV0Z&<5*i%N(<9g7)_DzzldZ!DvIJS zf{-=sRtg?GX>(x^b40{^uQ%BJvlV^TIeeflL@zH?L}mfDLTFA?GLY1?Z?Gw#AgOEf z$QCZ>vF`%P@pIn!Oo3!7L7n|PgNqt8A4JoTfBR3fl*NA^h}Vjw)(T2Kp~nZ8YsF&mzl^I^}26>+VjC50Os=YKpBr%3sH>nhXM zq=UtiuI*goe}5-Y#9)A5;}Yr`Eu<*Ra_isCa)jfB!I>NXa*`9wA6RAlkzTS zVF{i5iQk|9fc_Ra1{KYViG@c0)IHjGJs40>;wzP$&cp8x71ZD`>!isJ^^_pSckD!bU?j^Sg4sXG-VYZ13QVf1+uAV$#J7R)&2fkFC= z)T7wL5$pQ9H#As+QhG~#8!?gu?`w9kjFNlveQuGZoW^ov$ut-hqYqKdTua*8j@a9aXTIsUl~j-CODuKVksUl-_&g%BVt=E@4U|zLd}6Um zhA1cR`bXdWx&bChlpR}|e8?RGIZjr^ZbDy5R|%ZbgW4c$qc63Kw!+!4bF?ovJK2tJ zFY$JcKv!Sa=e{crgrr!;S-*WB-mFdPsbZLYM8OoP z<_+SkvpF0*$S-U=HQQbM9r_JzAq7DSz&)IR>$EVU0<3Tu=skNBIXDA#M@l)}^{6?s zd})No>T!(yRLj4P>ay0hC3e@VKg6SaosjkHzfjC2con(_N>*$inaDAB47i_{QmF^&^=|*ljx-HGqmVew>RKZB4F9%WV-f?ArArlW3@dPuo@`1tB?8ku2ZD-paY=zSPYE_ln#u~}Fsq^rUi_ySh8AG<+hQSn5P2V_RR5^I`KFA8 z>r}KK?ydZJh`2ybK{iLKMS^qS+u`|HTWQbwLe;7D2nyW{ikrbb1N1{6O&!)+;1Lmp z?N2~ZTPtK@8r(9v9Gc{p|eq3Z!wyaWMfB zt-zP}tJc2M+@Tcb^Kaf6&+#vulr&IeiLZk@&E5nonLDtcN-K}@?d?()fEl1qN98OsvS z9SYkdU+=Idq@@p}ah4dl78FyHzP?;gGiwTDH9cBiSa+X-R=lJgR1K248J$M0TD;ST~UY)EL*ZCS%be1Q|_9-uPOn) zUzNs&+G?%@$ak@=mgsJ+&Ux+L8Lw|I!<(J>p&Aj!AKq$DFL~!Hs9utiZi6z-ns4|( zDM2G$tcU?bHGq64iD^PYfvN_8=V0=JsSLtFAW1|lo0276ug1fLYamHmnIO*oOgCDG z)|RI(J**bAf$K#Rh!d!KV-;@&>3kHpc*tHoG055IB!KuPW0gA9R|q-T=$;A09sxme z2dE>mYa9YVcL7I+vuEf=8K3*H?pp!dGE*<`v!%(g*h#k82c;o5mog$u7#mj`uU#cE zK`IR>A2ZXpub8Ask!_tv&!hO9L;0Kl2Ik@6%x^HzJ7N0~l+s zEGfC?Fcg^0$M^Oen9wwAJpOlI^;yXgifGF0kHM+A&3MIROwf}7ftxLc!<)f|!tDUy zfvs^@XBMhy%|*`FhJ>kqe!_9{0k_YNuvu%OT$uzAKtg?A%mM zF8^}+{>d#bor4==AXwS`ToEKIbcliQ5U_Lo(=uV2Te%yQ$P9Tcl$*(GVrno#zhLe! 
z7$s{aUHCAurG4GRHL?FGr*pB25mAB!N;bHEd1?YUYPt8lg_KiQq)PfOGIydVb1E8~ zb=xN!x^c?UeSnQ-6|pAiXv2b^^_YAy={c!1o@pRB3%Qia%SV|CB_{_6Bz@ikFp`y8 zgtm)x_AVEqS)Ks^8y{Y8z!fOyQB(enjqq|}+ccIHe7981YTYtq7%5On>a&?;RH-Xx z=j!@o9~cb2AjcbMvI}9vR!{!cJv|bxGRw{AY|e0ir5v+32CtkstfZk18oCRhO(W zQzq^fmHXi}u2e-RcyA~|DH+`9yl**+G)j^!hhLt=a8x44Tn)hJ>)idmLxMjNAH(~{ z`E#b&$Hq~TKhfMWgRx@u0aGP)c4kT^2LlOyePb}A9o}m9cfEfXm4H*fK?T4mIP5{J zrC~($^-%%?yMO8uJJ@vt0U;XqJK^;LXr%gNJ7oosysM3#^}DYE2tXQRZB47J4DY>L zs%W;_>SiP6fg|}B!dJg#TvGt_8th?fO8|@#Dhl^Ajs_JFFq)LsXc>G~ zR}AD;JT@bhv1JEtd^G9x82p8O*`HbW9X;WKD+|8smB-$Qb`D6NZu@Czgt8Y6{fHCm zGh=w85tq)qcaEdJ!1?IO=(Kf1Md7QUyRLr%*3(F82x~q{A#ke;73Mx~I1oU*oLO6k zYUt2<=+tMypy2aJY|9{H9F2~1m}Fa+D)~vKFGBPnSu(gHH2E+z8R%NHAEg#0PrdkNE8W?mLV;!a9Y4_yF;nY!?g28KfeeS54dvKFuVxI_ROU2OZr zR^I)NN}Oa!Cv}+GT{m|I8WF^`Itl#(#w<*aru6ki@|YAD9_rgn$6t0$wV-OifUP%6 zFF$*ji|;{c5V78g`pYj)B~GP#p6jd+jI@BUV@Wfwn!w|_DX&((2uuwC;Chz=R16MK zOoOWHZto?VacuM^E~l_@Sb>5`prw`RfF}*mAR?&FPjF5D~=) z6)xQ-C*5Xho%dz2>ol&NPf0BMp(KfkWuQrsXtF$!fJVY-JG-wNg_spHIj|0n$=?U> z;vw|~PFrTDb{!FjEEZI|E|;b-H(>&SODb{_CU(dU%daj@0w{XNbeE?(S3CS)?#>2s zm=7Vsp^f$m60}CzlsnpzEa?j$WpjXi3?lH?W$}#w1q3|qa_zQLn|^37ng0r;gf0X4 z5&i(ZhX{M#$v`H@gBMeQQtL;jk7;*DZEk&t7aJN$@M?0zBiRPZqfXgc~s7-Ep zB>^pP=W<)AxlLii;cn_YzMut^9S(HaN~X86I{!>tp@W|m`9mIFO+|%hg-D4=qOKMk zK$s6P3EgFqEy5b>6Xu=atWFQ)z&b;>hladHZX~REq|9 zjV4m?@&ci;yk9P2=jRHQN=W^mtHS2%3DD+Dz{dNuOeW5!TdUbCIrTqeukYW6BjEYE zqO+prCL)72z5ld0kcZW|^DSd`U+5P;w~cF;JziYylVtMiJUyW1O)DdxOv2|djMt4d z*Hf-fwN@5G+rTRQ96$<~{sBDoSB0;Drlatennw4)d~f)lXaF!uhCm;&_ylv|qH_te(0ylM-R6)u@U0A}mI>txeDler`7y3&rn{-j5={l;zW z!1%6;E$hM@Acy^pGL+Czz$#LdwWFH94I7SVHpK>a0LQtDgfZjvb)Gl_8dEtsD^Pcd zjWIFF7*ARxrl=wqL$zA^glB~xZ2=Klp{Y6N^76;lR^-|`ZeZ(C?bvX!s+h_<_BO}$ zrE-z4F$-2omBTkgbwp`P&WOg<&bJWNpLl?DtH}S7o-PB#J(CwNTQAP-=1yDVd(vqS zhUsPqA*N=ByLyh^s_V=AhaP9XDM@Z!iDb7^Dz`cX;n}7f(=KF3tDq0YB6&dqBXHH+ zChP#Mm)CISz8+ei!R2S?2ZkeEd$u6+eVFMM`%k>(f!EUV=n5tG8_+EU6~rNoeTE@o z)HL(X=O9&HoRb?ZqttIqobFRD1rR{XI^|>h3@mq50cWhqv!+tP>~m5^XF=zq#`G>X z;tw$}ylDRp(WRyZ<2yDCxhjENjmM8?j+)=Pn{=q=PLZk`)t^0&jR1nNzL22)J#D+p zP1x4O28qVXjXUttTa6=SMrcK@JT~{w-BXa;pQSam1>rJ%%{ey75y^86aspt~LTi8p zQ%Pt+g&v6iH~@1;WQbBkl$CzKe>vUXuK-A|`eqO!Ir@!CEjC2eb;9$-$j1{I3Qk8_ z2Vu-%$Wg(0H6UevyzcT%*4)5sK(>@DTY3C&6&4T1>Tn!`?R<$nY$YKb>B03{SX};o zf&n3Wjg^Hdmm9b8l!KW` zJK7g4|nOK8r#6i4El4`^SctZFoDtn4zfgK&Bm#L>Cl>d;X1o3 zvz?u3QqngsNut6a7<(7?xa__Cu(lj(`^BKbhalJ(jk$9y%KxnzR3Ik_$P>b(O4Tl7 zI#y#A4bBEYGVXJ70?3dcfSl^MehmiZA0E>zhbbubX0;Jt?FQmg_w{oAm4B7N-0VhG zH%??|7SSgIOflJas77IQG2OW6c?0DLmgJHXLkYhI-vCJPPqPtun0yM;#wET^VPy)4 zLwlclJK7!jYk0qS|6Htm(PzMdiPvcr-3mp3@(-YdfECG|->8qsR@C1pWG%VJmCjW? 
zS-2|+N~F+x3lzhp0OpllfwK$uwo#Lh`A@NH;Z5ruP}QD){Z9;c0HBLMT)f9e{CJun zyET<+0GJBJ`jLPwPUy0TXZ;1$>O?9T;LHpl@cgnR}o zLf1RL%Y;IwMnO+dNESQ@+@iD|PwL^?caC)i2T6N1XFJSMdfHCY3?Tg0uI({9T!37G zR^cR>?4}lmjKtAmj?6n4(fexKf`A1JCP|z}0R@}GFsB@SRe3}*lLq=*C8IU_d|Wz5d% z(q zA3Jsh3kyTqG6snfOBNwL^Ye9O0^dnY=CklT$p2eINxlW_hnVeNER4;Jp9+2DbEkQ8gf*4PE8$;7D@}ZT0o;TH5l9jgl~B{&MxV zfo+)vRD(9!_T@$w-S(#QV1Z9?I|%oxr9GX=D+bJlisZjNweyq0Ma14)J zPwjL*%<0T)w^z=t-ky%ZFi=Y&F!~bEQBaA7hf+w)iA&#->7dY~%Xhm5U?gEgK-qkD27&p;JO(2)+2J@T?IL&aqUbU2@xd`a|1x%~sIsm-`EbLT<#4PFWIY|7vWhki z+D2QPn&_Io91OrVP4MG;$c*P&Pg6sdL7HGwI5(W{8p_Gx;U5GqzD8RYtnOF;X3ir< z1j}*@I>+bzkzGMoy?L`gO4;;evWBkBLXUovqQjtTd%7nTg>Udstn($T7WcK%a^Z4g z991nGGqLD{<(B1#_t_%>B4l^W;8s0;*YF4CdX^@rrcICxeo$HS`j)Pw-?oTV(!RoS3Lk94HF8s}fZ`?wD&MdyGTgXK4WFGg?`9hY2R8(zr1gX512VFP3+)@8+ zBY&Hb*`e6+@xE=M7~ae5RBlnpgZRs{tZoSYIe7*aSX&B~-z-fp`|Yj4NvyIl5e z+drI{aBb)=c~1f_q>08|9{ysxmLyrhZgT zTJ<~Vq$J5{Rr{Z?C62dM*wMXuU9LRn4?Ya6{b)?C{b$p>+}54Bi`Fj1n)usL|Gw^G z>#6X-E$1lE?t9mF;A@M+RjxC~FSc^8|NI@R(3)ZoJQ!8ZdKi*%{K#Af{`l@jQ}&p| z1qpsjBjc^Ls;`R!upI0*ef4(43o|n!R~~U73m^{Weoo`PiwM}W8MO1XRXx_$zhyI- zifkrE3O}}Vr*HoGkhNq`v}UBARFZ!X`-B?yhdp^+s82Xv@4mC0yImv5@R|UYD-(TV zrx$KgMf!mNF?UqYIezy>mLS19>10*6+Xd6$NJ)pIs@Fo2MRxK;yw3rK=dUwx&N+vW zmI*E*_gCEQlG74lCQHqu22f^#L#y#Y(+VNN*O&YLmzs;NP8WJE9J~iC4=2|1KjRDP z=Vz35yA$BoK_$CFF2wetTYl=HigVEoZk?1-soWZz~x!Xs?Ia^VWU^5^I0!&GS1qF zt07?DU~x|QhH}n@K0jhMh#!gHB`Q@YO4?M)Mzs4&cpTJIT4RA{Hy@OdNHX%=nC=!} z&S8z6!;1R{#ma&I>sc-S-`7L@y?NWCuT9(Ca3H?VBmyPfdqkreqvaV2CyplGoXd<3Ig@R&P+90_43jQS2lx(PKc4hwjpt7CMwW;UMaCmF$r3H=G<-)dFE7-wG{ZeTu`kJdoY*(sYg`wX1(?~|2BMfjwqTMj= zTGesTE{dbSo#72dV9hG>tgXiN^-${(*|p6x!Ggbke%pr@U2?a0Jh(33FuRXWdbe-N zYmUk;#^JOp)#|M?Jkg>iY092C3?8zDw+6t4T?Q z)HSBT$LH=s>k@oS^0Uxi_|W}VkATslO!vf#i&r!;uOhX~)^s_YM~RA(4k}JiZM}ha zAKbIst0?$+yQ=wD2ankj!H&ns8#V3N?ZmIsARR3;*#_UF%d0YwMh zP*zP3v7OPrVocZb{>kR9htp=>ispD_|qNG}?O7MCpo*4b8?#XbX%D#fo4t zC9M!?GHcm;gA0k!^_2ryPY`owy`4V%-N1MZR>rNy-miaZ^&`?|4_A_N{4I`3JHqC+QCz}bkHsm*U-KUsvR}rRB z*KxQJ!iHF-IgdAgN}p~MN{4R7yge(->rW%V6M_kFI+@KKdV?NP*M&#eGwu*lFev`5 zeN2E01)SP!j!!gXoLa391DpK+T6^oTsG_e?nC@;#rKGz-kS=L~p}V`g1VKPRkQ7O! zq`P5&fssbR0VHMs$)S<1@A%&Py}$SS>-*#0dCoKE>=XN}y?U*^&)yq_#vHIe^)_L< zfqY!u? zj`1+O8#1isxF|1?c0kfKY(pKFao*=)R{FElbg#M>81@8?BuvhMgo?ITHjjipnw-SJ z8dvS+E`RBK3b&6_(A(nt!d}6izVPh?rh5Q%ky~WU8JXdqoTXx-DgFLE@zuzUF1^iY zu+}5a%#3`gv82RLI@iXU^E*^5VXb<05{e`b;$cp(r@qsi_S~?i2%-O$i8F89&KeO!Dr$6s8rCq`Rvy; zVXJzS4p#HBfTgq1fihYi1xJh-byJ#9Itp?3)l6DRds<%&Tg%EF<-&Ck$2M;gh{+|w zA?8fd@rC8*b!81C(9_KYX6+S!#1DI$@T`_8?W1Dwu@aE0A zJNL@SMbm*h*J=x5L_{6xBzrs|{JHOZ8T)nXpRG^owZBd2xs9#sMaj)!O-&})l(KRB|IuP7{5&Wj#5al#C9rznw*B?7 zcusxsH|ss5pT6=N??37bx55$+HInJy0Y|&x;-KM_t8#$|)(nqM2W5QzFMY%)H3=}! 
zXBvrt(T9(p=_N)JAIJ&f%KP7&5OQx8`f@dKSAPE{FuzF2Gj6J)9EfRoyU(555G;4J zGeeRNg9VS*iZL@VC`m~t`ul%c0*<=6D|(WN1t?OQ?ym$tNcc`=)A)PMnli?IXmz17b^1To!F-~A2r=3r^sL2xjV7w z9J;YkFnF9tMk*7{*YIy9i7J0K-YR&6t6QU_zVKjgN|k*fbu680=g7Nc8>m+wA^h4M z9odkR zjz07R@Thb&ri0g5FjDE+X{>Za%V9V}2V!28^_9a$7Ad_=cwvTYZQrK5|bKgQO1B9GHO?oNWK ztwQfe5=^nLPRSFlPO2+94m{jC1BYaY1-2(*4x%uZJYzfzDMTh7>TYbJP_Jq5)D-2mLiucLa^ru-zpby+^#O=}H6v=9} z)Du4*u3QBjF$U3Z25x}*!p-%^ExXae3^(5(;e}P&7e(|rJ!mt|+JB4G3=*m0KPeDD zUTm8!1T-at70OH;8uZ@ztFp*N)h4=$XZp@waPd{~g4IUAJqC?Z*7x0$4joJTw!u4s zl~Igcse_uTYi0GZlj!4c`>9B?u(x^3V;@c?sL+yaO2*8Tvx35nS!+dm2Ue97koNs~ zb5fGt8CxZe%#AoHu_+=%V%j0*^(&eHS2`35X}l=Q?9${s$z*LyURDOBlm%(E+B?{o zwBkX(?RHP>IcGZS5U-`j!E!N-TdGJWFtR}?g>K=g)cIBrS5W8u`jSCL4{c-sHpAsg zPA+K7$WnXs!0IWI*GqkuCP9Of?;9_o^f%gJm+XWA_V3{R6e`QQqw>1WqOPsTzKc0h zjUJ*pkj2Dib@D==OjIO0g-_^YV}n@Vbk7st(p++*Lxpp`=hZ z0=;BJ#a5On=KdCUA1=2WtYSoU@r}#f&~Sm9yV20lw!UVtt%lxA9e1oSj?vhTr?;}_ zWt_AW*(ewMa0MfcK5tHR#?<_M7?R4|tbKj8i4yuP>#yAD_Z=Egy!@g?5MPJMgyWu$ z5!LCE8UM!G%4GP#E{n*;)fGhNvZ0neZ3!O9Nd0=s5AIM_d_(5fd0`oR|9lI4)cf29 z8Z=q9rPXlm2)flitGjuL{}%Wp4WYnMSqMM1|L2&_|FVPzw9cc>0JB%9IUWFdb;+A9 zOlpE143F6824#r~s`V(GG^Q|Zl(&lOA-F&pvBIehS=su5^93ItpuNUAhx$}!x3~oW zF8Wppc+Lqi=xA{Yq~2a!Z!cmthS(494kF>^0?ocu;NL=RlWhfr|$S(1P-*jpi z%GFMMntr}P{@mr+!K&7Le?-n&jwRXzohzm|{M7(2(~x)BC2j6Tlg|cB!*khGeO~uN z1yRe%-5SwDa8Le4%+vmbUl9I4QB+;C9UC((*`=KA<}#_YJ+#YZGB2ZqMbjXMyIXTB z7LgttLVj=}wM_>(t(9PoQcQL6okBA*bQH_4z~Gs`u6 z0sPr~akIJV2C}uHYvt_1CarGNeJkBKMW4dkr!6f~+>lhuW^L=MZIoup!s{)WyZM%J%y&x?XT)~q zI4t-e!TYZ|&65wy1{e<1rN|=1T^iFTl|hk1HDLSb*CKi_(^;8?iPSa*z-1Wa=B=Nh z-p@a66&q5?tC%ov8soUkG}z-Tlc|lkCKWovM6SY@_{Cz~~B2na)etiE@U*sqtA%(v8`=@)|itZStThd|&&vQwG@**uBtb^O) zhs&*n=acm*OqUoFCWOf_GVVoijAv|sS0^d5Nef|m7du{q-vYq%!JeI@Uo=(gF;2#0+%->Ylb;y9hxsg=O4+_?*UsG8{gHh5x>v0x=N?b zg}O*wYp?f{i#GENrgb-J*Ght9<7TBE?eO+M?@OpHi*IiqV;;}vCkw>rLc~|q)}G6z zwhBl#>rc-Iu#u!TohcDdGwmX>Zah<4h_&4fUbfAS-T3m;jrlEr(uaVfUzm*U*&!WO z-$ebH63T;3VVK%DBu=dB;T zymxJNS8Jzlqp)UK!xkfeT#+a11scKj{wRpmpi_cwD7p!MVzVhq zpu)(jTDbe0_<=x9NuV=4?MGix3X0ry-gWaT9)~$DX7$7qB7CDyZeGJZlSXWgP!GRm zE-%XJ)ABRCa+VQL}H$E{6gHSVr2V2mQRc;M^0v-q2ElL%o>#8~M3-sbQTDe}=& zGL`=HbBJTc_>-vp?<`A&FS-_*Mw^^?M-SiIUNFjD2uyzt#j0-dJ=(mS3OnUmUX_2W zkJ5q<5kK!!C)_dy~oUF=FP7|cvYd?}v^au&>9VQAHpQOn!wriUks}-4-%tG*e$ToAb z7UX#yR$XWMQ^$_~}=HU%#L;5RrSjJd@BD&nwAPQ7D^(qdnqeqIsdTyVeF3uALuYA7Z&i z+kv-RU^-umD}e~s``0|Vi`$T?{+mZ|%n!bpunQ8p+z+fRi{2i~xoV(JF^@r^@zP7Zom&dY&QZw3s(I%_^xbBWUyr`YjO9Ujp|8%%F z_oMuI4K|Y{&Us|Ho9+kWU%m>CiLa2Ai_BqxzOrY_2IPcJ%R6LzAdh=sgC0PbV5a=5 zv{ZBlj#w!COd?*$`H5>uK!<1}CkqMT%suj$eai6$V;&ad={WgqD@>`o zYv=J`!$17I+J$HB4#ea06}bdQEfZMBRUG0Wuz+6^t+w}xE_(GlcODq6_P1tSdewfN zre4pBJ&^HhC@|jZeuATPwGty18D#B`az(gl$yYhy;Nc?1o!ksE$<#`f9u4BJ4Xj*n zL^AS~>m6o>Afv1F|=7BK~6E{&2<`iY180 zcA@>em=o0gODp>YKH$n{^SXs+aQE3SHhC{CchnyKIA8F{r^^>iIMUUG*e0^ghSVO2 z_TBB}pkB?D>ooRR@Ad!4OZs~u^!IlAN%)NmkR0L^{Lr6XptV{Q5#2)^MPuf3($izy z^{2zbyt6Kjdp^%j_w5(K)K=og{<0$UirYoF5V!<}8eYOdSTH2wPc$@eGQ_`7i$&SV zm%Z6p`Q5LPXI|MPHM!JX)pj&!sY1tBviAgawVq~$8dqVP)ETbRvLVhMafE_GW38?v z|61Sx`#~))g67vO%8%ka#GaJU`AJP(u#j8c6FC{>EvagWzRc)~g;)NglOHk4C_<)W z03G7aRVd@u%fB!D(PW7CGozH&_=iqUCeMl?xO`51^1D>ly8BtSPL&0M{Oicn9?0#M z^x0_R<_v|qLc!}lUA`y3dpGKk+R1);=FM69ptG|{lR5EM5n4aj1kzK91{oOq8Rdw? 
zS0y$s)lJZVmBCOEvWN{Z5#lJOB2(Y31;${|^PrFDk6wy~qlu%Yag~X=)jm&S7fTC$ zKyJ6d1E{l1{Z7N+*wvKG8~N$n?R1xKssDLK^MF~Jl9Ll0c`p$}$L)Jn;Rf@bjoQ!s zyL?2fbJMKWV3rcB?-p)bEwQAYJ}<8x4u2AxFpQNiOTZr)(4zSaB9Ah5RUWX`1*Z~rP&nE<(v8Qy zyC}(-h-hYGQqO9-v`OJUG=Xw5L}Q4ziG|;%(XSpIybkCK++__^lxq=7t^VN$ZnINd ztu#F45@b?7w!UuSwYZ}87f%m`(~)g}FsD1e@0z{u@vlj?s0a=DQk51NmHQ$kV&8=x z&eqAXB=1oItpP!}X*N3@F~2MzA$IK(N7V9V6Ne0)h7hBktO7;42~U*tXFD=@E6`$O z?&908$Xv+|FA-_DFIgYb@!oOI_qYi8L=XwHC0#oh8UZ2i>)%td zNK+9i2LTZ^uZEmHYo^{9%bVEyt&Nvfwoz8aX#+FGopIRY2GCn(*>F-IklgKODsant zKGK%%k@M^A>>j-Z&&TN^0@fQOerA&xh|#q>T^z@gRKP~F@Ke$%K9e#V{F)x(eerEd;t0oJBvHqE~DnuiE;t2m6A( zC1oRGcrH%Vd83HC9zrN+Qxq)z`hX%*uBDP!UNy17+D>j$uIz?{y0fjq9>|e)MO!-l z1oyR3@f4}|0Yl&DK(*OE$B^pJ?RU0fH9b2ua>9WPtGkEsw}XA0W?m`oL03{EZ1pP6 zBKp>qOt@ zGc-yb*!m_TX1s~&xVdiMo+ke3rqbKbUFp5P>D+o_3 zc*BE@$tWG-DQ%+7y-38(m=JK+Hk1Ul8E|PvG|)=aQ0*IS>~HvVo)+&h`#s^N6*O}= z7fmsGc|)t(j%_*-Om>m?G3{@m%%xT4h&ZO$tw=T1 zcaa?}t;Vw`+GXn}HPJU4{>nE>VM1f4Px%33Dz2J`?pn%K+)mu6-GZI(c2C~E8}aWU z^1Bd>84=1{q+KLy_I~`4^WFEi_Y!X~z2)*nZSX$zMxcqey_xqMO$QI@>0rSh)!!%a z(%}#WuwLcEtX+gnx6z_aj!egv^X`o~Y8BY}h4DI*edf3-=Ws2QQyZ z{kXDSP(g~)_B^QgQCWBUy^4T3jubSeQNJXhUD6L?Ku%qmMnQ_y9M7>P99Rc_9UN^r zZ|KQfGsd$FZPab+F|d&(|C99RGkznNxwzdnP#YiX_FT7wUxJ5?E4EM4A2Tl*07tA0kYY()KCu_I~W_CFuSfY}F;rOn1o)ZPAs%$LdYwAXhm zw~4MP%A9?^L-8(9Fkajje{D6_lodwYrz=C-kqHf%p04i$wMR6ZzeKOQtCqg8)x%@y zYopn+mBp&Iv8F-1euh`NLb2Dy60J!a5p7@eEgA{FUi2`bcg> zBJ|`-Rr*2t{KsIK(@@oUd9kZNR;R>D5 z%!(fuIO@VoBzdawfdiEpBawvl(*l#hE`#FO3?4onx}V1YICs7AZTfs(nw_)u3@{_y z;CPWVL9+Ytia#X%3h6|tiB%Z+X*TN8(#P0gjzUMyVDAl z#+fp$VN8vF_Py#z?m?oUf>vkk^Wh@F>6H5y&O5sY&*b>wub3p zk!_-cbsbdB$eG8v?A`Eu=pa%QOVHtE_>05NtBdae5_PCvhNN|at&Qj;vZop^hjGYQidE`%?b^R29-!zkyVMQ6 zqBvKT{F|uQ@DkgHAA<)Bxn5d|X`Q`$wJTCz)YPA%KZYtXvOk8kv%C9@o~X!Ao@LcN zzD3N5f!2nGz{!bxzfiF)G#w{l=nCk{?Cv(8<*EC2-mq5BU)(vbqg&^)x zERsS_yFSIZ%8Z%Ty~;|e-$)}wFVepv9BvRgrb~(zqA}X?JD`f)NFB8ZZz|P$NoqiX z$*e(IP~Jp_Idz{TW|G4jctOqkNt>0efd0fDX=*$)W}>ZkjA6y#Sf9j$Lhr^ z>cz(b2c+R^90T6&=9+7|;dy(ezx}ge1KU$XnUI{_Cy-;rP<43l6Lw8ZA2Ns{Nrlin zA6m!ylJ=Y6*Cf>P^l=SU9D#!p@2pwgD=IfY+YgkyNk)VYlk2f;U-i(Ym|IXdrOIhe z&5g2i1Hg#@3VAzys8Uh#sfsM9DL1_{0-C5@VYXef>AJrD)JG!v&){NMk4^ zFmNDKAM(y>no8}xh7)}ru~G2^u7fr{MW>(}|7`NGNS`-F6vtRJ|92+K$gFc}cXkSN zpsdT0Bu6p8l$0mPNyb7UIKQU$-daROpfpj*7GooAh-E3AW6k+}Ol9(FWwLY22Q_#0 zQG%2d9LBp2!|&$gklHsLxVeo=d4dpa9BdnUt~eG(x&1hwZ{9Lf5Ne0ITBQb(VGV+6 zA*9sSM!J+PLG`D-Za3DxS8dLAx*1;BO3_b~iuL2>zSH_G(R6#=UF^u>aK++~7$ymj z7>(7mTf?jd@t8T7@%$3K60(@39HrkdQ{-Cc`*WVhbQ5;itxs)^=S)xkn4g|Tykl+_ zeZTl1JKHK<^i7I&YyPQSe{!Tm1g3gLcm^XEb=eL5)7o(ih3r z-0Ec&3b(>}#-6Xoqi%>+jo-WmhM2u$1N-|CwND9L_It>*mFmr$nle3VBHNkrl-ZOtx&2KDf13navT(Nj4cw%>K-~VYb3DXp zD`a}7D>nlp8z%KN^EaV6t_)srrqFs?zvyIIFIm(BU~B7AkBatQuwRyZ8nP%6L}w=A zYkRo&d$!HHUBD@HzDzwN(qo{mOrO}F2FVC{`tLOEpC0O>VH(XpM?B3_bV8TQ09+S6 zPOMoLI7jG=uQd+P0&TTTkftVsCe{5>SoZ(A$zn>nd19|8 z3nrYWU;wN{MgqJE_$dVAF|z;ntw0$x!x|%s3FP9q z%Cx!4&@`$TG^!4E`3#yk%RCJT7YH_h;#wat1_ojc2%P zR5@r=6>*exW$8zj&B>O{!TfW$^D_0R%H~!ys>o0L>Wnoa^f=3YTC`n%&FnJgJkQnt z#`bV|cZfU9T{dT-Q3bm?S~Dq|`{th&R5jKDV)lV#s{3HDW~=c4;}+b>G+DF1cJqv~ z``GV}kmf#un1KPXnfgp9hePzFQNqVq$C>)yBx*4=`^3@VSbSJ@Snl9i4Ne_X%`63A zL0>Q&C196N6KKk5v>ciZ!UG%vc8Xc`&*sLI#STZiqG;fIxGIV^n>DFqJ+`*&ehB=u z{fK)d4Wvwz!2(+WQ+c1(c_W_CYO;B{A|ANZ?e?T<~CC%LiF*lQF7ia!P?I~}4 zczxH1DuN21Gll2F2FpI^f5X{9S*>Y(Q0yB22;hsyn)p~95olcxyE&RQ`}RjqV+H>% z5q6^mz{$l~7Kz{DRx2mw2p@{Ja=YK&yJieNcA0e_X%yjSfVx zm?DcRY8y1)LC6wWcupkH{(+^&qb`=nMy3f`Z&X@W+k%kh7J-;~8%A_B7Grq9%`CSq zu*Rzm&7X#KBBP1*VLwfi=B~w#J7719&3(Qu!PE^v&PcZ>U;GyC>j(F7sT6^pze`GxAmcB%fnlnAp2@1&oILaw^vZ=kSn}_pt 
z@}m2>EP^|C`=MT=>I3X%pe389^ANKKY*Y4{;_UGrA82%WJBZVe)=DR!fLnPY;e8q# znW|7cfCQk~8LGC4kc?MPeOUcAa3+|u)zCCwZ7zYaCU}lB$}_)V^r4#9wi12qmf^HV z>284)rO8EFaJHg0AD!qEQHi55p?RVap-`h;a+jHA2Y;Yo{sj;o>tDrjYV1IxiXfzk zEW*qdI8Y>7>C4f$d$yNmKzl+ps=QE-P*zJ@J+*^E^sVuA3HJ3glr!~7Ck9v^`5qQB zR@d3IxV^mx`xIXq(vD?lSfL;OGJQ06>(z0)w?iPo9xi;wTq)+`LKjjwXipga@!P9^ z{6?lJ^j9i}`5$HN6?oDmGCF-Q97#0+iH(OzVeDf83xra8b%nAWYjlle;hpSq_ha^8 zUm4V%C;I1bDi@-Q**f;BW?O)DV(?pGKSZ*5^B!u9`!AL^{|ADdIv|Nc_5WDwnJ|FebiF((Rcoq7r7jZfK&Inj zeOKxR@~g0kDyFDRm_8f4T`*t3l9+(2Oy%bDg(KP7iEKpeFQH8K~bKiPamAjS>dH?7O;BH8(2D$Z4Q zi0x^t!7xNXA7nR$a_YRN*=_G&3!sE7=h^>PU+|m*TErd~N~)&7^TTvp$OcvRZQePd zZEAFsUe|c$8$Z}A50iS(;?LvmVAKw|l})vQkUsPUYz%??y$8epx?^gDp5QM90(&op zAN9gN$D?riEXn1ei0PeNeRQ|LQ80#p(*vmB(3iiJZT25ZeEO5$zhgP^%K|YsD%q<0 z%3=Q_-A~JG)}jhXIvcI(`nFVkqtbAc+W*kfn4GgWAiV!p)9;j3{(rPI+yATJ0ka~a zZjJk&M=`bj&mLus5qeRb8S0v}EnAhjXP{yD^^5yI;{`P1e;Y5525StP5}9+YsJF0p zy8fP*$^~~>Hu@vM@;89#jjsXFc)|fOAyJsff{-<4a-1ER;G`1(S_GpH^E2^hj6(FA z7MS?gX7hjb691=Oynpp(aF#J*`bMip^P?eKVFkOqS%4KAR{Cc%-*skyn5m?J=BC?g zRM{d>J`nd9yWKl~N@fL2S$aB%8Q9gaX1^QO9s!dvj{+%fa6fti>px8A68`5s`~TkK zskjB>#h8m|CX)H;M4yP}s`f ZK59Gu%GLS1D1rj~)RncB>J+Ts{6FKc$3Fl7 literal 38430 zcmZ^~1zc27_cl5M0@5uaB}0iQDcv#%N*Q#Q14=V=Bhn>;RiQ6IAbfM+=Zl+! zz$-Pn6AXO20e}2d`zCPt-L#GXekXQTHGqRaq%_xG;NjVT3*bK`+?4g*UOBJb&U*W4A2#6j#(5I`IXEpe9zVW$Uhu#%sLs5ivQnq4(4l13erF4fM(h4OG@L%$ zcJayZQd%jmMQcJMnOuaO+=MFyZd=Q?KRbKEw_J zn}+n0$i|w)Nn)s|udjqg28JyV^xs8DN^cJNcTs=Fvj+*5JEc!%4#jcMiPUxUWl{PB zjPymdR_r7XIKY%)t9=jT)X!O$I`7=$7(;+iDW;!=|Uj#(O+~lp*Eyc=yh5<>> zXgI&U*{{ZBgqkPHVf%Y*TBa53lgWhMgj^M_uIaAr-1LP$Q)^jZLe@T35v%p;d7K0= zVvL6$s)i>ME^=<(#B;GTkBXa0o3s>}9tpjiokyG-OIV7O&%(syZReN}aD#;j&j_o2 zf+z%>ucdyDGJP`oswto1cncSoVR&g&PlEKO?G7ZHxs5upb`~k;f*t)z8%>=w#qHVj z_kCtDl90s6Z6VGbZMF=vzf_c$lx9|?PR7EH@^#OfCL}#QosJ)}0;?OSL%`ROKakAx zMla&>%}DgLQd9#mxbXCCOsK<&WDD*TUa=HSAe1E_M}G0+kM677CELRSh0E`YNL#~7QYUW)PK&}SC;JNe?bWcOaFJ+{ za;G$d(7>6A9+W>{g zc~sih(`;Y2!B1mDS2lh~oV`kr-D8tnu$&`Iy5Bw}&7W(n+B0pkUva?5XR>cI9(jA7 z8s}wJuB&vXZuh)t4%eWQF_{4q%U25=f;5=J)?vDKu(T`ePWmLVh}vE@Zm`eskS#xT zQmfCtZY~517f~ab8ef6wZewCGefG}=E4^X5tGM$kT16hku>zu=K9WBBv}``HjLCfI zh`{Q!hsW6VXa$SGN^6{S>W+M1Roc&Es|iOKZasS!M}Ci@hd~nNQJ6Ms1v^eN5G`#B z+(PMg;wrNFJklpIA*N1@mXl{9>62zb3&^Fby{kWlfwuH=gP>5P&EZT{W%RodLRw<|TcK<|@) zO7+}!q1`pW%;2MH-eSD_C65&`GsI1^T&v{%vWu7yrjVnd3*2$%g#*#$PX9~}v#0O# zbGyDx0+I8wCWrF0%=M-%BcOJ%IVnaTW zlm-C%81!;p6!-P30+-+&FL~$av`mJ@pKJtSAOZynw9c2MReCXbe0~qqYmg0|-Nmu- z5)MI1hrf>8LfFd*QcJK^qx-n4XwL+K49)3wU2yrbO$2f)og(C3Q<8cio-Jh}YtQVk zRC%8Glpq5IU%jgVf)0EJuvwL6?IsujXIov)3T1n5J2tt46K>MDwJYcl7znYzWS=DC zlU}5tI?g?Pd0t{&i;*gCJ@X+YFBOE%FBBx9;pPo_-eB6tdSl1vFQdV-dRLf-(Vb67cWGAD2`0U*Xkn3wRmYF%COLMs5SrivS@P?>z;#@ADXf^wa6ZR zdoFc%Q|ZO)K}meomR?DgA7Ht6$1JrsPGZs;@^v!)?4pQ@Th8MyDbdRBejq>n+#i19 zRr83bnrH`ytx+J}WuN!(wRxth7f0^J;1pB>VmK>vbJN^^U4J1y+fAg1Kjd zpBSvXwTb2M@^+^i4Dw{1U=OUuK1BLo?LGD{DPi{mO#hLj^a{g{<%Y}0doH>~o5f@j z_{~b{Z6J*BXCjZ=-~MD32u_l~+{uI&X2Ml295j%RMATeRTdt^v^`G?`j53y`oeXDp$wM0B&EHdEsd4 zhLT-I*tw!uobvlUFRYp}9X2Jh ztVA;blL`~>6JS2IltCZkGOlDQb-cWHTNV4WJW#UKNdi+FV7n|mkBZTa>5wbAt02$e z%9}{ZV`7Z4uCzSX=cKIXa^Mlo&f=I~eV}}ELC7Ng`VV9CCOCh!%IVw{!TY7v<0C1| z)jGm=78j6$f4oS~(cHL!%f6H_9davyJC?||tVzx%Ac*vyNP6*h1ni9qL;|n4_aZ zSL$5^UxCMP^_{r*UxSjz1~@g*I}-DVLW;L~;N?&+99wW_(ANOZW6Hh<&A5G&&~c8+ z-o7gfSmMQT%+WDJ{f<{XpJ2ZSk*r*p=kG7SEw0WhtETjP(c$C0dKR#O^hrjRim#&q zb;IBAcFGTR^#X7|1LeEF2~E;44&Y(tO`1&zWMVyqop0^d27+OaK7i^cXPEG0-is$u z)aM4yk!?S0K9|Fs?l?5u)AXI&c%1WnG@$Xl=OTJa6}jfq=p}B7nt-9H>T6gkLf2NK z{eNmkNcA#oJ*;#a#FZZ&Jr88FFnB0ThndmD%=qBm!)ejoW9LSqy`VFDE>*@y1wk|O 
zIQdMtjg(VD#{CPdu84GhUzSk0Kg{Fy&J1_eXyefhSvl54ZyzED7gVVIT)UMxlg2oF zhB8>gt2IMBV^UnocDkrsX8de|xPr~j^ZiD>%iv4!EhZ1V$t(ITm{kzVPo^_Hjm9NJ z#yj!8-Io_xcQ7+ns2b9};?g3b0NM&%OU`Ue^F{%MrkqLHK%kjop?~Ny!na-W*s(ql z86I?=A^!U~nZIrDY@)xF!7gYd$Kj{eP3bAmlt3Kh#jXtcQ@h%3RGBD}vpy?obs@ZN zw=rYsosUwO1SXw9Vcb4R}c`zW3JTC;n|BS6TK zMn_%G!(k=?Jf2DqHfJ6uFwPJKPjmfBQ7_)Vt%?cEaV+n_OK+L(}2gRJ+`0<^r2cxQ3Uk1uobu=ya}Z4lFO~^tmdnO1~rY%5fVc=LCaa zIIP1w($c=-;aWC^E-Sp@EyW#7Q@4AtAlMNBu8zhDCepV#Lgg}!7eC+M@NBP1k3D5n zy!;V1`75fe_su8*ArNN7;5bR$LVi_dvq(-&hGIxbH%Lxf0cSfMHhN$Q(Da+*!5RH^ zDU(0Eyi4;>EhQbo#p!Qc`upA7;oNWm_Dpnpd6Xn15GKBZ?8Cbf9Q3+de7eJ@4J5;? zhVt9b*wHxZ-glMx3;hrM3c|Br+_3GonF9yBf$fNwlVNZ9r4r;3i1s6y(q7uZu)C{< zPo8f?y2}UX(l&|u zTm|Vp?uP7G&lW;-9lGsW&25D z#+LkR#Lrt(Yegzn4eDug@`I)?%BJW28->mJ(2+gY(D$E^{hW-ugau*7}$=Fy>FN3HF;CO7}29^B)Id8*=D(SqId#J zEA{kVSlO!;r*jrl)=9z(76CSbbF=xt%cV>=C!s)82R)LmUdD@UR|hkLJ?Jahy9K^^ zBv60KN3?tEmRAXO_0YZZw^wUHKunba(&ggm!4XQMG$Y(Qi?HZXxs?~(#-Xi?Cw=WaWc=7=4cf{f)YuT)nfm_5G zyoTkjV#2ZYF3*3VFvpP*5vmoblOaeJR1N*=!wp;j#<)p;f40f6>5l$ly|Nw@nn!EM zsQ`RqgQFV6r-UjxclPW=+O(n2DTYe+2-uM{cAI+tiS%@TUjPiPVc)CNPfSIsn1E5UFt%rP+mF3fp;A

    uYea6)Rn$XMzfLKm@C(b^8sX7FDV?# zIR(5B)&$ht3{Uf?DKW~bsHm7soF*bCAGnBkP3j!B_iS3Uz<}Jzs<7>Gc&s zri9#r_dj%j#B#8M?}r|Guj!lMzbX!laWa;P)XJ1Cj&G$;*j{_a8nsN-79JdbQiwxP zFngf3qUQIMq5o#;lN|mxu2M!liW21fIrm5WW4eS!x=L0-kvawmqAt<$XY+)2!Sy;B zK`RQp=uBihn2mJ2Oc|;GY|rPCf&J%JCcUi+hy*IZw;N2YjI|L^)d>o&d1)Ff&E0s))`577!=tCxh^+kPQal&^)&v#hperwpKEKs zPg9mge0M>;(9uzT{hG_FGcMLcBEkLp=dU0&&704SN>~LG^y#{lC7@8t;f1BcAqvI! zQ&WbnuC9dcdj#gN<>h5ER2EWc+K`o-tIfq2j)Yo%BoU-e&>RtcG&HgNnI)_%N1!pC zyk{}ErX_{fSHu35~N0`@-UZT2ls5l*&!hY#*5tq`j2HgwSl>EUeDF!`3v_4 zQ|u94=C`tM63x{#*|}N1Vek0ufLn+Uh&v4V8l@VhjBbc^(x+ahUsd3hJcjGP>* zOkTX8{*$j|Ggf5KmnOpVj`|25JSJ-njvE(+3f-k%=l%Ka9%Psp|2{CxfxeX$$GC|n zia8F-!LFRPN3<+wd*p2k-@}`8rG37cgQPu+O2ieoFAbGZURYb zgKN0*IKLCAbL##8U5{_+$4sx&Y5Uf-t)0AOrR3|}DT>4Cmj@=HbQ|8Yrtau9K0-0A zt*!kbB}K&;As+Otz8=9S0t4;Nzz2SxHxjXW<@CD9;4ZMsD=TH5JEr{gyN|gw(%V`> zN$W1Iz=dBy*wx^#+=LRQa}jVyXZ~IRn$~}Yg7C`4(UNOy{wRJu@*&AKoP$R6_3PIU z2^f%nM+`6BoC_a_?teS7Ty*92}5D)R)p(Hc&P+3yvwS?T$7ABp5cE-o68 zlqR)#Aws6kxL?xXzo5{B)++sTB$}nA?*x_G^55{(&q7f?1!q0!qITbZ{vWF<{ zR2h@u)X2D2O;nVd8yw3d{_Az`E#Qv&wQjt}GjOd84^dIkN1~z$2?>fC8szcvR}$l9 zf&O!TsrrvNae1tQL00bM_W{?AxwQa*BGj@z<-RDTI$_>8X-%Zt+<++E?Gy z7q;tmwh#RG_V#81&bJ;IRz!3s^1xR<=?T04l~%TX^M+U2bK80#OSUgtK08gsM%W20 z!(X#Zb4$Q zUZRJGUp`Y)Q*tzF^z-gr`t|D)7B}Exd{YAC~dQ%n%_Jz@vab`jGjVRgiI{ zps-M=oR!{;hT9*u{)UFNM}6;^A3pqPS`!j(8qN0r)kdS z*r&aN(W>_UxtFr!50L$ANvH&r^I6|r-y-oFLN9=~+wT8^hYu;A9{VB6bokh#R3DME zkP?Yi^@0CDm?32<*Dk@(Mzq+)mj3w@l`?1TujqwAv6p!0cM8{5jK2mb{qd;8_ujpC&t6}Hz~Q@>|?cN&UJ z>ck#XcSOXzpvoo#cCi>wos;`tM1S_&Civ!UF6@;FM@bf$HL(U26clLsR-`{PuKuvO zxk)iyWL)#{<42mP=;){F>ek&KxqwJ08W*Cx=i(w*ag)N4OI9-Lui-TubIVPEBO=aH)ocd=gYn>3|DB1lkazFi zC1U{KtiRarwTSQu>gEr_&*)Z-wK-0hvQ^=L%;x>uE+9T{XMTFz4klMjy61bqRC<26 zyA5A)T5`T2ST2j0Yjd{D6dP>pokGrqrD08l)^YV{Prhv7XPqPbs7eD^p>!qN@)IBf_-kGc|SgBi^Qr8 zB0Kn0OUKNh97g{8H~K@>HFDTb3OhEQ!tTn*n!_=u*8sc#upU}pFO$q~(j$93^JnAo ziD7m&Q5%E{p;N=* zEnw#_j;fj*CUp%Bdr+Cuk@4}EqiH9bq0h?dSu$ySCG{VFyvjcs?Odm&mSo%CArLFD zAmX``{@uy`ufqg{MsZ;ioIzyoigF+nEdPqE3!Xo;V)?Kx&x*H6wgga}GZ{+yQ$BXb zp9L1ShL%oC^pU)PL)4%BEd=<3+j+w=+SjRWI$0nvADdrMku*F!%-TM#b#Qq)_(6y& z1-V=M%v@NM=T6r%9XXmv`UT@KIm!vvgZkmXhZC{L@V+7%eF{Ih7=ReAV`a3H&Nac@ z!l~MNdR8#m`;CNYjq+?HZAu#chC(T9Cbf5?0F%C_UW+Ke71= zl^RR~VyIvlMApz%zFQ#BrpM1s@UtCr4s}I)mK33eW4`z{JxgUk8*6 z;E$&pFggo3$HGyee1z1DGmuZfIIr9Vu1B%NC`m4XZ`(*q7D$>ygt&^1!Pp&(viecG zaUxk$QUpE9Dk|beG?ewNY1GwVnbc1iZioYP+3(&sA2P}z1z<0h>Uj)6I*at~R~vGA zA&?pYReLL@!7E8Ot@##+v^KhHn4Q&5${S*5@7CwLoJ-LQh@fAvq zc6A`d(3m6}wMblZng2i=uOvHijh$E9me+?m@*84rdCKC>TZ$*rG}P;a*MrJFPs-@I z1^oT`M}SPO_GfegL@iCq{l55&vr?Ud-v37EV$G9;?s?*Y)>W#Yl0f#Q_+KL=egn8E zKY+1|>-pK=ii%=H?Z!$i+eu7)*1w8v-C<&?*{ZDeIm=jI~SloZ1^!_!Lulx4J92~fT{%&miC@Jaeo2oFX zapeLoItGTs)YPtzTx$B2_6A<^NCn}F2tqvT@dCi4Tow(KsQ__(k+ktpx#{=c+vRQo zf{aixhpD#r#OT2)Adi33%BDcRBOv`=T_>T7Y`I(#8uHg}X?AGD*0tQKgs#NIKQ;JxTEy z$g|SnRiwTL1j03tE0{Eq?$uYO&t z`fF-VLy&o>efSBm=W`3DqxhVk$e;@)z!J|@RaNiu@kK9n#$(%PwRR6n zsxHa?XD}9Lg9S-siDtdejAfo~2p+juAEHo!)?PDhh%DgRJ-XLWtNK4f?OCHqorlYI z4My0igKQO**&E7W+VwTXbm~_FfOVPDo-}N116x&dsL#sLsWRTedZyl7uktmwdgJM^ ze}D1dJ+E7ey{7^Yu)arDj4v<8uZAmF9*uRA(9}^OWejfLy3WbcD`u?2V z>P-=F@-{X!eD>zen|Lc``$bot!TCkonV&0X%9f$mio?gYDEV`WVqR_S9gz)>JuZJf z4vtt7@psji3nobbUjJh+toQ8!a??unnyTx+Y);9L@#9CNQE||6$XOkAB>go70mcdx zC?)y%{vPCoFeg`VEPr;=m+72Fv8e@sKmFr!xz{EEs3&#g$D$Fa*rOF%71~WbfmGE8 z|3XIzfDS#|o1Y_~PYo@Dsvgey{_-?&3jx^GKL$q{D)|GjY{GXT5eiQFLG!&*EdVe7 z^HJcvH|hA?kQbk_(tFVF{U!>pQ91aUe3{-uX)NU=e`ZLy*Q5T_+$iTcU@oD@*N`JX zM~T&a?B|>9(+uU53d`f%t)xdb|9qZr%sD*wj}?Wwv<`#4eCLL)cRoZ%_br|s({HM9 z)uMkBCOuVprF!#r1|PYo!hd@KM)A3ab3cEE03nGgGAK_KvC*vC`rk!mtoi#gK0dDC 
zQJxDtdr$A{6>7GtNQ0;o^l!06d=cc)0A^^i`yahHy?xZt@#H#yv%D6G_p#VCX;1zX z6Awl}A4HlbmD$)DLN@He7sISA>&)UkBmgZ62=s`9;cAOF_PSv1kTyvSqV?<${nl4U zM@KzF?4kX+>HmZ}6G!sTSMs>EKw+~}AhA5Qt1^_V{ zNleQPdTwfpI#~Y$)-N>&m&5wsVzJAKT6pNo0{+@H2y7s$pT%>A+I3FaV>1W1R=fd; z3H-daq(yh`I-CfSNEc;1swdWneGvx~c>aMw!y`w1sl(W~Euy#=n*u%T!zmtfuB1F; zp~qQTPZN2B_A}--OEr|03B0|%+uFUyoq7RHD`Z6o8jw9U4&U*=I>mJN_FCH#;dd`JFjN5E z&;%q8z|rocoTFYIFbG0=hPuoL4gPPg_ZwtCQ=Ra-H`xW-FOJW095gyIGB`2M2FCrieGmxm5h{Xn_k|1T#2-3ZRBAu56tgBEG(z0LoinO43-^JM>l+*kG=#LM4! z3W(GR>;F&k(f4T6;_Bk`n)-QTS6#ZfQcAt{7L&KWe*L;RQ2d3!VcT+d zV$FUJACNoWng!&6+4)4NMJ57gnoFM=MkwWFliE?HradGKAh{7Foe6hvoVS4K;fdo> z+=HY1bH4a*|CDR~8x_*W&wK~ym*+#$9 z{p3zy9!bf`M-m;j(N8Ey+88H)L~4(YG%#Dweoi{O`_%z@3-E^2@xN01-N0P+Z*6S8L)& z&OOwpIQb@$S>i8)xr%4&P{YE+&xnIyX9`8cKi2Aj8U?Ebdse-LKink+jw?YJUov6&T%|Z$7Oj$T>zCB)+S$vw#SxTkFH#c=zOH@MlEi#w zdE7J@A4m!8spLAk0ZGAJ7*hXgVOQ+?G(hNH8T&!s z!{Z7H?te^7{KE2#3GkFw0H-GO{987__~c4->-jA{Y(PoX)!e&qyqMBwGf3c6-oz4l zZiB%F;`}mZ!|fO)^Ke|o)a&wR5K8ya6Y6!DGDk|DJ9nN&QNH&*7VcWXU|s=QQg4-$ z!mYczdl`dKk&gNB`BO-kjSoh}Hy&aBcOuQ)r4*}^>ij~2k^^Ef=m>_vT7%1jhLA4v z$*rmo{G& zG~T}{4ZAfY65WD$gknx<^)IaaGxDf9+%$sB?NW%yee^vw`dI|ik+zB?z`KXPm4}Fk?ryq0@uI_15>hn7PggY8-ki%|K8j~2`y!ah&CEcw9Cp{_oYUkww zr?>gE{A7O+LSt|#Oyu4UC~-U&pybAR)NaiBO!MZ4Pw7aYps*7E=e!5^!q>O|@3081Ci)m8Kx?rU5yTUZzWNQ-A2@QjrYJ<;EMU4!9(_~)Cg z<`kz(2#3T1KpW})pApKdrUZ~rG()Oy^yAa6hd-t_&j)b?jlwGLw(PmB^+*zoh>GnE zrjH#zqDuhh;Z}02&Mk1?_(fUdWkRCxJHP*Q+FZ(k=20Ft$&3)RiX=|>wz0jtoqdDj zn-7rkhf6AIek~yi2^(!;<-z%Yp0#Z;qDD(%U{HyUTd$tj2y+D9JZ|`vX@`2y()XD@ zD&im^AT5N#A$;1Vwe4l~2@wkgCQT56j{%az7xi8yn&!Jeu%EqV{rhf#v3pb62M>er zj7ySd=-CeKAiUY8ZIjg%bZh@fAIyW*nZX%2-o;^F;s(CgzI6m|clbc&)kD6B;0ShjjjJU?9;9sr^#w z>EMP>n~+#h!)yst3-+4_nxDow*j{#`Md6R;F3$~<8-Hb{5g1%E=$0a2vrjlB&I<)E z{hbGBPX72V;2*bE!>1oT?9t$0HwV%WXV_scuxT6@8A3afFV#E@A#{CMG5bKqCx^Y7 z(g}*Z%MfN%&FQ7+0svgj-3LQ5FDny)LhHZrOK__>o0cahBwdV@YVx;TU40+ljVr5p zb3Jb5;hfA!Fm)OLF-MOE@O$szGk&QD%1mdm&k6K@x(n9y^mU|7UdYA#c%`;V?|cBo z$K9Kw?-zvTIUVdUC#93K1q_~Jui}wq_zvkYYzeu5kenE6!akRN^7<>NbfeTPD9yXU zd>W8d2Rykj>s63lpS@`5oe3gNd@kf9GGLyGI$H3(WT+A^YjTHwi9IGtvbHL;3-o6P zPQ}Qv&{MbI8NJY~<a;ctJb5ci$c(6)c zp`r=$Tc=(-Tk}}Q#hE%ohCgdiroy-Lfxg5Bq~N?C?v!G_6;*RAi4lA9s7JRirf`+6 zv|#aW#-kwVjt&pM~UKToycDf`)c)L!s8p-FAMLV!q%6sW66S!RPts}4Gz44qtG&@ zToO1~HJT?^yy0c1CHe>Zo326iK~$Q|NY7f4wa1O)pjZUOUU#WKmwJPY4^SwG7LzrH z9eE8d;o|HiOvE2>L}Ga;F(rk3KkRIy0KQAZUG^~qav*Q1(_z)=l(Zf-e1aJPIyav2SsZg_-s}+&^6>hmpOB9E!@6!a1y0TFifkq8kbw} zEiQj-D-sZqG;%_TA)woaH!*XdjO<`{N_@0nP9%qjewER6ktXV=3>~KcUnqe3Z|q%n z>h%RFJyMir%hm|EArApq;BT|L9Bv_Qy`7fU2f(L_G#xc@c51ic!&N`#C+Z=cSNSV9 zYf}vtanz-cHf8y+iv1{?WLdv z*V{(!LAfFQkAJ?ig?WUeZ4sgI#8M~aMC#=B7u@?l!AU1v)+7Y#8|1G+OrYJ8;;m3c zR|e(LU$$-;Lq$ao61!`5)fc%Ulazo^fVuij;}zM0AB}DVm_sfVzDt^$-B`tkXTm2@ zHMz()NCVhWd&HA*Wd;pKML-CunGIPoL+};)R&o*_Tts;eWZF2w_Pf-_wyuvsLM){y zSI9MFy(w;=V3Byhsl+hl`};^aGUF}8^zg+yxT6Tr%*T;HQY8Jql8;t>O*p=&`!!zq z@T|HHJ{=BkWgKtL2wXz+vDp78SkO07s`x^aupPNBqn(=htUXv)4aM_X0sZ5eoU`53 z3F4B#OvS8VJLPx|{8eS)qceDa?_``z+ev?B0aAsg`f5%h;#5+Vu}8UX8!72!=jmR- z@!#<>dMXR7c;-mS>f7IW6ot{Me4$N^0OgXF#?^i}b2uK(OGxW`&~;6s)*GcEWe>UG zgHJz(BQFF#)?vIgf0L&Zantw$Yo=exdoe!IR@|Dq8Z2tdnyQ)vrEYk!+tmxk|mwvkHzZiXc zdolG7J3qKmhH23RLc@FQQ^rWilyyY!ZlG{T*71hpv6@W`go_>NZy2M*32S$ykb3%3 z%A*65>SXu5(2Hwo2q-YN`LF^9G5F?oI{n>DG=J(L_J97ohb&vo9t^ehgD@v z^7KC#=}$lZ8nb+ZKT{}5rN`qX{Ba^$+498$Ope&9NpG0~H-p|~a9GE;>k|nRx(#nz zYnX84o{NJo64a4N3M#?>io`|RLK`ZqO~In>u@_(b^eU97 z6S5-LncY6-k=^GuZmn@CDcgz-RFS?zybO?^Pi=v7x3-=` z^ENEyDCt-8Lbt(8Lgd3?t>5w?Y)SuYd$vl1s&R~65-D$o9VMqZ2$ws#4Q~4b4+z+6 zdLGTS-_{rQBr9>0ZVQt&JTu@VJd_|(#C0h(cf>F^XK%jTKbFC->p9)M3u;t3}NZA)I2kny1Tbh$N|4!2& 
z9^%J85y%-BQ)svtk)gr+)-D0Hd=>9XbxVQ&c z&6|-i@v?JOhv`@@2IL-oshx%yvmsP!Xy|@Kf8k{tkt}M{+zcC}0raS%YNX70cekmw z8cF@wZYEH-aEQxBl5)q-JC$tBLqm$UBicTiF5g{K@xC`v{myUut^SP-)?isX4i{DG z_0ZrC;T@D(L;#IBfia21qyiX~bks$wGdV^so^#kF+8~)TT(73?FuOBN}|ex6?SD#pEC{CbU{dFtZ#t}>i4X~TW;&sR-@ile|_Ni9B6b2OdA^j$AdSn zPr#hgCxHhc465V){5R1E0P6v!N8JDnH1Er9u`789b7_CygM<54b%xgCFizNR|-%y=ggj zHB>@u`;E=*#k`l_Nr;ETB6F5&An*>gfMV)N<1^}BNVsA*Wi7- zVWKKRRWI|UAYITXJ<&pufR{ky?w@;40Q1bAP%|n>^ReU6p~Caj_1VglEtGFW#Cw1; zrGC){__E*lc$a^0tBMCK@CVvT7+xnC#E8zEW4hcvIk7%j<9IG8P-GDuaf2y6xeKTm z)*HG|p)yI!i^KGAAkrp0MvgBmSrKFkHS3XpWBm&7ls=A&(>ieMskaFAbQ}r6KA>K2 zyfO;{n}6`Uzg%T-_q$r$*vMYW=qJRB)Yg~__UAPul?^u}Y`=#-8~`e+33B53f*GA- z+uwe4Q;XxGcc-!!LbIC}jXtxa1Gj;7j0~mKmwm~^BQqc7+FcDnN944KKo}5RcW>bn zqvwmaqM#~K3rDc>Z$Dwczz!=dnndZ*^n?%f`zoF~z45vp@izXvmjj^4H_?t~JU z9r532@baYt@^i5L-!=8WOeW`~E}XBu#9y1%^)UfG3I4@62J>}}7Gas)kBncQYLGe4 z@m&>I5n$_U81yb!HMLu#oDR-zLm(`?C=Cui(O?~VqI!lfji4VXt$e^k6!=5rnoz>L zH($a*yQ`>HmPbA%Rci+Aq|{N*terP-1p29YJnh(G#xLG^_6(!H4|nWu%T3miT%MVk zw=NBlK8F(4S4&tOrRDRmqPMZuzaj%P0-yA;G64yXX$`PeGa zu4!oGxCMaymB}swZedmO0=&MyY5;m)PgDMG;OR*HV~+lfCjQ`vdI0ga9!q`jad8PK zH;hUP#L2a1+`uggEI_4cSCoMx3qw=x--jCZQ`3q8yfw%=UihgQgdw%-uIP6X!Jy-; zbgjoov_TBV#Ob)ws;$=vcJMjF={(dLsTM^kSm%6CpwYyAKP=P?TP#;{=HP}$n*iQO z61pNOnx?3nyT?9hL}xqOnjXhK+(~ExPR>m3QhxPhziQkA=Bm-x}Di+?h5_*#6B{+RB&$B;Z6mM55vV1F4cd8td*=6gTqYqu8k{?-nZ-6Gd#=XxzC;g%3fUl=0>0M($ID`Wuwfsl1a)SB;5py})n?Cr*h8ho}YO6%$|WaXGWhNGn0} zGEn!9R)c>thS?GV(op3X2}IFjYltuv7tf8UR@;IOGvWpM9*da88HnF{5bGyr2 z)p%N|@b?7It$?C?Y2YjCceIR9LI?2JX1Yjp*Hf%6l#G`jlyb@Zma=sTM>`3`fP%Dd zi{Oz?w)yNfhl6+=g78mCP)Y#7=+?n-bc9FyHy!fah0G?``la{*du2GK@bd%^xZMXn0(I zxOn;A;FJ3ldd_QY>~ERui1mZ|+SFZNxiU&yKj1RpB2#0z6{f{WlR@YYCfv>aLt2Ba zd3dqASLNkwOaU9K9C}yc188ZAz?*m9)Gh zc+|6*aa<8qtr~t0g5PJrRdcahRy7fLaHD3O{T4f^*s-A##YMU=fJy8bk7~GecMSS} zY4mypbYitf8%0bw9AZo3n{a{VUA4JYH><9)c53D+LgEjt~Ajj`zu@cEi9 z?EQ?#j>iUaiOPM#b|5;>JDTDrIbh+9RQ$q0WM`7*{nusd)J<(iUwfrB;yqBPO$$N` z*^49kRLH>uz=|qHomtZ=aVa!*i;TGEHkh=L%;^$eOrGi5*N|yh;_m@QbbV4fk_V0H99J@hXrUuIJ~>44AOH3vJU1(Ir+oj#J+Kf&i`3zVcS=gHFc^R5#H z44)?cmX$5-j5;6yJ#aeyF>o&)e$L4gxif!pKzCN2sOa>X*p?j=K$Ectk zbV`Hg;6}Yyxubu2xal<^vA%j4?+W=Amlh@cN7iS&R%2z{+h5Iib;N@FFzq|f&&FR~ z)o%~nZ?jea9z$B5E#JQ_FO$h8yLT&g>+l^HWw6>6+onJ5)z6h&Wx{Bd@Hp%oWh3hm zq`=>r)9`i3t$X#DE-*iv-YJbYo9`nqeZAc}v+Mvxz56OeW3 zJLbH_p|@8j9?^e2k^j&ePF|dyM7uagMJ(G)NM7Hziy&Ks9`+xjay9Y{7cUL3Y?hfl|2mQ8$`JH}_ z)Mk}I0W7v^PS9WC!I{~_5v1tKKcnC7sCWT2zGK~w7PW{k6F7bdVPVP~z4WHY1n|%q z)n*}cwVHLr0LHuRPlDK2^jc3T^)#7S0-6z*-gR?)|1Q;WHaogi>Ait8GCn`~%}Mm} zw6#6Z`kAHJ1=g23W>{K4x@p&s)#q5r?=?#A-n3E2Icor0Rg*{EYWUMWpk0d{A!2bG zP7>~V{)ef?(nGGBICzlU#-FNZC!EJID3SpnN=UvBa(fdz-<=A0ME| z(_^hhHD4|$?xf#QCF69(D0>RqofMUB8bPI!KT^#XMspu^H!^53b$@Z%Tm{HNs)F4%WrKaNKrK6EMQxiWySRw zM~y)LW`BdZ9&VNE>co;@od1Di348qp&u*{C0-ISF*RgZ;_RXr|`cJjoPN8jQ-+Thf zRvG+hFp(X!7u&IOd#&YxsxNaAGkBv^OO{`M-5ph?^&a5OoDt6t2Wpsz_d$DW8+B6r z$YIy{mYtT??2uo+^GTG4Ph}v&r7L+nF1k%hYoDenbT(okws0rp!RqMn0WPY@r!H!{0c?ZDxAvC)tdJ z@BAQZIHtIt^ZhR52yK@xl%6%3B-;J0n%7LMxSKoU*Sf4wdt36i4zw==l{OvI-G*aE z^S|epdyu!5VCDk2*|@O?g_E}`i~@5kgE`GSJ;zuHGEz}Zq%|b&@pWJoj)|(JHy!Qi zD`7S91f(pVk|UI4{aGe)!(F%TltqL*Yc6$UEs{*`?4{-S@U$dx0y<0Dt=7KUH_ILI zxesWI+j`v0)_Ug{*v7VvFK%Nuwrw?5W7{?w+ji3=jcwbuZQYZ5*Ug_Kf8O=3XXcsNv-h5J zR2D6XQnb_h@xR*`<;chvIXbqZGHB?t9Dym&mYd4}sdC_3~uIExX&%hT>De2@|( zB8?&B47Z~`?o_r+AhbZj)A@<~Cy4=(XNw=3Cp?&UKX{wgdPXh%fi<{QZL`S#?+U|< z7s6xQZ?(hxx=yop3f26Sb54l~MvD?jb@BN4b%6aM{Qj`vS3F1?{wIB;Jl(;)e~qK( z;_9cz?`aS{M6f`t10vAonMxxIq+gi7+FCwUO8)Xw^;$u~H(X zIo=BJZ?%!Y90!mb`Q`ZkN+BkJ>5&7@;^fmE=89$aeOxZF6TN_p=h@VjD~au;^KDI^ 
z`D%BVxY>LCC;7LcHWjUIv;uAedIJ%65vJC|Gpywt=jVo3_eZ1crC{2eznK~}8XbHe zY`bbvyIEwH3c1ZO1?iCRSqVslp<<$L`nD8iCQ1|0$PZ^o;Mo}JJu@zU-?nHqYNv3x zR1p5C)8MSHC(IAtl+NNWXE$W^mte7c_ga|m$M%y!GTIEfLEyZRJwBsKoJV@dJ-E zoAqyZ4nBOhEXu+qH+He1>(&F6#qidq2p`5tptBjU!**qnmQf7aR6 z8yHZhiu?TEGdxxa{eb9x&uj07nambW-G0_pKW4k#Pt&`J=G{K1Sp*`m(BbBuLZ>?l0i*?*Wf355# zpJSY6m6-M(BJE&0hcbc$eIWq>+Zko+0s$jdp&4;?9d#Pj9kWYu>=Jq>=L;$Il4>W% z~F|EDS2M$v<%1kCBdRXYNu=U8Z_G-<{&mK;fKTvJBGPcg=1y6;npQ zO7lB10e71>2%GRi$eB5?Wp37dGE*=k9u%rKiAu6a!rCfsWywz^LZ8*IJ4~`dw6yGb zl$TtJ0*;?QvAespr>8aoMjR8>zh`lgdD%H~CY9580<+dFw?gDUSSSa7PrvUS{b=Fj zcu#mZn@fJPM0T|mWPe}3Jyam0>SK?^?)nzSn&ngw6R_g+Q1*mx3jPO~hRO4mEO**=QZbn#IcqqDGKKJ@psLCzZl)U@VLTS`BN#xQ!_@8gnjjC)KZ{P$U%M|y zK6uz(@MgZv0QF;E1QfnwXBX z_R1Y4OLamN3FUm`7|ciB&cZ$e=h$i=h_9A5TnvWnfB3L{Vo10Q9{$Js{jqM!CDj@dLEyi583;b1zq;?bC_x+) zXv3!h0VFV%K;>Z1rHa$@l}aXoQLb?vZ`1)rhwxWDRKM2k@tnguDW$`kn9KEaf1JP< zDIv7&$#(5uM8l2pf(+r=yaBg?Xb90#%hK-8Ply~^+(SPg7})dxJaYa<@sj-^vuu)* za|Gdo#d@^t)6=HyzS6p@LaQ?iER!kG1?zpJK+A8ew5FB+IWe7IKwTsi@{DZcHRTra zgxGvAqAX}a7`U13v&oL?;U^^(#WY1_cjQD!1GN)?Sw#Nwz2}>}-FJhtj z4demN^95*3Ky?Q`ppG+K^GM-uDCZp7S7#+ggYc(nqC3-VW81WcH@NmH&3jl?` z`g6~G+AiDBU_O~uK!4BLKoz5$;fC>332@9Q~(cRnVDk_@<4L1 zU0?5<#p%RMYTIrzy8osCaYkDa-kDWmWkbT1fnqnc+B@CVOVCUT&&eWY=`pp_PFb~6 zV!L>~d)g5}GD~#0+E*;a_%pEn)MZw?kD2;JbeA!EbPbj`pJZRS1rvIMRL5`2519@| zE^&0ck%0O^hI7G@sZgRY`ZzDtI)E0MKJapP-}d*BLO04g9_O%3=E;rWcEDwKIDXNl z-BH!Y8)d>OXz;;L{PdCyTZWSL^-=lyv_|D}4H62xJe#+Um>cSKx}XjY395@cRl~>Q z2Ql$GPMOF^TgUCc`qL~ryhVhfqB(L(rHs_DgD%j&hZEo=kJke>S14y8$h|IG0)^%H z>f9PfI2sTdEK~URU#GT7@ClWq)Bp7mRioAt6iN~0Cpa&iSy7kU{o;>n`uCKtFa)2F zzmxq7ka^lr&v>*BQ_R6u+H}y==?dLep#mmU3t0U?Bv5kp>FI}b#%7hrG}L11D$gPw zjjOAd?#?4+t!UZb4eL&3`v0Ug1R6?u^)2raE^|LIUJEl2pMS&yAms(y+eBX1UP=Y)*)Lk|9$wQIRWs%&v;K5$>2U#8`( z^&^86Ke}Acfdt3ivRoYE^h`Lqy-~Pa4K=;gWrapEoH37ETGHomqDC?r)aQ1>VQxK@ zxzRR%1P&Wr@|)C719(lmTG~KRE^OP>WpHQWW?KVklI=rr{c~EL)9v5RYf%>Dp5NSE zKVXP-77x^_0)|D_uXj~0SAsnjZT^^bNVd7uajf~G&YA-Ee-gp3VKVi;jVYBt$m2`&i!n@7*N-9L2a z{;)N^{Dk!^geEgg)w7g*u`4>?% z0X-v?+Hl4AnYIx43Jt+_9#F#3HOXQy#lm>NRv&$9k?O?t&F3R{#=-I?$6wry~$K%P4934U9esCGC zH!(E6izu36Vugq-{7r&@uj;Kze>#fd>fm{x;4gghtLjN^R&;Cz#>RRa%{R9#uuomb z-c<6Lkq?7@_Jfk~UMPJq5UdBFTiUDefLHZ26SWtQqd$K{ zx*II9)1M%wPUC?V&S4DW(i>{T;WnlE=t6-_!D2Q_a7W3j6pq0iOt`<|FaQ;^znp4z zUH@n~ZLcSilMzYDr;A~H4SQ5CQejip{$BLEe@R0Hbz5Q&Yq0U5t19)WFtPBe#{2WX zD(2|bmbC##rpfI}X8B=yfvSn2Tvr#A^UFS0+J&j16udZrE~?ctb8 zLaT|jvF)~A90+hhBtaR3?4iE6a0o7l#vyMyjel58h6EN!@>ZS!d-=u->m5kyr{BLj zB9){t?`8dJs%Q{^Zi@gj~@p720Ss! z#rlMV)@er;gBlP^Y!dWRfM;geJX#9`>>S+PY>E5M?tHrRw&L^-!QGXr{XO&PI-YWA zw4@r7qZ|ss&(oL1vcR-kKlk8Bajw;+0Jsf5J-9$drWC$FefUiq?|oc6vXm?mSYcMRl2Gc>1W?klk$=Y2oD~_zS*y1uy1fN>Gp#}xW=FP@A1QJ^#|XYOY-<6 z0>xMbEILO3NCgr4WB$Y&ZV%<(p(lDJMfL87%5$rsB>^u%9x$-@>UbHrw7#dUYb*{? 
zbkorlCk{~;%#_(~**iPxtZa+#q?2S=Y|TD7-VK&3mry%}CI4_qKft6KtTc)?>L!<3 zCA*Y9pUAO~hSn5PPwOW8J7(yjpH+)$RxhK=XgD^_tz)CJrxc_AaZxt}Z-IN$3s zA@snHZLR)RJK6PFnaeRTG%GTTbp#c;mn>;NP}a=yjsNNbcYJHC4e?$qYpoE#%a*uK zp7`GBg?s>V?#nBL<*%$MxbyIWwk|&<;@Y%icpu$>9vl`f3mF8V1|Ay-OBd)u`}PJG@5p;^dc0j8 z^YHR;;69$8o^CuEkMhqsG0^4c<5lJ1ay3mMcH0mgb@JlqeG7Vq#5kyA=hyM9u)W<{pC@J z{*fil62SQ04)dD!B4lQb0cjs1d%ND`m7nlT%WbU#%i5WU`t^EhBYHV3`iyLVQKERp#Wo} ziSm`P@^Mse3>aoUEH0Rg2vfF6KwPK}qXohd9#OsZ$~J8X%&Hu1tstWJ-l?3BpUUbs z@m;^Eh`@+cRk4|Zrwk4>{l$1H#0c5=&Jw6*dbb)x9lxecFr-KG^ylGT(~Fx+uMKzE zXlu}8mAS|+IPh9HcHCRWMxH4Ya|QKW6>1Vt4}cFpLw|3$<@0a@oU#;y1)_Y#KW9sv zN+Wvha6qgk%9mq7Wws(v;n*6xOPC0tKFLFwkzWj(zco%#3a$k?f&23&Y#>Zh{dqi2 z9U$LWTf2Yc1(^frOSc%If~429bBu19G5M%a)LHrB5bko=mA($u1w;yBt4I>aO#5{a zYP@(mjI>4`$`+bUS%>0w4{q@J9;yx~zEQc3(Ld_(@!oUtpuj`hE}=NdOrx=HwIFK5 znzn^Nq*20}w!A>(jZq1RhX^r7I_c*@Y*fS=$;?KoR|LX^^ei}s+_*3CLJ$worfK@X z-E@PMq;)yxF&=^73VrM(qJPANC&x(%Q@%iL!EYLm*OVKKE40awmEWpKyJG=i0ufOW zF&d^QeXRyL)o5@kkP>N;3Sn-`4$}MU4ckAbzOnVUBFF!&z~DJ_4pel z9CJ4@`KSjdBa&7+j0i3xW#tRPg$%#mK{ENSS|BpS62e6LUmXz(3lnO{ml9g1agwQj zS%iO#on<3x2n-JKUf9(@ok=Q8)-s{a9HI01s9F5gpm(X0G3dSaI43R)ff+LQcpe*aZTGzATk4IC8~xQ1^!FS^14?tmV4Th7i2627R8JD}&z-2waZVj=CrOH?vp0Oj&uMbY_^lN|5 z)}Rj6AW5}}HMIUS)@XJvmqyq*2K7^U^{)Ixdp9eP?2qILiDs%uD#gK^SP>q+6s7L8 tTtoT_o&N*cd|1c7cB!+KMB~wm5LT2{{!^Nmbm}` From b96e02110569e1bffc7a91bc3971810e27344667 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 1 Mar 2020 09:19:15 -0500 Subject: [PATCH 266/269] updated remaining comment about property --- sklearn/linear_model/_glm/glm.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index 4a44e4a1baa58..8607d6a1828ab 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -599,10 +599,9 @@ def __init__(self, *, power=0.0, alpha=1.0, fit_intercept=True, @property def family(self): - # We use a property with a setter, since the GLM solver relies - # on self.family attribute, but we can't set it in __init__ according - # to scikit-learn API constraints. This also ensures that self.power - # and self.family.power are identical by construction. + # We use a property with a setter to make sure that the family is + # always a Tweedie distribution, and that self.power and + # self.family.power are identical by construction. 
From 293214c6dc78483e3802df8a9ac7b8ce5cede626 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Sun, 1 Mar 2020 10:54:00 -0500
Subject: [PATCH 267/269] Compare perfs of models side by side in example

---
 ...lot_tweedie_regression_insurance_claims.py | 68 +++++++++----------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py
index 61faf7c2225fb..5235cd3287731 100644
--- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py
+++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py
@@ -143,14 +143,9 @@ def score_estimator(
     tweedie_powers=None,
 ):
     """Evaluate an estimator on train and test sets with different metrics"""
-    if isinstance(estimator, tuple):
-        model_name = " * ".join(e.__class__.__name__ for e in estimator)
-    else:
-        model_name = estimator.__class__.__name__
-
-    print("\nEvaluation of {} of target {} ".format(model_name, target))
     metrics = [
-        ("D² explained", None),
+        ("D² explained", None),  # Use default scorer if it exists
         ("mean abs. error", mean_absolute_error),
         ("mean squared error", mean_squared_error),
     ]
@@ -280,6 +275,7 @@ def score_estimator(
     target="Frequency",
     weights="Exposure",
 )
+print("Evaluation of PoissonRegressor on target Frequency")
 print(scores)
 
 ##############################################################################
@@ -377,6 +373,7 @@ def score_estimator(
     target="AvgClaimAmount",
     weights="ClaimNb",
 )
+print("Evaluation of GammaRegressor on target AvgClaimAmount")
 print(scores)
 
 ##############################################################################
@@ -430,25 +427,37 @@ def score_estimator(
 # Overall, the drivers age (``DrivAge``) has a weak impact on the claim
 # severity, both in observed and predicted data.
 #
-# Pure Premium Modeling via a Product of Frequency and Severity
-# -------------------------------------------------------------
+# Pure Premium Modeling via a Product Model vs single TweedieRegressor
+# --------------------------------------------------------------------
 # As mentioned in the introduction, the total claim amount per unit of
 # exposure can be modeled as the product of the prediction of the
 # frequency model by the prediction of the severity model.
 #
-# To quantify the aggregate performance of this product model, one can compute
+# Alternatively, one can directly model the total loss with a unique
+# Compound Poisson Gamma generalized linear model (with a log link function).
+# This model is a special case of the Tweedie GLM with a "power" parameter
+# :math:`p \in (1, 2)`. Here, we fix a priori the `power` parameter of the
+# Tweedie model to some arbitrary value (1.9) in the valid range. Ideally one
+# would select this value via grid-search by minimizing the negative
+# log-likelihood of the Tweedie model, but unfortunately the current
+# implementation does not allow for this (yet).
+#
+# We will compare the performance of both approaches.
+# To quantify the performance of both models, one can compute
 # the mean deviance of the train and test data assuming a Compound
 # Poisson-Gamma distribution of the total claim amount. This is equivalent to
-# a Tweedie distribution with "power" parameter between 1 and 2.
+# a Tweedie distribution with a `power` parameter between 1 and 2.
# -# As we do not know the true value of the "power" parameter, we compute the -# mean deviances for a grid of possible values of the "power" parameter, -# hoping that a good model for one value of "power" will stay a good model for -# another. Here, every value of "power" defines a separate metric and models -# are to be compared metric by metric: +# The :func:`sklearn.metrics.mean_tweedie_deviance` depends on a `power` +# parameter. As we do not know the true value of the `power` parameter, we here +# compute the mean deviances for a grid of possible values, and compare the +# models side by side, i.e. we compare them at identical values of `power`. +# Ideally, we hope that one model will be consistently better than the other, +# regardless of `power`. tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999] -scores = score_estimator( + +scores_product_model = score_estimator( (glm_freq, glm_sev), X_train, X_test, @@ -458,28 +467,12 @@ def score_estimator( weights="Exposure", tweedie_powers=tweedie_powers, ) -print(scores) - - -############################################################################## -# Pure Premium Modeling Using a Single Compound Poisson Gamma Model -# ----------------------------------------------------------------- -# Instead of taking the product of two independently fit models for frequency -# and severity one can directly model the total loss with a unique Compound -# Poisson Gamma generalized linear model (with a log link function). This -# model is a special case of the Tweedie GLM with a "power" parameter :math:`p -# \in (1, 2)`. -# -# Here we fix apriori the "power" parameter of the Tweedie model to some -# arbitrary value in the valid range. Ideally one would select this value via -# grid-search by minimizing the negative log-likelihood of the Tweedie model, -# but unfortunately the current implementation does not allow for this (yet). glm_pure_premium = TweedieRegressor(power=1.9, alpha=.1, max_iter=10000) glm_pure_premium.fit(X_train, df_train["PurePremium"], sample_weight=df_train["Exposure"]) -scores = score_estimator( +scores_glm_pure_premium = score_estimator( glm_pure_premium, X_train, X_test, @@ -489,11 +482,18 @@ def score_estimator( weights="Exposure", tweedie_powers=tweedie_powers ) + +scores = pd.concat([scores_product_model, scores_glm_pure_premium], + axis=1, sort=True, + keys=('Product Model', 'TweedieRegressor')) +print("Evaluation of the Product Model and the Tweedie Regressor " + "on target PurePremium") print(scores) ############################################################################## # In this example, both modeling approaches yield comparable performance -# metrics. +# metrics. For implementation reasons, the percentage of explained variance +# :math:`D^2` is not available for the product model. # # We can additionally validate these models by comparing observed and # predicted total claim amount over the test and train subsets. 
We see that, From 987239ab23d8295d01c78536358790fb8c0a6c7c Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 1 Mar 2020 11:18:48 -0500 Subject: [PATCH 268/269] Shorten text for df to fit fully in width --- .../plot_tweedie_regression_insurance_claims.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 5235cd3287731..7df8bad102a21 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -151,7 +151,7 @@ def score_estimator( ] if tweedie_powers: metrics += [( - "mean Tweedie deviance (p={:.4f})".format(power), + "mean Tweedie dev p={:.4f}".format(power), partial(mean_tweedie_deviance, power=power) ) for power in tweedie_powers] @@ -455,6 +455,10 @@ def score_estimator( # Ideally, we hope that one model will be consistently better than the other, # regardless of `power`. +glm_pure_premium = TweedieRegressor(power=1.9, alpha=.1, max_iter=10000) +glm_pure_premium.fit(X_train, df_train["PurePremium"], + sample_weight=df_train["Exposure"]) + tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999] scores_product_model = score_estimator( @@ -468,10 +472,6 @@ def score_estimator( tweedie_powers=tweedie_powers, ) -glm_pure_premium = TweedieRegressor(power=1.9, alpha=.1, max_iter=10000) -glm_pure_premium.fit(X_train, df_train["PurePremium"], - sample_weight=df_train["Exposure"]) - scores_glm_pure_premium = score_estimator( glm_pure_premium, X_train, From edba3b8b5874d28eef0075a7034e2ccf9dc10fcc Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 1 Mar 2020 11:48:58 -0500 Subject: [PATCH 269/269] Use context manager instead? --- .../linear_model/plot_tweedie_regression_insurance_claims.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py index 7df8bad102a21..ccd18c8efff99 100644 --- a/examples/linear_model/plot_tweedie_regression_insurance_claims.py +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -488,7 +488,8 @@ def score_estimator( keys=('Product Model', 'TweedieRegressor')) print("Evaluation of the Product Model and the Tweedie Regressor " "on target PurePremium") -print(scores) +with pd.option_context('display.expand_frame_repr', False): + print(scores) ############################################################################## # In this example, both modeling approaches yield comparable performance
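The side-by-side evaluation carried out in the three patches above amounts to scoring both prediction vectors with `sklearn.metrics.mean_tweedie_deviance` at identical `power` values. A minimal, self-contained sketch of that comparison, using made-up numbers rather than the example's insurance dataset:

    import numpy as np
    from sklearn.metrics import mean_tweedie_deviance

    # Hypothetical pure premium predictions for four policies; zeros in y_true
    # are allowed for 1 < power < 2, while predictions must be strictly positive.
    y_true = np.array([0.0, 150.0, 0.0, 1200.0])
    y_pred_product = np.array([15.0, 110.0, 8.0, 900.0])
    y_pred_tweedie = np.array([20.0, 130.0, 10.0, 800.0])

    for p in (1.5, 1.9, 1.99):
        print("power=%.2f  product=%.2f  tweedie=%.2f" % (
            p,
            mean_tweedie_deviance(y_true, y_pred_product, power=p),
            mean_tweedie_deviance(y_true, y_pred_tweedie, power=p)))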