diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 7ad12680353bb..4210b8c5b58b3 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -727,6 +727,7 @@ Kernels:
    linear_model.BayesianRidge
    linear_model.ElasticNet
    linear_model.ElasticNetCV
+   linear_model.GeneralizedLinearRegressor
    linear_model.HuberRegressor
    linear_model.Lars
    linear_model.LarsCV
diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst
index e7fdbf978998c..eba51315d2ae8 100644
--- a/doc/modules/linear_model.rst
+++ b/doc/modules/linear_model.rst
@@ -67,7 +67,7 @@ Ordinary Least Squares Complexity

 The least squares solution is computed using the singular value
 decomposition of X. If X is a matrix of shape `(n_samples, n_features)`
-this method has a cost of 
+this method has a cost of
 :math:`O(n_{\text{samples}} n_{\text{features}}^2)`, assuming that
 :math:`n_{\text{samples}} \geq n_{\text{features}}`.

@@ -430,7 +430,7 @@ between the features.

 The advantages of LARS are:

-  - It is numerically efficient in contexts where the number of features 
+  - It is numerically efficient in contexts where the number of features
    is significantly greater than the number of samples.

  - It is computationally just as fast as forward selection and has
@@ -732,7 +732,7 @@ classifier. In this model, the probabilities describing the possible outcomes
 of a single trial are modeled using a
 `logistic function <https://en.wikipedia.org/wiki/Logistic_function>`_.

-Logistic regression is implemented in :class:`LogisticRegression`. 
+Logistic regression is implemented in :class:`LogisticRegression`.
 This implementation can fit binary, One-vs-Rest, or multinomial logistic
 regression with optional :math:`\ell_1`, :math:`\ell_2` or Elastic-Net
 regularization.
@@ -888,6 +888,129 @@ to warm-starting (see :term:`Glossary <warm_start>`).
 .. [9] `"Performance Evaluation of Lbfgs vs other solvers"
    <http://www.fuzihao.org/blog/2016/01/16/Comparison-of-Gradient-Descent-Stochastic-Gradient-Descent-and-L-BFGS/>`_

+.. _Generalized_linear_regression:
+
+Generalized Linear Regression
+=============================
+
+:class:`GeneralizedLinearRegressor` generalizes the :ref:`elastic_net` in two
+ways [10]_. First, the predicted values :math:`\hat{y}` are linked to a linear
+combination of the input variables :math:`X` via an inverse link function
+:math:`h` as
+
+.. math:: \hat{y}(w, x) = h(xw) = h(w_0 + w_1 x_1 + ... + w_p x_p).
+
+Second, the squared loss function is replaced by the deviance :math:`D` of an
+exponential dispersion model (EDM) [11]_. The objective function being
+minimized becomes
+
+.. math:: \frac{1}{2\mathrm{sum}(s)}D(y, \hat{y}; s) + \alpha \rho ||P_1w||_1
+    +\frac{\alpha(1-\rho)}{2} w^T P_2 w
+
+with sample weights :math:`s`.
+The diagonal matrix :math:`P_1` can be used to exclude some of the
+coefficients from the L1 penalty, while the matrix :math:`P_2` (which must be
+positive semi-definite) allows for a more versatile L2 penalty.
+
+Use cases where a loss other than the squared loss might be appropriate
+include the following:
+
+  * If the target values :math:`y` are counts (non-negative integer valued) or
+    frequencies (non-negative), you might use a Poisson deviance with log-link.
+
+  * If the target values are positive valued and skewed, you might try a
+    Gamma deviance with log-link.
+
+  * If the target values seem to be heavier-tailed than a Gamma distribution,
+    you might try an Inverse Gaussian deviance (or even higher variance powers
+    of the Tweedie family).
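+
+For instance, the deviance of any member of the Tweedie family is exposed by
+the :class:`TweedieDistribution` class exported alongside the estimator (a
+minimal sketch; the ``power`` value and the arrays below are chosen purely
+for illustration):
+
+    >>> import numpy as np
+    >>> from sklearn.linear_model import TweedieDistribution
+    >>> family = TweedieDistribution(power=1.5)
+    >>> y = np.array([1., 4.])
+    >>> family.deviance(y, y)  # the deviance vanishes when mu equals y
+    0.0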
+
+Since the linear predictor :math:`Xw` can be negative and
+Poisson, Gamma and Inverse Gaussian distributions don't support negative values,
+it is convenient to apply a link function different from the identity link
+:math:`h(Xw)=Xw` that guarantees non-negative predictions, e.g. the log-link
+with :math:`h(Xw)=\exp(Xw)`.
+
+Note that the feature matrix `X` should be standardized before fitting. This
+ensures that the penalty treats features equally. The estimator can be used as
+follows:
+
+    >>> from sklearn.linear_model import GeneralizedLinearRegressor
+    >>> reg = GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log')
+    >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2])
+    GeneralizedLinearRegressor(alpha=0.5, family='poisson', link='log')
+    >>> reg.coef_
+    array([0.24630169, 0.43373464])
+    >>> reg.intercept_
+    -0.76383633...
+
+
+.. topic:: Examples:
+
+  * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_spline_regression.py`
+
+Mathematical formulation
+------------------------
+
+In the unpenalized case, the assumptions are the following:
+
+  * The target values :math:`y_i` are realizations of random variables
+    :math:`Y_i \overset{i.i.d}{\sim} \mathrm{EDM}(\mu_i, \frac{\phi}{s_i})`
+    with expectation :math:`\mu_i=\mathrm{E}[Y_i]`, dispersion parameter
+    :math:`\phi` and sample weights :math:`s_i`.
+  * The aim is to predict the expectation :math:`\mu_i` with
+    :math:`\hat{y_i} = h(\eta_i)`, linear predictor
+    :math:`\eta_i=(Xw)_i` and inverse link function :math:`h(\eta)`.
+
+Note that the first assumption implies
+:math:`\mathrm{Var}[Y_i]=\frac{\phi}{s_i} v(\mu_i)` with unit variance
+function :math:`v(\mu)`. Specifying a particular distribution of an EDM is the
+same as specifying a unit variance function (they are one-to-one).
+
+Including penalties helps to avoid overfitting or, in the case of an L1
+penalty, to obtain sparse solutions. There are also other motivations to
+include them, e.g. accounting for the dependence structure of :math:`y`.
+
+The objective function, which is independent of :math:`\phi`, is minimized with
+respect to the coefficients :math:`w`.
+
+The deviance is defined via the log-likelihood of the
+:math:`\mathrm{EDM}(\mu, \phi)` as
+
+.. math:: d(y, \mu) = -2\phi\cdot
+    \left(loglike(y,\mu,\phi)
+    - loglike(y,y,\phi)\right) \\
+    D(y, \mu; s) = \sum_i s_i \cdot d(y_i, \mu_i)
+
+===================================== =============================== ================================= ============================================
+Distribution                          Target Domain                   Variance Function :math:`v(\mu)`  Unit Deviance :math:`d(y, \mu)`
+===================================== =============================== ================================= ============================================
+Normal ("normal")                     :math:`y \in (-\infty, \infty)` :math:`1`                         :math:`(y-\mu)^2`
+Poisson ("poisson")                   :math:`y \in [0, \infty)`       :math:`\mu`                       :math:`2(y\log\frac{y}{\mu}-y+\mu)`
+Gamma ("gamma")                       :math:`y \in (0, \infty)`       :math:`\mu^2`                     :math:`2(\log\frac{\mu}{y}+\frac{y}{\mu}-1)`
+Inverse Gaussian ("inverse.gaussian") :math:`y \in (0, \infty)`       :math:`\mu^3`                     :math:`\frac{(y-\mu)^2}{y\mu^2}`
+===================================== =============================== ================================= ============================================
+
+Two remarks:
+
+* The deviances for at least Normal, Poisson and Gamma distributions are
+  strictly consistent scoring functions for the mean :math:`\mu`, see Eq.
+  (19)-(20) in [12]_.
+
+* If you want to model a frequency, i.e. 
counts per exposure (time, volume, ...),
+  you can do so by using a Poisson distribution and passing
+  :math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values together
+  with :math:`s=\mathrm{exposure}` as sample weights.
+
+
+.. topic:: References:
+
+    .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models, Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5.
+
+    .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models and analysis of deviance. Monografias de matemática, no. 51.
+        See also `Exponential dispersion model.
+        <https://en.wikipedia.org/wiki/Exponential_dispersion_model>`_
+
+    .. [12] Gneiting, T. (2010). `Making and Evaluating Point Forecasts.
+        <https://arxiv.org/abs/0912.0902>`_

 Stochastic Gradient Descent - SGD
 =================================
diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
new file mode 100644
index 0000000000000..b06adcb787560
--- /dev/null
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -0,0 +1,257 @@
+"""
+======================================
+Poisson regression and non-normal loss
+======================================
+
+This example illustrates the use of linear Poisson regression
+on the French Motor Third-Party Liability Claims dataset [1] and compares
+it with models trained with the least squares error.
+
+
+We start by defining a few helper functions for loading the data and
+visualizing results.
+
+
+.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor
+    Third-Party Liability Claims (November 8, 2018).
+    `doi:10.2139/ssrn.3164764 <https://doi.org/10.2139/ssrn.3164764>`_
+
+"""
+print(__doc__)
+
+# Authors: Christian Lorentzen
+#          Roman Yurchak
+# License: BSD 3 clause
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+from scipy.special import xlogy
+
+from sklearn.compose import ColumnTransformer
+from sklearn.linear_model import GeneralizedLinearRegressor, LinearRegression
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
+from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
+from sklearn.ensemble import GradientBoostingRegressor
+
+from sklearn.metrics import mean_squared_error, mean_absolute_error
+
+
+def load_mtpl2(n_samples=100000):
+    """Fetch the French Motor Third-Party Liability Claims dataset.
+
+    Parameters
+    ----------
+    n_samples: int, default=100000
+      number of samples to select (for faster run time).
+    """
+
+    # Note: this should use the OpenML DataFrame fetcher in the future
+    df_freq = pd.read_csv(
+        "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv",
+        dtype={"IDpol": np.int},
+        index_col=0,
+    )
+
+    df_sev = pd.read_csv(
+        "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff",
+        index_col=0,
+    )
+
+    # sum ClaimAmount over identical IDs
+    df_sev = df_sev.groupby(level=0).sum()
+
+    df = df_freq.join(df_sev, how="left")
+    df["ClaimAmount"].fillna(0, inplace=True)
+
+    # unquote string fields
+    for column_name in df.columns[df.dtypes.values == np.object]:
+        df[column_name] = df[column_name].str.strip("'")
+    return df.iloc[:n_samples]
+
+
+##############################################################################
+#
+# 1. 
Loading datasets and pre-processing
+# --------------------------------------
+#
+# We construct the freMTPL2 dataset by joining the freMTPL2freq table,
+# containing the number of claims (``ClaimNb``), with the freMTPL2sev table,
+# containing the claim amount (``ClaimAmount``) for the same policy ids
+# (``IDpol``).

+df = load_mtpl2(n_samples=100000)
+
+# Note: filter out claims with zero amount, as the severity model
+# requires strictly positive target values.
+df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0
+
+# correct for unreasonable observations (that might be data errors)
+df["ClaimNb"] = df["ClaimNb"].clip(upper=4)
+df["Exposure"] = df["Exposure"].clip(upper=1)
+
+column_trans = ColumnTransformer(
+    [
+        ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]),
+        (
+            "Veh_Brand_Gas_Region",
+            OneHotEncoder(),
+            ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
+        ),
+        ("BonusMalus", "passthrough", ["BonusMalus"]),
+        (
+            "Density_log",
+            make_pipeline(
+                FunctionTransformer(np.log, validate=False), StandardScaler()
+            ),
+            ["Density"],
+        ),
+    ],
+    remainder="drop",
+)
+X = column_trans.fit_transform(df)
+
+##############################################################################
+#
+# The number of claims (``ClaimNb``) is a positive integer that can be modeled
+# as a Poisson distribution. It is then assumed to be the number of discrete
+# events occurring with a constant rate in a given time interval
+# (``Exposure``). Here we model the frequency ``y = ClaimNb / Exposure``,
+# which is still a (scaled) Poisson distribution.
+#
+# A very important property of the Poisson distribution is its mean-variance
+# relation: The variance is proportional to the mean.
+
+df["Frequency"] = df.ClaimNb / df.Exposure
+
+print(
+    pd.cut(df.Frequency, [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts()
+)
+
+##############################################################################
+#
+# It is worth noting that 96 % of policyholders have 0 claims; if we were to
+# convert this problem into a binary classification task, it would be
+# significantly imbalanced.
+#
+# To evaluate the pertinence of these metrics, we will consider as a baseline
+# an estimator that constantly returns 0 for any input, as sketched below. 
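+
+##############################################################################
+#
+# Such a constant baseline can also be expressed as a scikit-learn estimator;
+# the following sketch with :class:`~sklearn.dummy.DummyRegressor` is purely
+# illustrative, and the evaluation below simply uses hard-coded zeros instead.
+
+from sklearn.dummy import DummyRegressor
+
+baseline = DummyRegressor(strategy="constant", constant=0.)
+baseline.fit(X, df.Frequency)  # X and y are only needed to satisfy the API
+assert (baseline.predict(X) == 0).all()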
+
+df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2)
+
+
+def mean_poisson_deviance_score(y_true, y_pred, sample_weights=None):
+    y_true = np.atleast_1d(y_true)
+    y_pred = np.atleast_1d(y_pred)
+    dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred)
+    return np.average(dev, weights=sample_weights)
+
+
+# the Poisson deviance is only defined for strictly positive predictions,
+# hence the baseline predicts eps instead of exactly 0
+eps = 1e-5
+print("MSE: %.3f" % mean_squared_error(
+    df_test.Frequency.values, np.zeros(len(df_test)),
+    df_test.Exposure.values))
+print("MAE: %.3f" % mean_absolute_error(
+    df_test.Frequency.values, np.zeros(len(df_test)),
+    df_test.Exposure.values))
+print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score(
+    df_test.Frequency.values, eps + np.zeros(len(df_test)),
+    df_test.Exposure.values))
+
+
+##############################################################################
+#
+# We start by modeling the target variable with the least squares linear
+# regression model,
+
+
+linregr = LinearRegression()
+linregr.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure)
+
+print("LinearRegression")
+print("MSE: %.3f" % mean_squared_error(
+    df_test.Frequency.values, linregr.predict(X_test),
+    df_test.Exposure.values))
+print("MAE: %.3f" % mean_absolute_error(
+    df_test.Frequency.values, linregr.predict(X_test),
+    df_test.Exposure.values))
+print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score(
+    df_test.Frequency.values, np.fmax(linregr.predict(X_test), eps),
+    df_test.Exposure.values))
+
+##############################################################################
+#
+# The Poisson deviance is not defined for the raw predictions because the
+# model predicts negative values, which is why they were clipped to ``eps``
+# above,
+
+print('Number of negative predictions: %s / total: %s' % (
+    (linregr.predict(X_test) < 0).sum(), X_test.shape[0]))
+
+##############################################################################
+#
+# Next we fit the Poisson regressor on the target variable,
+
+glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=0)
+glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure)
+
+print("PoissonRegressor")
+print("MSE: %.3f" % mean_squared_error(
+    df_test.Frequency.values, glm_freq.predict(X_test),
+    df_test.Exposure.values))
+print("MAE: %.3f" % mean_absolute_error(
+    df_test.Frequency.values, glm_freq.predict(X_test),
+    df_test.Exposure.values))
+print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score(
+    df_test.Frequency.values, glm_freq.predict(X_test),
+    df_test.Exposure.values))
+
+##############################################################################
+#
+# Finally we will consider a non-linear model with gradient boosting that
+# still minimizes the least squares error.
+
+
+gbr = GradientBoostingRegressor(max_depth=3)
+gbr.fit(X_train, df_train.Frequency.values,
+        sample_weight=df_train.Exposure.values)
+
+
+print("GradientBoostingRegressor")
+print("MSE: %.3f" % mean_squared_error(
+    df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values))
+print("MAE: %.3f" % mean_absolute_error(
+    df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values))
+print("mean Poisson deviance: %.3f" % mean_poisson_deviance_score(
+    df_test.Frequency.values, gbr.predict(X_test), df_test.Exposure.values))
+
+##############################################################################
+#
+# In this example, although gradient boosting also minimizes the least squares
+# error, its higher predictive power yields a smaller Poisson deviance than
+# the Poisson regression model. 
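+
+##############################################################################
+#
+# As a rough stability check of this conclusion, one could also
+# cross-validate the Poisson deviance of the GLM. The following sketch is
+# illustrative only (and slow); ``poisson_scorer`` is an ad-hoc wrapper
+# around the helper defined above, not a scikit-learn built-in.
+
+from sklearn.model_selection import cross_val_score
+from sklearn.metrics import make_scorer
+
+poisson_scorer = make_scorer(mean_poisson_deviance_score,
+                             greater_is_better=False)
+cv_dev = -cross_val_score(
+    GeneralizedLinearRegressor(family="poisson", alpha=0),
+    X, df.Frequency.values, scoring=poisson_scorer, cv=3)
+print("CV mean Poisson deviance: %.3f +/- %.3f"
+      % (cv_dev.mean(), cv_dev.std()))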
+
+##############################################################################
+#
+# Evaluating models with a single train / test split is prone to random
+# fluctuations; the cross-validated deviance sketched above is one way to
+# verify that we would get equivalent results.
+#
+# The difference between these models can also be visualized by comparing the
+# histogram of observed target values with that of predicted values,
+
+
+fig, ax = plt.subplots(1, 4, figsize=(16, 3))
+
+df_train.Frequency.hist(bins=np.linspace(-1, 10, 50), ax=ax[0])
+
+ax[0].set_title('Experimental data')
+
+for idx, model in enumerate([linregr, glm_freq, gbr]):
+    y_pred = model.predict(X_train)
+
+    pd.Series(y_pred).hist(bins=np.linspace(-1, 8, 50), ax=ax[idx+1])
+    ax[idx+1].set_title(model.__class__.__name__)
+
+for axi in ax:
+    axi.set(
+        yscale='log',
+        xlabel="y (Frequency)"
+    )
diff --git a/examples/linear_model/plot_poisson_spline_regression.py b/examples/linear_model/plot_poisson_spline_regression.py
new file mode 100644
index 0000000000000..30b5881bba1f5
--- /dev/null
+++ b/examples/linear_model/plot_poisson_spline_regression.py
@@ -0,0 +1,85 @@
+"""
+=================================
+Poisson Regression with B-Splines
+=================================
+
+As in the :ref:`sphx_glr_auto_examples_ensemble_plot_adaboost_regression.py`
+example, a Poisson regression with penalized B-splines (P-splines) [1]_ is
+fitted on slightly different sinusoidal, Poisson-distributed data and
+compared to an AdaBoost model with decision trees.
+One can see that this is a hard problem for both estimators.
+
+.. [1] Eilers, Paul H. C.; Marx, Brian D. "Flexible smoothing with B-splines
+    and penalties". Statist. Sci. 11 (1996), no. 2, 89--121.
+    `doi:10.1214/ss/1038425655
+    <https://doi.org/10.1214/ss/1038425655>`_
+
+"""
+print(__doc__)
+
+# Author: Christian Lorentzen
+#         based on the AdaBoost regression example from Noel Dawe
+# License: BSD 3 clause
+
+# importing necessary libraries
+import numpy as np
+from scipy.linalg import toeplitz
+# from scipy.interpolate import BSpline
+from scipy.interpolate import splev
+import matplotlib.pyplot as plt
+from sklearn.tree import DecisionTreeRegressor
+from sklearn.ensemble import AdaBoostRegressor
+from sklearn.linear_model import GeneralizedLinearRegressor
+
+
+# Create the dataset
+xmin, xmax = 0, 6
+rng = np.random.RandomState(1)
+X = np.linspace(xmin, xmax, 500)[:, np.newaxis]
+y_true = 0.5 * (2.1 + np.sin(X).ravel() + np.sin(6 * X).ravel())
+y = rng.poisson(y_true, X.shape[0])
+
+# B-spline basis
+nknots, degree = 40, 3
+ns = nknots - degree - 1  # number of basis spline functions
+dx = (xmax - xmin) / (nknots - 1 - 2 * degree)
+knots = np.linspace(xmin - degree * dx, xmax + degree * dx, nknots)
+coef = np.zeros(ns)
+splineBasis = np.empty((X.shape[0], ns), dtype=float)
+for i in range(ns):
+    coef[i] = 1
+#    splineBasis[:, i] = BSpline(knots, coef, degree, extrapolate=False)(X) \
+#        .ravel()
+    splineBasis[:, i] = splev(X, (knots, coef, degree)).ravel()
+    coef[i] = 0
+
+# second order difference matrix
+P2 = toeplitz([2, -1] + [0] * (ns - 2)).astype(float)
+P2[0, 0] = P2[-1, -1] = 1
+
+# Fit regression model
+regr_1 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
+                           n_estimators=10, random_state=rng)
+
+regr_2 = GeneralizedLinearRegressor(family='poisson', link='log',
+                                    fit_intercept=True, alpha=0.02,
+                                    l1_ratio=0.1, P2=P2)
+
+regr_1.fit(X, y)
+regr_2.fit(splineBasis, y)
+
+# Predict
+y_1 = regr_1.predict(X)
+y_2 = regr_2.predict(splineBasis)
+
+# Plot the results
+plt.figure()
+plt.plot(X, y_true, c="b", label="true mean")
+plt.scatter(X, y, c="k", marker='.', label="training samples")
+plt.plot(X, y_1, 
c="g", label="AdaBoost n_estimator=10", linewidth=2) +plt.plot(X, y_2, c="r", label="Poisson GLM with B-splines", linewidth=2) +plt.xlabel("data") +plt.ylabel("target") +plt.title("Regression Comparison") +plt.legend() +plt.show() diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py new file mode 100644 index 0000000000000..1c8dd42df336d --- /dev/null +++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py @@ -0,0 +1,524 @@ +""" +====================================== +Tweedie regression on insurance claims +====================================== + +This example illustrates the use of Poisson, Gamma and Tweedie regression +on the French Motor Third-Party Liability Claims dataset, and is inspired +by an R tutorial [1]. + +Insurance claims data consist of the number of claims and the total claim +amount. Often, the final goal is to predict the expected value, i.e. the mean, +of the total claim amount. There are several possibilities to do that, two of +which are: + +1. Model the number of claims with a Poisson distribution, the average + claim amount as a Gamma distribution and multiply the predictions of both in + order to get the total claim amount. +2. Model total claim amount directly, typically with a Tweedie distribution of + Tweedie power :math:`p \\in (1, 2)`. + +In this example we will illustrate both approaches. We start by defining a few +helper functions for loading the data and visualizing results. + + +.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor + Third-Party Liability Claims (November 8, 2018). + `doi:10.2139/ssrn.3164764 `_ + +""" +print(__doc__) + +# Authors: Christian Lorentzen +# Roman Yurchak +# License: BSD 3 clause +from functools import partial + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd + +from sklearn.compose import ColumnTransformer +from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import TweedieDistribution +from sklearn.model_selection import train_test_split +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import FunctionTransformer, OneHotEncoder +from sklearn.preprocessing import StandardScaler, KBinsDiscretizer + +from sklearn.metrics import mean_absolute_error, mean_squared_error + + +def load_mtpl2(n_samples=100000): + """Fetch the French Motor Third-Party Liability Claims dataset. + + Parameters + ---------- + n_samples: int, default=100000 + number of samples to select (for faster run time). + """ + + # Note: this should use the OpenML DataFrame fetcher in the future + df_freq = pd.read_csv( + "https://www.openml.org/data/get_csv/20649148/freMTPL2freq.csv", + dtype={"IDpol": np.int}, + index_col=0, + ) + + df_sev = pd.read_csv( + "https://www.openml.org/data/get_csv/20649149/freMTPL2sev.arff", + index_col=0, + ) + + # sum ClaimAmount over identical IDs + df_sev = df_sev.groupby(level=0).sum() + + df = df_freq.join(df_sev, how="left") + df["ClaimAmount"].fillna(0, inplace=True) + + # unquote string fields + for column_name in df.columns[df.dtypes.values == np.object]: + df[column_name] = df[column_name].str.strip("'") + return df.iloc[:n_samples] + + +def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None, + title=None, ax=None): + """Plot observed and predicted - aggregated per feature level. 
+
+    Parameters
+    ----------
+    df : DataFrame
+        a DataFrame with at least the three columns named by ``feature``,
+        ``weight`` and ``observed``
+    feature : str
+        a column name of df for the feature to be plotted
+    weight : str
+        column name of df with the values of weights or exposure
+    observed : str
+        a column name of df with the observed target
+    predicted : array-like
+        the predicted target values, aligned with the rows of df
+    y_label : str, optional
+        label for the y axis
+    title : str, optional
+        plot title, defaults to "Train: Observed vs Predicted"
+    ax : matplotlib axis, optional
+        axis to draw on
+    """
+    # aggregate observed and predicted variables by feature level
+    df_ = df.loc[:, [feature, weight]].copy()
+    df_["observed"] = df[observed] * df[weight]
+    df_["predicted"] = predicted * df[weight]
+    df_ = (
+        df_.groupby([feature])[weight, "observed", "predicted"]
+        .sum()
+        .assign(observed=lambda x: x["observed"] / x[weight])
+        .assign(predicted=lambda x: x["predicted"] / x[weight])
+    )
+
+    ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax)
+    y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8
+    ax.fill_between(
+        df_.index,
+        0,
+        y_max * df_[weight] / df_[weight].values.max(),
+        color="g",
+        alpha=0.1,
+    )
+    ax.set(
+        ylabel=y_label if y_label is not None else None,
+        title=title if title is not None else "Train: Observed vs Predicted",
+    )
+
+
+##############################################################################
+#
+# 1. Loading datasets and pre-processing
+# --------------------------------------
+#
+# We construct the freMTPL2 dataset by joining the freMTPL2freq table,
+# containing the number of claims (``ClaimNb``), with the freMTPL2sev table,
+# containing the claim amount (``ClaimAmount``) for the same policy ids
+# (``IDpol``).
+
+df = load_mtpl2(n_samples=100000)
+
+# Note: filter out claims with zero amount, as the severity model
+# requires strictly positive target values.
+df.loc[(df.ClaimAmount == 0) & (df.ClaimNb >= 1), "ClaimNb"] = 0
+
+# Correct for unreasonable observations (that might be data errors)
+# and a few exceptionally large claim amounts
+df["ClaimNb"] = df["ClaimNb"].clip(upper=4)
+df["Exposure"] = df["Exposure"].clip(upper=1)
+df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000)
+
+column_trans = ColumnTransformer(
+    [
+        ("Veh_Driv_Age", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]),
+        (
+            "Veh_Brand_Gas_Region",
+            OneHotEncoder(),
+            ["VehBrand", "VehPower", "VehGas", "Region", "Area"],
+        ),
+        ("BonusMalus", "passthrough", ["BonusMalus"]),
+        (
+            "Density_log",
+            make_pipeline(
+                FunctionTransformer(np.log, validate=False), StandardScaler()
+            ),
+            ["Density"],
+        ),
+    ],
+    remainder="drop",
+)
+X = column_trans.fit_transform(df)
+
+
+df["Frequency"] = df.ClaimNb / df.Exposure
+df["AvgClaimAmount"] = df.ClaimAmount / np.fmax(df.ClaimNb, 1)
+
+print(df[df.ClaimAmount > 0].head())
+
+##############################################################################
+#
+# 2. Frequency model -- Poisson distribution
+# -------------------------------------------
+#
+# The number of claims (``ClaimNb``) is a positive integer that can be modeled
+# as a Poisson distribution. It is then assumed to be the number of discrete
+# events occurring with a constant rate in a given time interval
+# (``Exposure``). Here we model the frequency ``y = ClaimNb / Exposure``,
+# which is still a (scaled) Poisson distribution.
+#
+# A very important property of the Poisson distribution is its mean-variance
+# relation: The variance is proportional to the mean.
+
+df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=2)
+
+# Some of the features are collinear; we use a weak penalization to avoid
+# numerical issues. 
+glm_freq = GeneralizedLinearRegressor(family="poisson", alpha=1e-2) +glm_freq.fit(X_train, df_train.Frequency, sample_weight=df_train.Exposure) + + +def mean_deviance(estimator, y, y_pred, weights): + if hasattr(estimator, "_family_instance"): + return estimator._family_instance.deviance(y, y_pred, weights) / len(y) + else: + return np.nan + + +def score_estimator( + estimator, X_train, X_test, df_train, df_test, target, weights +): + res = [] + + for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), + ]: + y, _weights = df[target], df[weights] + + for score_label, metric in [ + ("D² explained", None), + ("mean deviance", partial(mean_deviance, estimator)), + ("mean abs. error", mean_absolute_error), + ("mean squared error", mean_squared_error), + ]: + if estimator.__class__.__name__ == "ClaimProdEstimator": + # ClaimProdEstimator is the product of frequency and severity + # models, denormalized by the exposure values. + # It does not fully follow the scikit-learn API and we + # must handle it separately. + y_pred = estimator.predict(X, exposure=df.Exposure.values) + else: + y_pred = estimator.predict(X) + if metric is None: + if not hasattr(estimator, "score"): + continue + score = estimator.score(X, y, _weights) + else: + score = metric(y, y_pred, _weights) + + res.append( + {"subset": subset_label, "metric": score_label, "score": score} + ) + + res = ( + pd.DataFrame(res) + .set_index(["metric", "subset"]) + .score.unstack(-1) + .round(3) + ) + return res + + +scores = score_estimator( + glm_freq, + X_train, + X_test, + df_train, + df_test, + target="Frequency", + weights="Exposure", +) +print(scores) + +############################################################################## +# +# We can visually compare observed and predicted values, aggregated by +# the drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance +# bonus/malus (``BonusMalus``). + +fig, ax = plt.subplots(2, 2, figsize=(16, 8)) +fig.subplots_adjust(hspace=0.3, wspace=0.2) + +plot_obs_pred( + df=df_train, + feature="DrivAge", + weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_train), + y_label="Claim Frequency", + title="train data", + ax=ax[0, 0], +) + +plot_obs_pred( + df=df_test, + feature="DrivAge", + weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), + y_label="Claim Frequency", + title="test data", + ax=ax[0, 1], +) + +plot_obs_pred( + df=df_test, + feature="VehAge", + weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), + y_label="Claim Frequency", + title="test data", + ax=ax[1, 0], +) + +plot_obs_pred( + df=df_test, + feature="BonusMalus", + weight="Exposure", + observed="Frequency", + predicted=glm_freq.predict(X_test), + y_label="Claim Frequency", + title="test data", + ax=ax[1, 1], +) + + +############################################################################## +# +# 3. Severity model - Gamma Distribution +# --------------------------------------- +# The mean claim amount or severity (`AvgClaimAmount`) can be empirically +# shown to follow approximately a Gamma distribution. We fit a GLM model for +# the severity with the same features as the frequency model. +# +# Note: +# +# - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support +# on :math:`(0, \infty)`, not :math:`[0, \infty)`. +# - We use ``ClaimNb`` as sample weights. 
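+
+##############################################################################
+#
+# As an illustrative check of the claimed Gamma shape (this is not part of
+# the model fit itself), one can fit a Gamma distribution to the positive
+# average claim amounts with scipy:
+
+from scipy import stats
+
+positive_sev = df.AvgClaimAmount[df.AvgClaimAmount > 0]
+gamma_shape, _, gamma_scale = stats.gamma.fit(positive_sev, floc=0)
+print("fitted Gamma: shape=%.2f, scale=%.2f" % (gamma_shape, gamma_scale))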
+
+mask_train = df_train["ClaimAmount"] > 0
+mask_test = df_test["ClaimAmount"] > 0
+
+glm_sev = GeneralizedLinearRegressor(family="gamma")
+
+glm_sev.fit(
+    X_train[mask_train.values],
+    df_train.loc[mask_train, "AvgClaimAmount"],
+    sample_weight=df_train.loc[mask_train, "ClaimNb"],
+)
+
+
+scores = score_estimator(
+    glm_sev,
+    X_train[mask_train.values],
+    X_test[mask_test.values],
+    df_train[mask_train],
+    df_test[mask_test],
+    target="AvgClaimAmount",
+    weights="ClaimNb",
+)
+print(scores)
+
+##############################################################################
+#
+# Note that the resulting model predicts the average claim amount per claim.
+# As such, it is conditional on having at least one claim, and cannot be used
+# to predict the average claim amount per policy in general.
+
+print(
+    "Mean AvgClaim Amount per policy: %.2f "
+    % df_train.AvgClaimAmount.mean()
+)
+print(
+    "Mean AvgClaim Amount | NbClaim > 0: %.2f"
+    % df_train.AvgClaimAmount[df_train.AvgClaimAmount > 0].mean()
+)
+print(
+    "Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f"
+    % glm_sev.predict(X_train).mean()
+)
+
+
+##############################################################################
+#
+# We can visually compare observed and predicted values, aggregated over
+# the driver's age (``DrivAge``).
+
+fig, ax = plt.subplots(1, 2, figsize=(16, 4))
+
+# plot DrivAge
+plot_obs_pred(
+    df=df_train.loc[mask_train],
+    feature="DrivAge",
+    weight="Exposure",
+    observed="AvgClaimAmount",
+    predicted=glm_sev.predict(X_train[mask_train.values]),
+    y_label="Average Claim Severity",
+    title="train data",
+    ax=ax[0],
+)
+
+plot_obs_pred(
+    df=df_test.loc[mask_test],
+    feature="DrivAge",
+    weight="Exposure",
+    observed="AvgClaimAmount",
+    predicted=glm_sev.predict(X_test[mask_test.values]),
+    y_label="Average Claim Severity",
+    title="test data",
+    ax=ax[1],
+)
+
+
+##############################################################################
+#
+# 4. Total Claims Amount -- Compound Poisson distribution
+# -------------------------------------------------------
+#
+# As mentioned in the introduction, the total claim amount can be modeled
+# either as the product of the frequency model and the severity model,
+
+
+class ClaimProdEstimator:
+    """Total claim amount estimator.
+
+    Computed as the product of the frequency model and the severity model,
+    denormalized by exposure. Uses the Tweedie deviance with `p=1.5` for
+    scoring.
+    """
+
+    def __init__(self, est_freq, est_sev):
+        self.est_freq = est_freq
+        self.est_sev = est_sev
+        self._family_instance = TweedieDistribution(power=1.5)
+
+    def predict(self, X, exposure):
+        """Predict the total claim amount.
+
+        The predict method is not compatible with the scikit-learn API.
+        """
+        return exposure * self.est_freq.predict(X) * self.est_sev.predict(X)
+
+    def score(self, X, y, sample_weight=None):
+        """Compute D², the percentage of deviance explained."""
+        mu = self.predict(X, exposure=sample_weight)
+        dev = self._family_instance.deviance(y, mu, weights=sample_weight)
+        y_mean = np.average(y, weights=sample_weight)
+        dev_null = self._family_instance.deviance(y, y_mean,
+                                                  weights=sample_weight)
+        return 1. 
- dev / dev_null + + +est_prod = ClaimProdEstimator(glm_freq, glm_sev) + +scores = score_estimator( + est_prod, + X_train, + X_test, + df_train, + df_test, + target="ClaimAmount", + weights="Exposure", +) +print(scores) + + +############################################################################## +# +# or as a unique Compound Poisson model, also corresponding to a Tweedie model +# with a power :math:`p \in (1, 2)`. We determine the optimal hyperparameter +# ``p`` with a grid search, + +from sklearn.model_selection import GridSearchCV + +# this takes a while +params = { + "family": [ + TweedieDistribution(power=power) for power in np.linspace(1, 2, 8) + ] +} + +glm_total = GridSearchCV( + GeneralizedLinearRegressor(), cv=3, param_grid=params, n_jobs=-1 +) +glm_total.fit( + X_train, df_train["ClaimAmount"], sample_weight=df_train["Exposure"] +) + + +print( + "Best hyperparameters: power=%.2f\n" + % glm_total.best_estimator_.family.power +) + +scores = score_estimator( + glm_total.best_estimator_, + X_train, + X_test, + df_train, + df_test, + target="ClaimAmount", + weights="Exposure", +) +print(scores) + +############################################################################## +# +# In this example, the mean absolute error is lower for the Compound Poisson +# model than when using separate models for frequency and severity. +# +# We can additionally validate these models by comparing observed and predicted +# total claim amount over the test and train subsets. We see that in our case +# the frequency-severity model underestimates the total claim amount, whereas +# the Tweedie model overestimates. + +res = [] +for subset_label, X, df in [ + ("train", X_train, df_train), + ("test", X_test, df_test), +]: + res.append( + { + "subset": subset_label, + "observed": df.ClaimAmount.values.sum(), + "predicted, frequency*severity model": np.sum( + est_prod.predict(X, exposure=df.Exposure.values) + ), + "predicted, tweedie, p=%.2f" + % glm_total.best_estimator_.family.power: np.sum( + glm_total.best_estimator_.predict(X) + ), + } + ) + +print(pd.DataFrame(res).set_index("subset").T) diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py index 770a1a49b600e..121418f901a1a 100644 --- a/sklearn/linear_model/__init__.py +++ b/sklearn/linear_model/__init__.py @@ -18,6 +18,8 @@ lasso_path, enet_path, MultiTaskLasso, MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLassoCV) +from ._glm import (TweedieDistribution, + GeneralizedLinearRegressor, PoissonRegressor) from .huber import HuberRegressor from .sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber from .stochastic_gradient import SGDClassifier, SGDRegressor @@ -78,4 +80,7 @@ 'orthogonal_mp', 'orthogonal_mp_gram', 'ridge_regression', - 'RANSACRegressor'] + 'RANSACRegressor', + 'GeneralizedLinearRegressor', + 'TweedieDistribution', + 'PoissonRegressor'] diff --git a/sklearn/linear_model/_glm.py b/sklearn/linear_model/_glm.py new file mode 100644 index 0000000000000..b3c303ee9770b --- /dev/null +++ b/sklearn/linear_model/_glm.py @@ -0,0 +1,2490 @@ +""" +Generalized Linear Models with Exponential Dispersion Family +""" + +# Author: Christian Lorentzen +# some parts and tricks stolen from other sklearn files. +# License: BSD 3 clause + +# TODO: Add cross validation support, e.g. GCV? +# TODO: Should GeneralizedLinearRegressor inherit from LinearModel? +# So far, it does not. +# TODO: Include further classes in class.rst? ExponentialDispersionModel? +# TweedieDistribution? 
+# TODO: Negative values in P1 are not allowed so far. They could be used +# for group lasso. + +# Design Decisions: +# - Which name? GeneralizedLinearModel vs GeneralizedLinearRegressor. +# Estimators in sklearn are either regressors or classifiers. A GLM can do +# both depending on the distr (Normal => regressor, Binomial => classifier). +# Solution: GeneralizedLinearRegressor since this is the focus. +# - Allow for finer control of penalty terms: +# L1: ||P1*w||_1 with P1*w as element-wise product, this allows to exclude +# factors from the L1 penalty. +# L2: w*P2*w with P2 a positive (semi-) definite matrix, e.g. P2 could be +# a 1st or 2nd order difference matrix (compare B-spline penalties and +# Tikhonov regularization). +# - The link function (instance of class Link) is necessary for the evaluation +# of deviance, score, Fisher and Hessian matrix as functions of the +# coefficients, which is needed by optimizers. +# Solution: link as argument in those functions +# - Which name/symbol for sample_weight in docu? +# sklearn.linear_models uses w for coefficients, standard literature on +# GLMs use beta for coefficients and w for (sample) weights. +# So far, coefficients=w and sample weights=s. +# - The intercept term is the first index, i.e. coef[0] + + +from __future__ import division +from abc import ABCMeta, abstractmethod +import numbers +import numpy as np +from scipy import linalg, sparse, special +import scipy.sparse.linalg as splinalg +from scipy.optimize import fmin_l_bfgs_b +import warnings +from ..base import BaseEstimator, RegressorMixin +from ..exceptions import ConvergenceWarning +from ..utils import check_array, check_X_y +from ..utils.optimize import newton_cg +from ..utils.validation import check_is_fitted, check_random_state + + +def _check_weights(sample_weight, n_samples): + """Check that sample weights are non-negative and have the right shape.""" + if sample_weight is None: + weights = np.ones(n_samples) + elif np.isscalar(sample_weight): + if sample_weight <= 0: + raise ValueError("Sample weights must be non-negative.") + weights = sample_weight * np.ones(n_samples) + else: + _dtype = [np.float64, np.float32] + weights = check_array(sample_weight, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype) + if weights.ndim > 1: + raise ValueError("Sample weight must be 1D array or scalar") + elif weights.shape[0] != n_samples: + raise ValueError("Sample weights must have the same length as " + "y") + if not np.all(weights >= 0): + raise ValueError("Sample weights must be non-negative.") + elif not np.sum(weights) > 0: + raise ValueError("Sample weights must have at least one positive " + "element.") + + return weights + + +def _safe_lin_pred(X, coef): + """Compute the linear predictor taking care if intercept is present.""" + if coef.size == X.shape[1] + 1: + return X @ coef[1:] + coef[0] + else: + return X @ coef + + +def _safe_toarray(X): + """Returns a numpy array.""" + if sparse.issparse(X): + return X.toarray() + else: + return np.asarray(X) + + +def _safe_sandwich_dot(X, d, intercept=False): + """Compute sandwich product X.T @ diag(d) @ X. + + With ``intercept=True``, X is treated as if a column of 1 were appended as + first column of X. + X can be sparse, d must be an ndarray. 
Always returns a ndarray.""" + if sparse.issparse(X): + temp = (X.transpose() @ X.multiply(d[:, np.newaxis])) + # for older versions of numpy and scipy, temp may be a np.matrix + temp = _safe_toarray(temp) + else: + temp = (X.T * d) @ X + if intercept: + dim = X.shape[1] + 1 + if sparse.issparse(X): + order = 'F' if sparse.isspmatrix_csc(X) else 'C' + else: + order = 'F' if X.flags['F_CONTIGUOUS'] else 'C' + res = np.empty((dim, dim), dtype=max(X.dtype, d.dtype), order=order) + res[0, 0] = d.sum() + res[1:, 0] = d @ X + res[0, 1:] = res[1:, 0] + res[1:, 1:] = temp + else: + res = temp + return res + + +def _min_norm_sugrad(coef, grad, P2, P1): + """Compute the gradient of all subgradients with minimal L2-norm. + + subgrad = grad + P2 * coef + P1 * subgrad(|coef|_1) + + g_i = grad_i + (P2*coef)_i + + if coef_i > 0: g_i + P1_i + if coef_i < 0: g_i - P1_i + if coef_i = 0: sign(g_i) * max(|g_i|-P1_i, 0) + + Parameters + ---------- + coef : ndarray + coef[0] may be intercept. + + grad : ndarray, shape=coef.shape + + P2 : {1d or 2d array, None} + always without intercept, ``None`` means P2 = 0 + + P1 : ndarray + always without intercept + """ + intercept = (coef.size == P1.size + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + # compute grad + coef @ P2 without intercept + grad_wP2 = grad[idx:].copy() + if P2 is None: + pass + elif P2.ndim == 1: + grad_wP2 += coef[idx:] * P2 + else: + grad_wP2 += coef[idx:] @ P2 + res = np.where(coef[idx:] == 0, + np.sign(grad_wP2) * np.maximum(np.abs(grad_wP2) - P1, 0), + grad_wP2 + np.sign(coef[idx:]) * P1) + if intercept: + return np.concatenate(([grad[0]], res)) + else: + return res + + +class Link(metaclass=ABCMeta): + """Abstract base class for Link functions.""" + + @abstractmethod + def link(self, mu): + """Compute the link function g(mu). + + The link function links the mean mu=E[Y] to the so called linear + predictor (X*w), i.e. g(mu) = linear predictor. + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + pass + + @abstractmethod + def derivative(self, mu): + """Compute the derivative of the link g'(mu). + + Parameters + ---------- + mu : array, shape (n_samples,) + Usually the (predicted) mean. + """ + pass + + @abstractmethod + def inverse(self, lin_pred): + """Compute the inverse link function h(lin_pred). + + Gives the inverse relationship between linear predictor and the mean + mu=E[Y], i.e. h(linear predictor) = mu. + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + pass + + @abstractmethod + def inverse_derivative(self, lin_pred): + """Compute the derivative of the inverse link function h'(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. + """ + pass + + @abstractmethod + def inverse_derivative2(self, lin_pred): + """Compute 2nd derivative of the inverse link function h''(lin_pred). + + Parameters + ---------- + lin_pred : array, shape (n_samples,) + Usually the (fitted) linear predictor. 
+ """ + pass + + +class IdentityLink(Link): + """The identity link function g(x)=x.""" + + def link(self, mu): + return mu + + def derivative(self, mu): + return np.ones_like(mu) + + def inverse(self, lin_pred): + return lin_pred + + def inverse_derivative(self, lin_pred): + return np.ones_like(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.zeros_like(lin_pred) + + +class LogLink(Link): + """The log link function g(x)=log(x).""" + + def link(self, mu): + return np.log(mu) + + def derivative(self, mu): + return 1./mu + + def inverse(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative(self, lin_pred): + return np.exp(lin_pred) + + def inverse_derivative2(self, lin_pred): + return np.exp(lin_pred) + + +class LogitLink(Link): + """The logit link function g(x)=logit(x).""" + + def link(self, mu): + return special.logit(mu) + + def derivative(self, mu): + return 1. / (mu * (1 - mu)) + + def inverse(self, lin_pred): + return special.expit(lin_pred) + + def inverse_derivative(self, lin_pred): + ep = special.expit(lin_pred) + return ep * (1. - ep) + + def inverse_derivative2(self, lin_pred): + ep = special.expit(lin_pred) + return ep * (1. - ep) * (1. - 2 * ep) + + +class ExponentialDispersionModel(metaclass=ABCMeta): + r"""Base class for reproductive Exponential Dispersion Models (EDM). + + The pdf of :math:`Y\sim \mathrm{EDM}(\mu, \phi)` is given by + + .. math:: p(y| \theta, \phi) = c(y, \phi) + \exp\left(\frac{\theta y-A(\theta)}{\phi}\right) + = \tilde{c}(y, \phi) + \exp\left(-\frac{d(y, \mu)}{2\phi}\right) + + with mean :math:`\mathrm{E}[Y] = A'(\theta) = \mu`, + variance :math:`\mathrm{Var}[Y] = \phi \cdot v(\mu)`, + unit variance :math:`v(\mu)` and + unit deviance :math:`d(y,\mu)`. + + Attributes + ---------- + lower_bound + upper_bound + include_lower_bound + include_upper_bound + + Methods + ------- + in_y_range + unit_variance + unit_variance_derivative + variance + variance_derivative + unit_deviance + unit_deviance_derivative + deviance + deviance_derivative + starting_mu + + _mu_deviance_derivative + _score + _fisher_matrix + _observed_information + _eta_mu_score_fisher + + References + ---------- + + https://en.wikipedia.org/wiki/Exponential_dispersion_model. + """ + @property + def lower_bound(self): + """Get the lower bound of values for Y~EDM.""" + return self._lower_bound + + @property + def upper_bound(self): + """Get the upper bound of values for Y~EDM.""" + return self._upper_bound + + @property + def include_lower_bound(self): + """Get True if lower bound for y is included: y >= lower_bound.""" + return self._include_lower_bound + + @property + def include_upper_bound(self): + """Get True if upper bound for y is included: y <= upper_bound.""" + return self._include_upper_bound + + def in_y_range(self, x): + """Returns ``True`` if x is in the valid range of Y~EDM. + + Parameters + ---------- + x : array, shape (n_samples,) + Target values. + """ + if self.include_lower_bound: + if self.include_upper_bound: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater_equal(x, self.lower_bound), + np.less(x, self.upper_bound)) + else: + if self.include_upper_bound: + return np.logical_and(np.greater(x, self.lower_bound), + np.less_equal(x, self.upper_bound)) + else: + return np.logical_and(np.greater(x, self.lower_bound), + np.less(x, self.upper_bound)) + + @abstractmethod + def unit_variance(self, mu): + r"""Compute the unit variance function. 
+ + The unit variance :math:`v(\mu)` determines the variance as + a function of the mean :math:`\mu` by + :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(\mu_i)`. + It can also be derived from the unit deviance :math:`d(y,\mu)` as + + .. math:: v(\mu) = \frac{2}{\frac{\partial^2 d(y,\mu)}{ + \partial\mu^2}}\big|_{y=\mu} + + See also :func:`variance`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + """ + pass + + @abstractmethod + def unit_variance_derivative(self, mu): + r"""Compute the derivative of the unit variance w.r.t. mu. + + Return :math:`v'(\mu)`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Target values. + """ + pass + + def variance(self, mu, phi=1, weights=1): + r"""Compute the variance function. + + The variance of :math:`Y_i \sim \mathrm{EDM}(\mu_i,\phi/s_i)` is + :math:`\mathrm{Var}[Y_i]=\phi/s_i*v(\mu_i)`, + with unit variance :math:`v(\mu)` and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance(mu) + + def variance_derivative(self, mu, phi=1, weights=1): + r"""Compute the derivative of the variance w.r.t. mu. + + Returns + :math:`\frac{\partial}{\partial\mu}\mathrm{Var}[Y_i] + =phi/s_i*v'(\mu_i)`, with unit variance :math:`v(\mu)` + and weights :math:`s_i`. + + Parameters + ---------- + mu : array, shape (n_samples,) + Predicted mean. + + phi : float (default=1) + Dispersion parameter. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return phi/weights * self.unit_variance_derivative(mu) + + @abstractmethod + def unit_deviance(self, y, mu): + r"""Compute the unit deviance. + + The unit_deviance :math:`d(y,\mu)` can be defined by the + log-likelihood as + :math:`d(y,\mu) = -2\phi\cdot + \left(loglike(y,\mu,\phi) - loglike(y,y,\phi)\right).` + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + pass + + def unit_deviance_derivative(self, y, mu): + r"""Compute the derivative of the unit deviance w.r.t. mu. + + The derivative of the unit deviance is given by + :math:`\frac{\partial}{\partial\mu}d(y,\mu) = -2\frac{y-\mu}{v(\mu)}` + with unit variance :math:`v(\mu)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + """ + return -2 * (y - mu) / self.unit_variance(mu) + + def deviance(self, y, mu, weights=1): + r"""Compute the deviance. + + The deviance is a weighted sum of the per sample unit deviances, + :math:`D = \sum_i s_i \cdot d(y_i, \mu_i)` + with weights :math:`s_i` and unit deviance :math:`d(y,\mu)`. + In terms of the log-likelihood it is :math:`D = -2\phi\cdot + \left(loglike(y,\mu,\frac{phi}{s}) + - loglike(y,y,\frac{phi}{s})\right)`. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return np.sum(weights * self.unit_deviance(y, mu)) + + def deviance_derivative(self, y, mu, weights=1): + """Compute the derivative of the deviance w.r.t. mu. + + It gives :math:`\\frac{\\partial}{\\partial\\mu} D(y, \\mu; weights)`. 
+ + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + mu : array, shape (n_samples,) + Predicted mean. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + """ + return weights * self.unit_deviance_derivative(y, mu) + + def starting_mu(self, y, weights=1, ind_weight=0.5): + """Set starting values for the mean mu. + + These may be good starting points for the (unpenalized) IRLS solver. + + Parameters + ---------- + y : array, shape (n_samples,) + Target values. + + weights : array, shape (n_samples,) (default=1) + Weights or exposure to which variance is inverse proportional. + + ind_weight : float (default=0.5) + Must be between 0 and 1. Specifies how much weight is given to the + individual observations instead of the mean of y. + """ + return (ind_weight * y + + (1. - ind_weight) * np.average(y, weights=weights)) + + def _mu_deviance_derivative(self, coef, X, y, weights, link): + """Compute mu and the derivative of the deviance w.r.t coef.""" + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + d1 = link.inverse_derivative(lin_pred) + temp = d1 * self.deviance_derivative(y, mu, weights) + if coef.size == X.shape[1] + 1: + devp = np.concatenate(([temp.sum()], temp @ X)) + else: + devp = temp @ X # sampe as X.T @ temp + return mu, devp + + def _score(self, coef, phi, X, y, weights, link): + r"""Compute the score function. + + The score function is the derivative of the + log-likelihood w.r.t. `coef` (:math:`w`). + It is given by + + .. math: + + \mathbf{score}(\boldsymbol{w}) + = \frac{\partial loglike}{\partial\boldsymbol{w}} + = \mathbf{X}^T \mathbf{D} + \boldsymbol{\Sigma}^-1 (\mathbf{y} - \boldsymbol{\mu})\,, + + with :math:`\mathbf{D}=\mathrm{diag}(h'(\eta_1),\ldots)` and + :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\mathbf{V}[y_1],\ldots)`. + Note: The derivative of the deviance w.r.t. coef equals -2 * score. + """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + d = link.inverse_derivative(lin_pred) + temp = sigma_inv * d * (y - mu) + if coef.size == X.shape[1] + 1: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # sampe as X.T @ temp + return score + + def _fisher_matrix(self, coef, phi, X, y, weights, link): + r"""Compute the Fisher information matrix. + + The Fisher information matrix, also known as expected information + matrix is given by + + .. math: + + \mathbf{F}(\boldsymbol{w}) = + \mathrm{E}\left[-\frac{\partial\mathbf{score}}{\partial + \boldsymbol{w}} \right] + = \mathrm{E}\left[ + -\frac{\partial^2 loglike}{\partial\boldsymbol{w} + \partial\boldsymbol{w}^T}\right] + = \mathbf{X}^T W \mathbf{X} \,, + + with :math:`\mathbf{W} = \mathbf{D}^2 \boldsymbol{\Sigma}^{-1}`, + see func:`_score`. + """ + lin_pred = _safe_lin_pred(X, coef) + mu = link.inverse(lin_pred) + sigma_inv = 1/self.variance(mu, phi=phi, weights=weights) + d = link.inverse_derivative(lin_pred) + d2_sigma_inv = sigma_inv * d * d + intercept = (coef.size == X.shape[1] + 1) + fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv, + intercept=intercept) + return fisher_matrix + + def _observed_information(self, coef, phi, X, y, weights, link): + r"""Compute the observed information matrix. + + The observed information matrix, also known as the negative of + the Hessian matrix of the log-likelihood, is given by + + .. 
math:
+
+            \mathbf{H}(\boldsymbol{w}) =
+            -\frac{\partial^2 loglike}{\partial\boldsymbol{w}
+            \partial\boldsymbol{w}^T}
+            = \mathbf{X}^T \left[
+            - \mathbf{D}' \mathbf{R}
+            + \mathbf{D}^2 \mathbf{V} \mathbf{R}
+            + \mathbf{D}^2
+            \right] \boldsymbol{\Sigma}^{-1} \mathbf{X} \,,
+
+        with :math:`\mathbf{R} = \mathrm{diag}(y_i - \mu_i)`,
+        :math:`\mathbf{V} = \mathrm{diag}\left(\frac{v'(\mu_i)}{v(\mu_i)}\right)`,
+        see :func:`_score` and :func:`_fisher_matrix`.
+        """
+        lin_pred = _safe_lin_pred(X, coef)
+        mu = link.inverse(lin_pred)
+        sigma_inv = 1/self.variance(mu, phi=phi, weights=weights)
+        dp = link.inverse_derivative2(lin_pred)
+        d2 = link.inverse_derivative(lin_pred)**2
+        v = self.unit_variance_derivative(mu)/self.unit_variance(mu)
+        r = y - mu
+        temp = sigma_inv * (-dp * r + d2 * v * r + d2)
+        intercept = (coef.size == X.shape[1] + 1)
+        observed_information = _safe_sandwich_dot(X, temp,
+                                                  intercept=intercept)
+        return observed_information
+
+    def _eta_mu_score_fisher(self, coef, phi, X, y, weights, link,
+                             diag_fisher=False):
+        """Compute linear predictor, mean, score function and Fisher matrix.
+
+        It calculates the linear predictor, the mean, the score function
+        (derivative of the log-likelihood) and the Fisher information matrix
+        all in one go as a function of `coef` (:math:`w`) and the data.
+
+        Parameters
+        ----------
+        diag_fisher : boolean, optional (default=False)
+            If ``True``, returns only an array d such that
+            fisher = X.T @ np.diag(d) @ X.
+
+        Returns
+        -------
+        (eta, mu, score, fisher) : tuple with 4 elements
+            The 4 elements are:
+
+            * eta: ndarray, shape (X.shape[0],)
+            * mu: ndarray, shape (X.shape[0],)
+            * score: ndarray, shape (X.shape[1],)
+              (or (X.shape[1] + 1,) if an intercept is fitted)
+            * fisher:
+
+                * If diag_fisher is ``False``, the full Fisher matrix,
+                  an array of shape (X.shape[1], X.shape[1])
+                * If diag_fisher is ``True``, an array of shape (X.shape[0],)
+        """
+        intercept = (coef.size == X.shape[1] + 1)
+        # eta = linear predictor
+        eta = _safe_lin_pred(X, coef)
+        mu = link.inverse(eta)
+        sigma_inv = 1./self.variance(mu, phi=phi, weights=weights)
+        d1 = link.inverse_derivative(eta)  # = h'(eta)
+        # Alternatively:
+        # h'(eta) = h'(g(mu)) = 1/g'(mu), note that h is inverse of g
+        # d1 = 1./link.derivative(mu)
+        d1_sigma_inv = d1 * sigma_inv
+        temp = d1_sigma_inv * (y - mu)
+        if intercept:
+            score = np.concatenate(([temp.sum()], temp @ X))
+        else:
+            score = temp @ X
+
+        d2_sigma_inv = d1 * d1_sigma_inv
+        if diag_fisher:
+            fisher_matrix = d2_sigma_inv
+        else:
+            fisher_matrix = _safe_sandwich_dot(X, d2_sigma_inv,
+                                               intercept=intercept)
+        return eta, mu, score, fisher_matrix
+
+
+class TweedieDistribution(ExponentialDispersionModel):
+    r"""A class for the Tweedie distribution.
+
+    A Tweedie distribution with mean :math:`\mu=\mathrm{E}[Y]` is uniquely
+    defined by its mean-variance relationship
+    :math:`\mathrm{Var}[Y] \propto \mu^{power}`.
+
+    Special cases are:
+
+    ===== ================
+    Power Distribution
+    ===== ================
+    0     Normal
+    1     Poisson
+    (1,2) Compound Poisson
+    2     Gamma
+    3     Inverse Gaussian
+    ===== ================
+
+    Parameters
+    ----------
+    power : float (default=0)
+            The variance power of the `unit_variance`
+            :math:`v(\mu) = \mu^{power}`.
+            For ``0 < power < 1``, no distribution exists.
+    """
+    def __init__(self, power=0):
+        # validate power and set _upper_bound, _include_upper_bound attrs
+        self.power = power
+
+    @property
+    def power(self):
+        return self._power
+
+    @power.setter
+    def power(self, power):
+        if not isinstance(power, numbers.Real):
+            raise TypeError('power must be a real number, input was {0}'
+                            .format(power))
+
+        self._upper_bound = np.Inf
+        self._include_upper_bound = False
+        if power < 0:
+            # Extreme Stable
+            self._lower_bound = -np.Inf
+            self._include_lower_bound = False
+        elif power == 0:
+            # NormalDistribution
+            self._lower_bound = -np.Inf
+            self._include_lower_bound = False
+        elif (power > 0) and (power < 1):
+            raise ValueError('For 0 < power < 1, no distribution exists.')
+        elif power == 1:
+            # PoissonDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = True
+        elif (power > 1) and (power < 2):
+            # Compound Poisson
+            self._lower_bound = 0
+            self._include_lower_bound = True
+        elif power == 2:
+            # GammaDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif (power > 2) and (power < 3):
+            # Positive Stable
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif power == 3:
+            # InverseGaussianDistribution
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        elif power > 3:
+            # Positive Stable
+            self._lower_bound = 0
+            self._include_lower_bound = False
+        else:  # pragma: no cover
+            # this branch should be unreachable.
+            raise ValueError
+
+        self._power = power
+
+    def unit_variance(self, mu):
+        """Compute the unit variance of a Tweedie distribution
+        v(mu)=mu**power.
+
+        Parameters
+        ----------
+        mu : array, shape (n_samples,)
+            Predicted mean.
+        """
+        return np.power(mu, self.power)
+
+    def unit_variance_derivative(self, mu):
+        """Compute the derivative of the unit variance of a Tweedie
+        distribution v(mu)=power*mu**(power-1).
+
+        Parameters
+        ----------
+        mu : array, shape (n_samples,)
+            Predicted mean.
+        """
+        return self.power * np.power(mu, self.power - 1)
+
+    def unit_deviance(self, y, mu):
+        p = self.power
+        if p == 0:
+            # NormalDistribution
+            return (y - mu)**2
+        if p == 1:
+            # PoissonDistribution
+            # 2 * (y*log(y/mu) - y + mu), with y*log(y/mu)=0 if y=0
+            return 2 * (special.xlogy(y, y/mu) - y + mu)
+        elif p == 2:
+            # GammaDistribution
+            return 2 * (np.log(mu/y) + y/mu - 1)
+        else:
+            # return 2 * (np.maximum(y,0)**(2-p)/((1-p)*(2-p))
+            #     - y*mu**(1-p)/(1-p) + mu**(2-p)/(2-p))
+            return 2 * (np.power(np.maximum(y, 0), 2-p)/((1-p)*(2-p)) -
+                        y*np.power(mu, 1-p)/(1-p) + np.power(mu, 2-p)/(2-p))
+
+
+class NormalDistribution(TweedieDistribution):
+    """Class for the Normal (aka Gaussian) distribution"""
+    def __init__(self):
+        super(NormalDistribution, self).__init__(power=0)
+
+
+class PoissonDistribution(TweedieDistribution):
+    """Class for the scaled Poisson distribution"""
+    def __init__(self):
+        super(PoissonDistribution, self).__init__(power=1)
+
+
+class GammaDistribution(TweedieDistribution):
+    """Class for the Gamma distribution"""
+    def __init__(self):
+        super(GammaDistribution, self).__init__(power=2)
+
+
+class InverseGaussianDistribution(TweedieDistribution):
+    """Class for the scaled Inverse Gaussian distribution"""
+    def __init__(self):
+        super(InverseGaussianDistribution, self).__init__(power=3)
+
+
+class GeneralizedHyperbolicSecant(ExponentialDispersionModel):
+    """A class for the Generalized Hyperbolic Secant (GHS) distribution.
+
+    The GHS distribution is for targets y in (-inf, inf).
+    """
+    def __init__(self):
+        self._lower_bound = -np.Inf
+        self._upper_bound = np.Inf
+        self._include_lower_bound = False
+        self._include_upper_bound = False
+
+    def unit_variance(self, mu):
+        return 1 + mu**2
+
+    def unit_variance_derivative(self, mu):
+        return 2 * mu
+
+    def unit_deviance(self, y, mu):
+        return (2 * y * (np.arctan(y) - np.arctan(mu)) +
+                np.log((1 + mu**2)/(1 + y**2)))
+
+
+class BinomialDistribution(ExponentialDispersionModel):
+    """A class for the Binomial distribution.
+
+    The Binomial distribution is for targets y in [0, 1]. 
+ """ + def __init__(self): + self._lower_bound = 0 + self._upper_bound = 1 + self._include_lower_bound = True + self._include_upper_bound = True + + def unit_variance(self, mu): + return mu * (1 - mu) + + def unit_variance_derivative(self, mu): + return 1 - 2 * mu + + def unit_deviance(self, y, mu): + return 2 * (special.xlogy(y, y/mu) + special.xlogy(1-y, (1-y)/(1-mu))) + + +def _irls_step(X, W, P2, z, fit_intercept=True): + """Compute one step in iteratively reweighted least squares. + + Solve A w = b for w with + A = (X' W X + P2) + b = X' W z + z = eta + D^-1 (y-mu) + + See also fit method of :class:`GeneralizedLinearRegressor`. + + Parameters + ---------- + X : {ndarray, sparse matrix}, shape (n_samples, n_features) + Training data (with intercept included if present) + + W : ndarray, shape (n_samples,) + + P2 : {ndarray, sparse matrix}, shape (n_features, n_features) + The L2-penalty matrix or vector (=diagonal matrix) + + z : ndarray, shape (n_samples,) + Working observations + + fit_intercept : boolean, optional (default=True) + + Returns + ------- + coef : ndarray, shape (c,) + If fit_intercept=False, shape c=X.shape[1]. + If fit_intercept=True, then c=X.shapee[1] + 1. + """ + # Note: solve vs least squares, what is more appropriate? + # scipy.linalg.solve seems faster, but scipy.linalg.lstsq + # is more robust. + # Note: X.T @ W @ X is not sparse, even when X is sparse. + # Sparse solver would splinalg.spsolve(A, b) or splinalg.lsmr(A, b) + if fit_intercept: + Wz = W * z + if sparse.issparse(X): + b = np.concatenate(([Wz.sum()], X.transpose() @ Wz)) + else: + b = np.concatenate(([Wz.sum()], X.T @ Wz)) + A = _safe_sandwich_dot(X, W, intercept=fit_intercept) + if P2.ndim == 1: + idx = np.arange(start=1, stop=A.shape[0]) + A[(idx, idx)] += P2 # add to diag elements without intercept + elif sparse.issparse(P2): + A[1:, 1:] += P2.toarray() + else: + A[1:, 1:] += P2 + else: + if sparse.issparse(X): + XtW = X.transpose().multiply(W) + # for older versions of numpy and scipy, A may be a np.matrix + A = _safe_toarray(XtW @ X) + else: + XtW = (X.T * W) + A = XtW @ X + b = XtW @ z + if P2.ndim == 1: + A[np.diag_indices_from(A)] += P2 + elif sparse.issparse(P2): + A += P2.toarray() + else: + A += P2 + + coef, *_ = linalg.lstsq(A, b, overwrite_a=True, overwrite_b=True) + return coef + + +def _irls_solver(coef, X, y, weights, P2, fit_intercept, family, link, + max_iter, tol): + """Solve GLM with L2 penalty by IRLS algorithm. + + Note: If X is sparse, P2 must also be sparse. + """ + # Solve Newton-Raphson (1): Obj'' (w - w_old) = -Obj' + # Obj = objective function = 1/2 Dev + l2/2 w P2 w + # Dev = deviance, s = normalized weights, variance V(mu) but phi=1 + # D = link.inverse_derivative(eta) = diag_matrix(h'(X w)) + # D2 = link.inverse_derivative(eta)^2 = D^2 + # W = D2/V(mu) + # l2 = alpha * (1 - l1_ratio) + # Obj' = d(Obj)/d(w) = 1/2 Dev' + l2 P2 w + # = -X' D (y-mu)/V(mu) + l2 P2 w + # Obj''= d2(Obj)/d(w)d(w') = Hessian = -X'(...) X + l2 P2 + # Use Fisher matrix instead of full info matrix -X'(...) X, + # i.e. E[Dev''] with E[y-mu]=0: + # Obj'' ~ X' W X + l2 P2 + # (1): w = (X' W X + l2 P2)^-1 X' W z, + # with z = eta + D^-1 (y-mu) + # Note: P2 must be symmetrized + # Note: ' denotes derivative, but also transpose for matrices + + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + # D = h'(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + + converged = False + n_iter = 0 + while n_iter < max_iter: + n_iter += 1 + # coef_old not used so far. 
+ # coef_old = coef + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = hp**2 / V + # working observations + z = eta + (y - mu) / hp + # solve A*coef = b + # A = X' W X + P2, b = X' W z + coef = _irls_step(X, W, P2, z, fit_intercept=fit_intercept) + # updated linear predictor + # do it here for updated values for tolerance + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + hp = link.inverse_derivative(eta) + V = family.variance(mu, phi=1, weights=weights) + + # which tolerace? |coef - coef_old| or gradient? + # use gradient for compliance with newton-cg and lbfgs + # gradient = -X' D (y-mu)/V(mu) + l2 P2 w + temp = hp * (y - mu) / V + if sparse.issparse(X): + gradient = -(X.transpose() @ temp) + else: + gradient = -(X.T @ temp) + idx = 1 if fit_intercept else 0 # offset if coef[0] is intercept + if P2.ndim == 1: + gradient += P2 * coef[idx:] + else: + gradient += P2 @ coef[idx:] + if fit_intercept: + gradient = np.concatenate(([-temp.sum()], gradient)) + if (np.max(np.abs(gradient)) <= tol): + converged = True + break + + if not converged: + warnings.warn("irls failed to converge. Increase the number " + "of iterations (currently {0})" + .format(max_iter), ConvergenceWarning) + + return coef, n_iter + + +def _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=1000, selection='cyclic', + random_state=None, diag_fisher=False): + """Compute inner loop of coordinate descent, i.e. cycles through features. + + Minimization of 1-d subproblems:: + + min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + ||P1_j (w_j+d_j+z)||_1 + + A = f'(w) + d*H(w) + (w+d)*P2 + B = H+P2 + Note: f'=-score and H=fisher are updated at the end of outer iteration. + """ + # TODO: use sparsity (coefficient already 0 due to L1 penalty) + # => active set of features for featurelist, see paper + # of Improved GLMNET or Gap Safe Screening Rules + # https://arxiv.org/abs/1611.05780 + n_samples, n_features = X.shape + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + B = fisher + if P2.ndim == 1: + coef_P2 = coef[idx:] * P2 + if not diag_fisher: + idiag = np.arange(start=idx, stop=B.shape[0]) + # B[np.diag_indices_from(B)] += P2 + B[(idiag, idiag)] += P2 + else: + coef_P2 = coef[idx:] @ P2 + if not diag_fisher: + if sparse.issparse(P2): + B[idx:, idx:] += P2.toarray() + else: + B[idx:, idx:] += P2 + A = -score + A[idx:] += coef_P2 + # A += d @ (H+P2) but so far d=0 + # inner loop + for inner_iter in range(1, max_inner_iter+1): + inner_iter += 1 + n_cycles += 1 + # cycle through features, update intercept separately at the end + if selection == 'random': + featurelist = random_state.permutation(n_features) + else: + featurelist = np.arange(n_features) + for j in featurelist: + # minimize_z: a z + 1/2 b z^2 + c |d+z| + # a = A_j + # b = B_jj > 0 + # c = |P1_j| = P1_j > 0, see 1.3 + # d = w_j + d_j + # cf. https://arxiv.org/abs/0708.1485 Eqs. 
(3) - (4) + # with beta = z+d, beta_hat = d-a/b and gamma = c/b + # z = 1/b * S(bd-a,c) - d + # S(a,b) = sign(a) max(|a|-b, 0) soft thresholding + jdx = j+idx # index for arrays containing entries for intercept + a = A[jdx] + if diag_fisher: + # Note: fisher is ndarray of shape (n_samples,) => no idx + # Calculate Bj = B[j, :] = B[:, j] as it is needed later anyway + Bj = np.zeros_like(A) + if intercept: + Bj[0] = fisher.sum() + if sparse.issparse(X): + Bj[idx:] = _safe_toarray(X[:, j].transpose() @ + X.multiply(fisher[:, np.newaxis]) + ).ravel() + else: + Bj[idx:] = (fisher * X[:, j]) @ X + + if P2.ndim == 1: + Bj[idx:] += P2[j] + else: + if sparse.issparse(P2): + # slice columns as P2 is csc + Bj[idx:] += P2[:, j].toarray().ravel() + else: + Bj[idx:] += P2[:, j] + b = Bj[jdx] + else: + b = B[jdx, jdx] + + # those ten lines are what it is all about + if b <= 0: + z = 0 + elif P1[j] == 0: + z = -a/b + elif a + P1[j] < b * (coef[jdx] + d[jdx]): + z = -(a + P1[j])/b + elif a - P1[j] > b * (coef[jdx] + d[jdx]): + z = -(a - P1[j])/b + else: + z = -(coef[jdx] + d[jdx]) + + # update direction d + d[jdx] += z + # update A because d_j is now d_j+z + # A = f'(w) + d*H(w) + (w+d)*P2 + # => A += (H+P2)*e_j z = B_j * z + # Note: B is symmetric B = B.transpose + if diag_fisher: + # Bj = B[:, j] calculated above, still valid + A += Bj * z + else: + # B is symmetric, C- or F-contiguous, but never sparse + if B.flags['F_CONTIGUOUS']: + # slice columns like for sparse csc + A += B[:, jdx] * z + else: # B.flags['C_CONTIGUOUS'] might be true + # slice rows + A += B[jdx, :] * z + # end of cycle over features + # update intercept + if intercept: + if diag_fisher: + Bj = np.zeros_like(A) + Bj[0] = fisher.sum() + Bj[1:] = fisher @ X + b = Bj[0] + else: + b = B[0, 0] + z = 0 if b <= 0 else -A[0]/b + d[0] += z + if diag_fisher: + A += Bj * z + else: + if B.flags['F_CONTIGUOUS']: + A += B[:, 0] * z + else: + A += B[0, :] * z + # end of complete cycle + # stopping criterion for inner loop + # sum_i(|minimum of norm of subgrad of q(d)_i|) + # subgrad q(d) = A + subgrad ||P1*(w+d)||_1 + mn_subgrad = _min_norm_sugrad(coef=coef + d, grad=A, P2=None, P1=P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= inner_tol: + if inner_iter == 1: + inner_tol = inner_tol/4. + break + # end of inner loop + return d, coef_P2, n_cycles, inner_tol + + +def _cd_solver(coef, X, y, weights, P1, P2, fit_intercept, family, link, + max_iter=100, max_inner_iter=1000, tol=1e-4, + selection='cyclic ', random_state=None, + diag_fisher=False, copy_X=True): + """Solve GLM with L1 and L2 penalty by coordinate descent algorithm. + + The objective being minimized in the coefficients w=coef is:: + + F = f + g, f(w) = 1/2 deviance, g = 1/2 w*P2*w + ||P1*w||_1 + + An Improved GLMNET for L1-regularized Logistic Regression: + + 1. Find optimal descent direction d by minimizing + min_d F(w+d) = min_d F(w+d) - F(w) + 2. Quadratic approximation of F(w+d)-F(w) = q(d): + using f(w+d) = f(w) + f'(w)*d + 1/2 d*H(w)*d + O(d^3) gives: + q(d) = (f'(w) + w*P2)*d + 1/2 d*(H(w)+P2)*d + + ||P1*(w+d)||_1 - ||P1*w||_1 + Then minimize q(d): min_d q(d) + 3. Coordinate descent by updating coordinate j (d -> d+z*e_j): + min_z q(d+z*e_j) + = min_z q(d+z*e_j) - q(d) + = min_z A_j z + 1/2 B_jj z^2 + + ||P1_j (w_j+d_j+z)||_1 - ||P1_j (w_j+d_j)||_1 + A = f'(w) + d*H(w) + (w+d)*P2 + B = H + P2 + + Repeat steps 1-3 until convergence. + Note: Use Fisher matrix instead of Hessian for H. 
+    Note: f' = -score, H = Fisher matrix
+
+    Parameters
+    ----------
+    coef : ndarray, shape (c,)
+        If fit_intercept=False, shape c=X.shape[1].
+        If fit_intercept=True, then c=X.shape[1] + 1.
+
+    X : {ndarray, csc sparse matrix}, shape (n_samples, n_features)
+        Training data (with intercept included if present). If not sparse,
+        pass directly as Fortran-contiguous data to avoid
+        unnecessary memory duplication.
+
+    y : ndarray, shape (n_samples,)
+        Target values.
+
+    weights : ndarray, shape (n_samples,)
+        Sample weights with which the deviance is weighted. The weights must
+        be normalized and sum to 1.
+
+    P1 : {ndarray}, shape (n_features,)
+        The L1-penalty vector (=diagonal matrix)
+
+    P2 : {ndarray, csc sparse matrix}, shape (n_features, n_features)
+        The L2-penalty matrix or vector (=diagonal matrix). If a matrix is
+        passed, it must be symmetric. If X is sparse, P2 must also be sparse.
+
+    fit_intercept : boolean, optional (default=True)
+        Specifies if a constant (a.k.a. bias or intercept) should be
+        added to the linear predictor (X*coef+intercept).
+
+    family : ExponentialDispersionModel
+
+    link : Link
+
+    max_iter : int, optional (default=100)
+        Maximum number of outer (Newton) iterations.
+
+    max_inner_iter : int, optional (default=1000)
+        Maximum number of iterations in each inner loop, i.e. max number of
+        cycles over all features per inner loop.
+
+    tol : float, optional (default=1e-4)
+        Convergence criterion is
+        sum_i(|minimum of norm of subgrad of objective_i|)<=tol.
+
+    selection : str, optional (default='cyclic')
+        If 'random', randomly choose features in inner loop.
+
+    random_state : {int, RandomState instance, None}, optional (default=None)
+
+    diag_fisher : boolean, optional (default=False)
+        ``False`` calculates the full fisher matrix, ``True`` only a diagonal
+        matrix s.t. fisher = X.T @ diag @ X. This saves storage but needs
+        more matrix-vector multiplications.
+
+    copy_X : boolean, optional (default=True)
+        If ``True``, X will be copied; else, it may be overwritten.
+
+    Returns
+    -------
+    coef : ndarray, shape (c,)
+        If fit_intercept=False, shape c=X.shape[1].
+        If fit_intercept=True, then c=X.shape[1] + 1.
+
+    n_iter : number of outer iterations = Newton iterations
+
+    n_cycles : number of cycles over features
+
+    References
+    ----------
+    Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin
+    An Improved GLMNET for L1-regularized Logistic Regression,
+    Journal of Machine Learning Research 13 (2012) 1999-2030
+    https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf
+    """
+    X = check_array(X, 'csc', dtype=[np.float64, np.float32],
+                    order='F', copy=copy_X)
+    if P2.ndim == 2:
+        P2 = check_array(P2, 'csc', dtype=[np.float64, np.float32],
+                         order='F', copy=copy_X)
+    if sparse.issparse(X):
+        if not sparse.isspmatrix_csc(P2):
+            raise ValueError("If X is sparse, P2 must also be sparse csc "
+                             "format. Got P2 not sparse.")
+    random_state = check_random_state(random_state)
+    # Note: we already set P2 = l2*P2, P1 = l1*P1
+    # Note: we already symmetrized P2 = 1/2 (P2 + P2')
+    n_iter = 0  # number of outer iterations
+    n_cycles = 0  # number of (complete) cycles over features
+    converged = False
+    n_samples, n_features = X.shape
+    idx = 1 if fit_intercept else 0  # offset if coef[0] is intercept
+    # line search parameters
+    (beta, sigma) = (0.5, 0.01)
+    # some precalculations
+    # Note: for diag_fisher=False, fisher is the full matrix X.T @ W @ X;
+    # for diag_fisher=True, fisher is a 1d array d representing the diagonal
+    # W, such that the full matrix would be X.T @ np.diag(d) @ X.
+ eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # set up space for search direction d for inner loop + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) + inner_tol = linalg.norm(inner_tol, ord=1) + # outer loop + while n_iter < max_iter: + n_iter += 1 + # initialize search direction d (to be optimized) with zero + d.fill(0) + # inner loop = _cd_cycle + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, inner_tol, + max_inner_iter=max_inner_iter, selection=selection, + random_state=random_state, diag_fisher=diag_fisher) + # line search by sequence beta^k, k=0, 1, .. + # F(w + lambda d) - F(w) <= lambda * bound + # bound = sigma * (f'(w)*d + w*P2*d + # +||P1 (w+d)||_1 - ||P1 w||_1) + P1w_1 = linalg.norm(P1 * coef[idx:], ord=1) + P1wd_1 = linalg.norm(P1 * (coef + d)[idx:], ord=1) + # Note: coef_P2 already calculated and still valid + bound = sigma * (-(score @ d) + coef_P2 @ d[idx:] + P1wd_1 - P1w_1) + Fw = (0.5 * family.deviance(y, mu, weights) + + 0.5 * (coef_P2 @ coef[idx:]) + P1w_1) + la = 1./beta + for k in range(20): + la *= beta # starts with la=1 + coef_wd = coef + la * d + mu_wd = link.inverse(_safe_lin_pred(X, coef_wd)) + Fwd = (0.5 * family.deviance(y, mu_wd, weights) + + linalg.norm(P1 * coef_wd[idx:], ord=1)) + if P2.ndim == 1: + Fwd += 0.5 * ((coef_wd[idx:] * P2) @ coef_wd[idx:]) + else: + Fwd += 0.5 * (coef_wd[idx:] @ (P2 @ coef_wd[idx:])) + if Fwd - Fw <= sigma * la * bound: + break + # update coefficients + coef += la * d + # calculate eta, mu, score, Fisher matrix for next iteration + eta, mu, score, fisher = family._eta_mu_score_fisher( + coef=coef, phi=1, X=X, y=y, weights=weights, link=link, + diag_fisher=diag_fisher) + # stopping criterion for outer loop + # sum_i(|minimum-norm of subgrad of F(w)_i|) + # fp_wP2 = f'(w) + w*P2 + # Note: eta, mu and score are already updated + mn_subgrad = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, P1=P1) + mn_subgrad = linalg.norm(mn_subgrad, ord=1) + if mn_subgrad <= tol: + converged = True + break + # end of outer loop + if not converged: + warnings.warn("Coordinate descent failed to converge. Increase" + " the maximum number of iterations max_iter" + " (currently {0})".format(max_iter), ConvergenceWarning) + + return coef, n_iter, n_cycles + + +class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin): + """Regression via a Generalized Linear Model (GLM) with penalties. + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). Therefore, + the fit minimizes the following objective function with combined L1 and L2 + priors as regularizer:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + + alpha * l1_ratio * ||P1*w||_1 + + 1/2 * alpha * (1 - l1_ratio) * w*P2*w + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). 
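+
+    As a concrete reading of the objective (worked example added for
+    illustration): with ``alpha=1`` and ``l1_ratio=0.5``, the penalty term
+    equals ``0.5 * ||P1*w||_1 + 0.25 * (w*P2*w)``, because
+    alpha*l1_ratio = 0.5 and alpha*(1-l1_ratio)/2 = 0.25.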
+ For ``P1=P2='identity'``, the penalty is the elastic net:: + + alpha * l1_ratio * ||w||_1 + + 1/2 * alpha * (1 - l1_ratio) * ||w||_2^2 + + If you are interested in controlling the L1 and L2 penalties + separately, keep in mind that this is equivalent to:: + + a * L1 + b * L2 + + where:: + + alpha = a + b and l1_ratio = a / (a + b) + + The parameter ``l1_ratio`` corresponds to alpha in the R package glmnet, + while ``alpha`` corresponds to the lambda parameter in glmnet. + Specifically, l1_ratio = 1 is the lasso penalty. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms and thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + l1_ratio : float, optional (default=0) + The elastic net mixing parameter, with ``0 <= l1_ratio <= 1``. For + ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it + is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a + combination of L1 and L2. + + P1 : {'identity', array-like}, shape (n_features,), optional \ + (default='identity') + With this array, you can exclude coefficients from the L1 penalty. + Set the corresponding value to 1 (include) or 0 (exclude). The + default value ``'identity'`` is the same as a 1d array of ones. + Note that n_features = X.shape[1]. + + P2 : {'identity', array-like, sparse matrix}, shape \ + (n_features,) or (n_features, n_features), optional \ + (default='identity') + With this option, you can set the P2 matrix in the L2 penalty `w*P2*w`. + This gives a fine control over this penalty (Tikhonov regularization). + A 2d array is directly used as the square matrix P2. A 1d array is + interpreted as diagonal (square) matrix. The default 'identity' sets + the identity matrix, which gives the usual squared L2-norm. If you just + want to exclude certain coefficients, pass a 1d array filled with 1, + and 0 for the coefficients to be excluded. + Note that P2 must be positive semi-definite. + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + family : {'normal', 'poisson', 'gamma', 'inverse.gaussian', 'binomial'} \ + or an instance of class ExponentialDispersionModel, \ + optional(default='normal') + The distributional assumption of the GLM, i.e. which distribution from + the EDM, specifies the loss function to be minimized. + + link : {'auto', 'identity', 'log', 'logit'} or an instance of class Link, \ + optional (default='auto') + The link function of the GLM, i.e. mapping from linear predictor + (X*coef) to expectation (mu). Option 'auto' sets the link depending on + the chosen family as follows: + + - 'identity' for family 'normal' + + - 'log' for families 'poisson', 'gamma', 'inverse.gaussian' + + - 'logit' for family 'binomial' + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statistic or the deviance statistic. If None, the + dispersion is not estimated. + + solver : {'auto', 'cd', 'irls', 'lbfgs', 'newton-cg'}, \ + optional (default='auto') + Algorithm to use in the optimization problem: + + 'auto' + Sets 'irls' if l1_ratio equals 0, else 'cd'. 
+
+        'cd'
+            Coordinate descent algorithm. It can deal with L1 as well as L2
+            penalties. Note that in order to avoid unnecessary memory
+            duplication of X in the ``fit`` method, X should be directly
+            passed as a Fortran-contiguous numpy array or sparse csc matrix.
+
+        'irls'
+            Iterated reweighted least squares.
+            It is the standard algorithm for GLMs. It cannot deal with
+            L1 penalties.
+
+        'lbfgs'
+            Calls scipy's L-BFGS-B optimizer. It cannot deal with L1
+            penalties.
+
+        'newton-cg'
+            Newton conjugate gradient algorithm. It cannot deal with L1
+            penalties.
+
+        Note that all solvers except lbfgs use the fisher matrix, i.e. the
+        expected Hessian instead of the Hessian matrix.
+
+    max_iter : int, optional (default=100)
+        The maximal number of iterations for solver algorithms.
+
+    tol : float, optional (default=1e-4)
+        Stopping criterion. For the irls, newton-cg and lbfgs solvers,
+        the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol``
+        where ``g_i`` is the i-th component of the gradient (derivative) of
+        the objective function. For the cd solver, convergence is reached
+        when ``sum_i(|minimum-norm of g_i|) <= tol``, where ``g_i`` is the
+        subgradient of the objective and the minimum-norm of ``g_i`` is the
+        element of the subgradient ``g_i`` with the smallest L2-norm.
+
+    warm_start : boolean, optional (default=False)
+        If set to ``True``, reuse the solution of the previous call to ``fit``
+        as initialization for ``coef_`` and ``intercept_`` (supersedes option
+        ``start_params``). If the attribute ``coef_`` does not yet exist
+        (first call to ``fit``) or if ``warm_start=False``, option
+        ``start_params`` sets the start values for ``coef_`` and
+        ``intercept_``.
+
+    start_params : {'guess', 'zero', array of shape (n_features*, )}, \
+            optional (default='guess')
+        Relevant only if ``warm_start=False`` or if fit is called
+        the first time (``self.coef_`` does not yet exist).
+
+        'guess'
+            Start values of mu are calculated by family.starting_mu(..).
+            Then, one Newton step obtains start values for ``coef_``. If
+            ``solver='irls'``, it uses one irls step, else the Newton step is
+            calculated by the cd solver.
+            This usually gives good starting values.
+
+        'zero'
+            All coefficients are set to zero. If ``fit_intercept=True``, the
+            start value for the intercept is obtained by the weighted
+            average of y.
+
+        array
+            The array of size n_features* is directly used as start values
+            for ``coef_``. If ``fit_intercept=True``, the first element
+            is assumed to be the start value for the ``intercept_``.
+            Note that n_features* = X.shape[1] + fit_intercept, i.e. it
+            includes the intercept in counting.
+
+    selection : str, optional (default='cyclic')
+        For the solver 'cd' (coordinate descent), the coordinates (features)
+        can be updated in either cyclic or random order.
+        If set to 'random', a random coefficient is updated every iteration
+        rather than looping over features sequentially in the same order.
+        This often leads to significantly faster convergence, especially
+        when tol is higher than 1e-4.
+
+    random_state : {int, RandomState instance, None}, optional (default=None)
+        The seed of the pseudo random number generator that selects a random
+        feature to be updated for solver 'cd' (coordinate descent).
+        If int, random_state is the seed used by the random
+        number generator; if RandomState instance, random_state is the random
+        number generator; if None, the random number generator is the
+        RandomState instance used by `np.random`. Used when ``selection`` ==
+        'random'.
+
+    diag_fisher : boolean, optional (default=False)
+        Only relevant for solver 'cd' (see also ``start_params='guess'``).
+        If ``False``, the full Fisher matrix (expected Hessian) is computed
+        in each outer iteration (Newton iteration). If ``True``, only a
+        diagonal matrix (stored as 1d array) is computed, such that
+        fisher = X.T @ diag @ X. This saves memory and matrix-matrix
+        multiplications, but needs more matrix-vector multiplications. If you
+        use large sparse X or if you have many features,
+        i.e. n_features >> n_samples, you might set this option to ``True``.
+
+    copy_X : boolean, optional (default=True)
+        If ``True``, X will be copied; else, it may be overwritten.
+
+    check_input : boolean, optional (default=True)
+        Allow to bypass several checks on input: y values in range of family,
+        sample_weight non-negative, P2 positive semi-definite.
+        Don't use this parameter unless you know what you are doing.
+
+    verbose : int, optional (default=0)
+        For the lbfgs solver set verbose to any positive number for
+        verbosity.
+
+    Attributes
+    ----------
+    coef_ : array, shape (n_features,)
+        Estimated coefficients for the linear predictor (X*coef_+intercept_)
+        in the GLM.
+
+    intercept_ : float
+        Intercept (a.k.a. bias) added to linear predictor.
+
+    dispersion_ : float
+        The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set.
+
+    n_iter_ : int
+        Actual number of iterations used in solver.
+
+    Notes
+    -----
+    The fit itself does not need Y to be from an EDM, but only assumes
+    the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and
+    :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function
+    :math:`v(\\mu_i)` is a property of and given by the specific EDM, see
+    :ref:`User Guide <Generalized_linear_regression>`.
+
+    The parameters :math:`w` (`coef_` and `intercept_`) are estimated by
+    minimizing the deviance plus penalty term, which is equivalent to
+    (penalized) maximum likelihood estimation.
+
+    For alpha > 0, the feature matrix X should be standardized in order to
+    penalize features equally strongly. Call
+    :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``.
+
+    If the target y is a ratio, appropriate sample weights s should be
+    provided.
+    As an example, consider Poisson distributed counts z (integers) and
+    weights s=exposure (time, money, person years, ...). Then you fit
+    y = z/s, i.e. ``GeneralizedLinearRegressor(family='poisson').fit(X, y,
+    sample_weight=s)``. The weights are necessary for the right (finite
+    sample) mean.
+    Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`;
+    in this case one might say that y has a 'scaled' Poisson distribution.
+    The same holds for other distributions.
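+
+    A sketch of this exposure pattern (illustrative only; X stands for any
+    feature matrix with three rows):
+
+        >>> import numpy as np
+        >>> z = np.array([2., 0., 5.])   # observed counts
+        >>> s = np.array([1., 2., 4.])   # exposure of each observation
+        >>> y = z / s                    # frequencies to be modeled
+        >>> # GeneralizedLinearRegressor(family='poisson').fit(X, y,
+        >>> #                                                  sample_weight=s)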
+ + References + ---------- + For the coordinate descent implementation: + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + def __init__(self, alpha=1.0, l1_ratio=0, P1='identity', P2='identity', + fit_intercept=True, family='normal', link='auto', + fit_dispersion=None, solver='auto', max_iter=100, + tol=1e-4, warm_start=False, start_params='guess', + selection='cyclic', random_state=None, diag_fisher=False, + copy_X=True, check_input=True, verbose=0): + self.alpha = alpha + self.l1_ratio = l1_ratio + self.P1 = P1 + self.P2 = P2 + self.fit_intercept = fit_intercept + self.family = family + self.link = link + self.fit_dispersion = fit_dispersion + self.solver = solver + self.max_iter = max_iter + self.tol = tol + self.warm_start = warm_start + self.start_params = start_params + self.selection = selection + self.random_state = random_state + self.diag_fisher = diag_fisher + self.copy_X = copy_X + self.check_input = check_input + self.verbose = verbose + + def fit(self, X, y, sample_weight=None): + """Fit a Generalized Linear Model. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + + y : array-like, shape (n_samples,) + Target values. + + sample_weight : {None, array-like}, shape (n_samples,),\ + optional (default=None) + Individual weights w_i for each sample. Note that for an + Exponential Dispersion Model (EDM), one has + Var[Y_i]=phi/w_i * v(mu). + If Y_i ~ EDM(mu, phi/w_i), then + sum(w*Y)/sum(w) ~ EDM(mu, phi/sum(w)), i.e. the mean of y is a + weighted average with weights=sample_weight. + + Returns + ------- + self : returns an instance of self. + """ + ####################################################################### + # 1. 
input validation # + ####################################################################### + # 1.1 validate arguments of __init__ ################################## + # Guarantee that self._family_instance is an instance of class + # ExponentialDispersionModel + if isinstance(self.family, ExponentialDispersionModel): + self._family_instance = self.family + else: + if self.family == 'normal': + self._family_instance = NormalDistribution() + elif self.family == 'poisson': + self._family_instance = PoissonDistribution() + elif self.family == 'gamma': + self._family_instance = GammaDistribution() + elif self.family == 'inverse.gaussian': + self._family_instance = InverseGaussianDistribution() + elif self.family == 'binomial': + self._family_instance = BinomialDistribution() + else: + raise ValueError( + "The family must be an instance of class" + " ExponentialDispersionModel or an element of" + " ['normal', 'poisson', 'gamma', 'inverse.gaussian', " + "'binomial']; got (family={0})".format(self.family)) + + # Guarantee that self._link_instance is set to an instance of + # class Link + if isinstance(self.link, Link): + self._link_instance = self.link + else: + if self.link == 'auto': + if isinstance(self._family_instance, TweedieDistribution): + if self._family_instance.power <= 0: + self._link_instance = IdentityLink() + if self._family_instance.power >= 1: + self._link_instance = LogLink() + elif isinstance(self._family_instance, + GeneralizedHyperbolicSecant): + self._link_instance = IdentityLink() + elif isinstance(self._family_instance, BinomialDistribution): + self._link_instance = LogitLink() + else: + raise ValueError("No default link known for the " + "specified distribution family. Please " + "set link manually, i.e. not to 'auto'; " + "got (link='auto', family={}" + .format(self.family)) + elif self.link == 'identity': + self._link_instance = IdentityLink() + elif self.link == 'log': + self._link_instance = LogLink() + elif self.link == 'logit': + self._link_instance = LogitLink() + else: + raise ValueError( + "The link must be an instance of class Link or " + "an element of ['auto', 'identity', 'log', 'logit']; " + "got (link={0})".format(self.link)) + + if not isinstance(self.alpha, numbers.Number) or self.alpha < 0: + raise ValueError("Penalty term must be a non-negative number;" + " got (alpha={0})".format(self.alpha)) + if (not isinstance(self.l1_ratio, numbers.Number) or + self.l1_ratio < 0 or self.l1_ratio > 1): + raise ValueError("l1_ratio must be a number in interval [0, 1];" + " got (l1_ratio={0})".format(self.l1_ratio)) + if not isinstance(self.fit_intercept, bool): + raise ValueError("The argument fit_intercept must be bool;" + " got {0}".format(self.fit_intercept)) + if self.solver not in ['auto', 'irls', 'lbfgs', 'newton-cg', 'cd']: + raise ValueError("GeneralizedLinearRegressor supports only solvers" + " 'auto', 'irls', 'lbfgs', 'newton-cg' and 'cd';" + " got {0}".format(self.solver)) + solver = self.solver + if self.solver == 'auto': + if self.l1_ratio == 0: + solver = 'irls' + else: + solver = 'cd' + if (self.alpha > 0 and self.l1_ratio > 0 and solver not in ['cd']): + raise ValueError("The chosen solver (solver={0}) can't deal " + "with L1 penalties, which are included with " + "(alpha={1}) and (l1_ratio={2})." 
+ .format(solver, self.alpha, self.l1_ratio)) + if (not isinstance(self.max_iter, int) + or self.max_iter <= 0): + raise ValueError("Maximum number of iteration must be a positive " + "integer;" + " got (max_iter={0!r})".format(self.max_iter)) + if not isinstance(self.tol, numbers.Number) or self.tol <= 0: + raise ValueError("Tolerance for stopping criteria must be " + "positive; got (tol={0!r})".format(self.tol)) + if not isinstance(self.warm_start, bool): + raise ValueError("The argument warm_start must be bool;" + " got {0}".format(self.warm_start)) + if self.selection not in ['cyclic', 'random']: + raise ValueError("The argument selection must be 'cyclic' or " + "'random'; got (selection={0})" + .format(self.selection)) + random_state = check_random_state(self.random_state) + if not isinstance(self.diag_fisher, bool): + raise ValueError("The argument diag_fisher must be bool;" + " got {0}".format(self.diag_fisher)) + if not isinstance(self.copy_X, bool): + raise ValueError("The argument copy_X must be bool;" + " got {0}".format(self.copy_X)) + if not isinstance(self.check_input, bool): + raise ValueError("The argument check_input must be bool; got " + "(check_input={0})".format(self.check_input)) + + family = self._family_instance + link = self._link_instance + + # 1.2 validate arguments of fit ####################################### + _dtype = [np.float64, np.float32] + if solver == 'cd': + _stype = ['csc'] + else: + _stype = ['csc', 'csr'] + X, y = check_X_y(X, y, accept_sparse=_stype, + dtype=_dtype, y_numeric=True, multi_output=False, + copy=self.copy_X) + # Without converting y to float, deviance might raise + # ValueError: Integers to negative integer powers are not allowed. + # Also, y must not be sparse. + y = np.asarray(y, dtype=np.float64) + + weights = _check_weights(sample_weight, y.shape[0]) + + n_samples, n_features = X.shape + + # 1.3 arguments to take special care ################################## + # P1, P2, start_params + if isinstance(self.P1, str) and self.P1 == 'identity': + P1 = np.ones(n_features) + else: + P1 = np.atleast_1d(self.P1) + try: + P1 = P1.astype(np.float64, casting='safe', copy=False) + except TypeError: + raise TypeError("The given P1 cannot be converted to a numeric" + "array; got (P1.dtype={0})." + .format(P1.dtype)) + if (P1.ndim != 1) or (P1.shape[0] != n_features): + raise ValueError("P1 must be either 'identity' or a 1d array " + "with the length of X.shape[1]; " + "got (P1.shape[0]={0}), " + "needed (X.shape[1]={1})." + .format(P1.shape[0], n_features)) + # If X is sparse, make P2 sparse, too. 
+ if isinstance(self.P2, str) and self.P2 == 'identity': + if sparse.issparse(X): + P2 = (sparse.dia_matrix((np.ones(n_features), 0), + shape=(n_features, n_features))).tocsc() + else: + P2 = np.ones(n_features) + else: + P2 = check_array(self.P2, copy=True, + accept_sparse=_stype, + dtype=_dtype, ensure_2d=False) + if P2.ndim == 1: + P2 = np.asarray(P2) + if P2.shape[0] != n_features: + raise ValueError("P2 should be a 1d array of shape " + "(n_features,) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0},)), needed ({1},)" + .format(P2.shape[0], X.shape[1])) + if sparse.issparse(X): + P2 = (sparse.dia_matrix((P2, 0), + shape=(n_features, n_features))).tocsc() + elif (P2.ndim == 2 and P2.shape[0] == P2.shape[1] and + P2.shape[0] == X.shape[1]): + if sparse.issparse(X): + P2 = sparse.csc_matrix(P2) + else: + raise ValueError("P2 must be either None or an array of shape " + "(n_features, n_features) with " + "n_features=X.shape[1]; " + "got (P2.shape=({0}, {1})), needed ({2}, {2})" + .format(P2.shape[0], P2.shape[1], X.shape[1])) + + start_params = self.start_params + if isinstance(start_params, str): + if start_params not in ['guess', 'zero']: + raise ValueError("The argument start_params must be 'guess', " + "'zero' or an array of correct length; " + "got(start_params={0})".format(start_params)) + else: + start_params = check_array(start_params, accept_sparse=False, + force_all_finite=True, ensure_2d=False, + dtype=_dtype, copy=True) + if ((start_params.shape[0] != X.shape[1] + self.fit_intercept) or + (start_params.ndim != 1)): + raise ValueError("Start values for parameters must have the" + "right length and dimension; required (length" + "={0}, ndim=1); got (length={1}, ndim={2})." + .format(X.shape[1] + self.fit_intercept, + start_params.shape[0], + start_params.ndim)) + + l1 = self.alpha * self.l1_ratio + l2 = self.alpha * (1 - self.l1_ratio) + # P1 and P2 are now for sure copies + P1 = l1 * P1 + P2 = l2 * P2 + # one only ever needs the symmetrized L2 penalty matrix 1/2 (P2 + P2') + # reason: w' P2 w = (w' P2 w)', i.e. it is symmetric + if P2.ndim == 2: + if sparse.issparse(P2): + if sparse.isspmatrix_csc(P2): + P2 = 0.5 * (P2 + P2.transpose()).tocsc() + else: + P2 = 0.5 * (P2 + P2.transpose()).tocsr() + else: + P2 = 0.5 * (P2 + P2.T) + + # For coordinate descent, if X is sparse, P2 must also be csc + if solver == 'cd' and sparse.issparse(X): + P2 = sparse.csc_matrix(P2) + + # 1.4 additional validations ########################################## + if self.check_input: + if not np.all(family.in_y_range(y)): + raise ValueError("Some value(s) of y are out of the valid " + "range for family {0}" + .format(family.__class__.__name__)) + # check if P1 has only non-negative values, negative values might + # indicate group lasso in the future. 
+ if not isinstance(self.P1, str): # if self.P1 != 'identity': + if not np.all(P1 >= 0): + raise ValueError("P1 must not have negative values.") + # check if P2 is positive semidefinite + # np.linalg.cholesky(P2) 'only' asserts positive definite + if not isinstance(self.P2, str): # self.P2 != 'identity' + # due to numerical precision, we allow eigenvalues to be a + # tiny bit negative + epsneg = -10 * np.finfo(P2.dtype).epsneg + if P2.ndim == 1 or P2.shape[0] == 1: + p2 = P2 + if sparse.issparse(P2): + p2 = P2.toarray() + if not np.all(p2 >= 0): + raise ValueError("1d array P2 must not have negative " + "values.") + elif sparse.issparse(P2): + # for sparse matrices, not all eigenvals can be computed + # efficiently, use only half of n_features + # k = how many eigenvals to compute + k = np.min([10, n_features // 10 + 1]) + sigma = -1000 * epsneg # start searching near this value + which = 'SA' # find smallest algebraic eigenvalues first + eigenvalues = splinalg.eigsh(P2, k=k, sigma=sigma, + which=which, + return_eigenvectors=False) + if not np.all(eigenvalues >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + else: + if not np.all(linalg.eigvalsh(P2) >= epsneg): + raise ValueError("P2 must be positive semi-definite.") + # TODO: if alpha=0 check that X is not rank deficient + # TODO: what else to check? + + ####################################################################### + # 2. rescaling of weights (sample_weight) # + ####################################################################### + # IMPORTANT NOTE: Since we want to minimize + # 1/(2*sum(sample_weight)) * deviance + L1 + L2, + # deviance = sum(sample_weight * unit_deviance), + # we rescale weights such that sum(weights) = 1 and this becomes + # 1/2*deviance + L1 + L2 with deviance=sum(weights * unit_deviance) + weights_sum = np.sum(weights) + weights = weights/weights_sum + + ####################################################################### + # 3. initialization of coef = (intercept_, coef_) # + ####################################################################### + # Note: Since phi=self.dispersion_ does not enter the estimation + # of mu_i=E[y_i], set it to 1. 
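+
+        # Layout convention used below and by all solvers (note added for
+        # clarity): if fit_intercept=True, then coef.size == n_features + 1
+        # and coef[0] holds the intercept, i.e. coef = [intercept_, *coef_].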
+ + # set start values for coef + coef = None + if self.warm_start and hasattr(self, 'coef_'): + if self.fit_intercept: + coef = np.concatenate((np.array([self.intercept_]), + self.coef_)) + else: + coef = self.coef_ + elif isinstance(start_params, str): + if start_params == 'guess': + # Set mu=starting_mu of the family and do one Newton step + # If solver=cd use cd, else irls + mu = family.starting_mu(y, weights=weights) + eta = link.link(mu) # linear predictor + if solver in ['cd', 'lbfgs', 'newton-cg']: + # see function _cd_solver + sigma_inv = 1/family.variance(mu, phi=1, weights=weights) + d1 = link.inverse_derivative(eta) + temp = sigma_inv * d1 * (y - mu) + if self.fit_intercept: + score = np.concatenate(([temp.sum()], temp @ X)) + else: + score = temp @ X # same as X.T @ temp + + d2_sigma_inv = d1 * d1 * sigma_inv + diag_fisher = self.diag_fisher + if diag_fisher: + fisher = d2_sigma_inv + else: + fisher = \ + _safe_sandwich_dot(X, d2_sigma_inv, + intercept=self.fit_intercept) + # set up space for search direction d for inner loop + if self.fit_intercept: + coef = np.zeros(n_features+1) + else: + coef = np.zeros(n_features) + d = np.zeros_like(coef) + # initial stopping tolerance of inner loop + # use L1-norm of minimum of norm of subgradient of F + # use less restrictive tolerance for initial guess + inner_tol = _min_norm_sugrad(coef=coef, grad=-score, P2=P2, + P1=P1) + inner_tol = 4 * linalg.norm(inner_tol, ord=1) + # just one outer loop = Newton step + n_cycles = 0 + d, coef_P2, n_cycles, inner_tol = \ + _cd_cycle(d, X, coef, score, fisher, P1, P2, n_cycles, + inner_tol, max_inner_iter=1000, + selection=self.selection, + random_state=random_state, + diag_fisher=self.diag_fisher) + coef += d # for simplicity no line search here + else: + # See _irls_solver + # h'(eta) + hp = link.inverse_derivative(eta) + # working weights W, in principle a diagonal matrix + # therefore here just as 1d array + W = (hp**2 / family.variance(mu, phi=1, weights=weights)) + # working observations + z = eta + (y-mu)/hp + # solve A*coef = b + # A = X' W X + l2 P2, b = X' W z + coef = _irls_step(X, W, P2, z, + fit_intercept=self.fit_intercept) + else: # start_params == 'zero' + if self.fit_intercept: + coef = np.zeros(n_features+1) + coef[0] = link.link(np.average(y, weights=weights)) + else: + coef = np.zeros(n_features) + else: # assign given array as start values + coef = start_params + + ####################################################################### + # 4. fit # + ####################################################################### + # algorithms for optimization + # TODO: Parallelize it? 
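+
+        # Each solver branch below consumes the start values `coef` from
+        # section 3 and the rescaled `weights` from section 2, and sets
+        # self.n_iter_ on completion (comment added for orientation).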
+ + # 4.1 IRLS ############################################################ + # Note: we already set P2 = l2*P2, see above + # Note: we already symmetrized P2 = 1/2 (P2 + P2') + if solver == 'irls': + coef, self.n_iter_ = \ + _irls_solver(coef=coef, X=X, y=y, weights=weights, P2=P2, + fit_intercept=self.fit_intercept, family=family, + link=link, max_iter=self.max_iter, tol=self.tol) + + # 4.2 L-BFGS ########################################################## + elif solver == 'lbfgs': + def func(coef, X, y, weights, P2, family, link): + mu, devp = \ + family._mu_deviance_derivative(coef, X, y, weights, link) + dev = family.deviance(y, mu, weights) + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + if P2.ndim == 1: + L2 = P2 * coef[idx:] + else: + L2 = P2 @ coef[idx:] + obj = 0.5 * dev + 0.5 * (coef[idx:] @ L2) + objp = 0.5 * devp + objp[idx:] += L2 + return obj, objp + + args = (X, y, weights, P2, family, link) + coef, loss, info = fmin_l_bfgs_b( + func, coef, fprime=None, args=args, + iprint=(self.verbose > 0) - 1, pgtol=self.tol, + maxiter=self.max_iter, factr=1e3) + if info["warnflag"] == 1: + warnings.warn("lbfgs failed to converge." + " Increase the number of iterations.", + ConvergenceWarning) + elif info["warnflag"] == 2: + warnings.warn("lbfgs failed for the reason: {0}" + .format(info["task"])) + self.n_iter_ = info['nit'] + + # 4.3 Newton-CG ####################################################### + # We use again the fisher matrix instead of the hessian. More + # precisely, expected hessian of deviance. + elif solver == 'newton-cg': + def func(coef, X, y, weights, P2, family, link): + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + if P2.ndim == 1: + L2 = coef[idx:] @ (P2 * coef[idx:]) + else: + L2 = coef[idx:] @ (P2 @ coef[idx:]) + mu = link.inverse(_safe_lin_pred(X, coef)) + return 0.5 * family.deviance(y, mu, weights) + 0.5 * L2 + + def grad(coef, X, y, weights, P2, family, link): + mu, devp = \ + family._mu_deviance_derivative(coef, X, y, weights, link) + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + if P2.ndim == 1: + L2 = P2 * coef[idx:] + else: + L2 = P2 @ coef[idx:] + objp = 0.5 * devp + objp[idx:] += L2 + return objp + + def grad_hess(coef, X, y, weights, P2, family, link): + intercept = (coef.size == X.shape[1] + 1) + idx = 1 if intercept else 0 # offset if coef[0] is intercept + if P2.ndim == 1: + L2 = P2 * coef[idx:] + else: + L2 = P2 @ coef[idx:] + eta = _safe_lin_pred(X, coef) + mu = link.inverse(eta) + d1 = link.inverse_derivative(eta) + temp = d1 * family.deviance_derivative(y, mu, weights) + if intercept: + grad = np.concatenate(([0.5 * temp.sum()], + 0.5 * temp @ X + L2)) + else: + grad = 0.5 * temp @ X + L2 # same as 0.5* X.T @ temp + L2 + + # expected hessian = fisher = X.T @ diag_matrix @ X + # calculate only diag_matrix + diag = d1**2 / family.variance(mu, phi=1, weights=weights) + if intercept: + h0i = np.concatenate(([diag.sum()], diag @ X)) + + def Hs(coef): + # return (0.5 * fisher + P2) @ coef + # ret = 0.5 * (X.T @ (diag * (X @ coef))) + ret = 0.5 * ((diag * (X @ coef[idx:])) @ X) + if P2.ndim == 1: + ret += P2 * coef[idx:] + else: + ret += P2 @ coef[idx:] + if intercept: + ret = np.concatenate(([0.5 * (h0i @ coef)], + ret + 0.5 * coef[0] * h0i[1:])) + return ret + + return grad, Hs + + args = (X, y, weights, P2, family, link) + coef, self.n_iter_ = newton_cg(grad_hess, func, 
grad, coef, + args=args, maxiter=self.max_iter, + tol=self.tol) + + # 4.4 coordinate descent ############################################## + # Note: we already set P1 = l1*P1, see above + # Note: we already set P2 = l2*P2, see above + # Note: we already symmetrized P2 = 1/2 (P2 + P2') + elif solver == 'cd': + coef, self.n_iter_, self._n_cycles = \ + _cd_solver(coef=coef, X=X, y=y, weights=weights, P1=P1, + P2=P2, fit_intercept=self.fit_intercept, + family=family, link=link, + max_iter=self.max_iter, tol=self.tol, + selection=self.selection, random_state=random_state, + diag_fisher=self.diag_fisher, copy_X=self.copy_X) + + ####################################################################### + # 5. postprocessing # + ####################################################################### + if self.fit_intercept: + self.intercept_ = coef[0] + self.coef_ = coef[1:] + else: + # set intercept to zero as the other linear models do + self.intercept_ = 0. + self.coef_ = coef + + if self.fit_dispersion in ['chisqr', 'deviance']: + # attention because of rescaling of weights + self.dispersion_ = self.estimate_phi(X, y, weights)*weights_sum + + return self + + def linear_predictor(self, X): + """Compute the linear_predictor = X*coef_ + intercept_. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Samples. + + Returns + ------- + C : array, shape (n_samples,) + Returns predicted values of linear predictor. + """ + check_is_fitted(self, "coef_") + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype='numeric', copy=True, ensure_2d=True, + allow_nd=False) + return X @ self.coef_ + self.intercept_ + + def predict(self, X, sample_weight=None): + """Predict using GLM with feature matrix X. + + If sample_weight is given, returns prediction*sample_weight. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Samples. + + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) + + Returns + ------- + C : array, shape (n_samples,) + Returns predicted values times sample_weight. + """ + # TODO: Is copy=True necessary? + X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], + dtype='numeric', copy=True, ensure_2d=True, + allow_nd=False) + eta = self.linear_predictor(X) + mu = self._link_instance.inverse(eta) + weights = _check_weights(sample_weight, X.shape[0]) + + return mu*weights + + def estimate_phi(self, X, y, sample_weight=None): + """Estimate/fit the dispersion parameter phi. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training data. + + y : array-like, shape (n_samples,) + Target values. + + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) + Sample weights. + + Returns + ------- + phi : float + Dispersion parameter. + """ + check_is_fitted(self, "coef_") + _dtype = [np.float64, np.float32] + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], + dtype=_dtype, y_numeric=True, multi_output=False) + n_samples, n_features = X.shape + weights = _check_weights(sample_weight, n_samples) + eta = X @ self.coef_ + if self.fit_intercept is True: + eta += self.intercept_ + n_features += 1 + if n_samples <= n_features: + raise ValueError("Estimation of dispersion parameter phi requires" + " more samples than features, got" + " samples=X.shape[0]={0} and" + " n_features=X.shape[1]+fit_intercept={1}." 
+ .format(n_samples, n_features)) + mu = self._link_instance.inverse(eta) + if self.fit_dispersion == 'chisqr': + chisq = np.sum(weights*(y-mu)**2 / + self._family_instance.unit_variance(mu)) + return chisq/(n_samples - n_features) + elif self.fit_dispersion == 'deviance': + dev = self._family_instance.deviance(y, mu, weights) + return dev/(n_samples - n_features) + + # Note: check_estimator(GeneralizedLinearRegressor) might raise + # "AssertionError: -0.28014056555724598 not greater than 0.5" + # unless GeneralizedLinearRegressor has a score which passes the test. + def score(self, X, y, sample_weight=None): + """Compute D^2, the percentage of deviance explained. + + D^2 is a generalization of the coefficient of determination R^2. + R^2 uses squared error and D^2 deviance. Note that those two are equal + for family='normal'. + + D^2 is defined as + :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`, + :math:`D_{null}` is the null deviance, i.e. the deviance of a model + with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`. + The mean :math:`\\bar{y}` is averaged by sample_weight. + Best possible score is 1.0 and it can be negative (because the model + can be arbitrarily worse). + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Test samples. + + y : array-like, shape (n_samples,) + True values of target. + + sample_weight : {None, array-like}, shape (n_samples,), optional \ + (default=None) + Sample weights. + + Returns + ------- + score : float + D^2 of self.predict(X) w.r.t. y. + """ + # Note, default score defined in RegressorMixin is R^2 score. + # TODO: make D^2 a score function in module metrics (and thereby get + # input validation and so on) + weights = _check_weights(sample_weight, y.shape[0]) + mu = self.predict(X) + dev = self._family_instance.deviance(y, mu, weights=weights) + y_mean = np.average(y, weights=weights) + dev_null = self._family_instance.deviance(y, y_mean, weights=weights) + return 1. - dev / dev_null + + def _more_tags(self): + return {"requires_positive_y": True} + + +class PoissonRegressor(GeneralizedLinearRegressor): + """Regression with the response variable y following a Poisson distribution + + GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at + fitting and predicting the mean of the target y as mu=h(X*w). + The fit minimizes the following objective function with L2 regularization:: + + 1/(2*sum(s)) * deviance(y, h(X*w); s) + 1/2 * alpha * ||w||_2^2 + + with inverse link function h and s=sample_weight. Note that for + ``sample_weight=None``, one has s_i=1 and sum(s)=n_samples). + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + alpha : float, optional (default=1) + Constant that multiplies the penalty terms and thus determines the + regularization strength. + See the notes for the exact mathematical meaning of this + parameter.``alpha = 0`` is equivalent to unpenalized GLMs. In this + case, the design matrix X must have full column rank + (no collinearities). + + fit_intercept : boolean, optional (default=True) + Specifies if a constant (a.k.a. bias or intercept) should be + added to the linear predictor (X*coef+intercept). + + fit_dispersion : {None, 'chisqr', 'deviance'}, optional (default=None) + Method for estimation of the dispersion parameter phi. Whether to use + the chi squared statistic or the deviance statistic. If None, the + dispersion is not estimated. 
+ + solver : {'irls', 'lbfgs', 'newton-cg'}, optional (default='irls') + Algorithm to use in the optimization problem: + + 'irls' + Iterated reweighted least squares. It is the standard algorithm + for GLMs. + + 'lbfgs' + Calls scipy's L-BFGS-B optimizer. + + 'newton-cg' + Newton conjugate gradient algorithm. + + Note that all solvers except lbfgs use the fisher matrix, i.e. the + expected Hessian instead of the Hessian matrix. + + max_iter : int, optional (default=100) + The maximal number of iterations for solver algorithms. + + tol : float, optional (default=1e-4) + Stopping criterion. For the irls, newton-cg and lbfgs solvers, + the iteration will stop when ``max{|g_i|, i = 1, ..., n} <= tol`` + where ``g_i`` is the i-th component of the gradient (derivative) of + the objective function. + + warm_start : boolean, optional (default=False) + If set to ``True``, reuse the solution of the previous call to ``fit`` + as initialization for ``coef_`` and ``intercept_`` (supersedes option + ``start_params``). If set to ``True`` or if the attribute ``coef_`` + does not exit (first call to ``fit``), option ``start_params`` sets the + start values for ``coef_`` and ``intercept_``. + + start_params : {'guess', 'zero', array of shape (n_features*, )}, \ + optional (default='guess') + Relevant only if ``warm_start=False`` or if fit is called + the first time (``self.coef_`` does not yet exist). + + 'guess' + Start values of mu are calculated by family.starting_mu(..). Then, + one Newton step obtains start values for ``coef_``. If + ``solver='irls'``, it uses one irls step. This gives usually good + starting values. + + 'zero' + All coefficients are set to zero. If ``fit_intercept=True``, the + start value for the intercept is obtained by the weighted average of y. + + array + The array of size n_features* is directly used as start values + for ``coef_``. If ``fit_intercept=True``, the first element + is assumed to be the start value for the ``intercept_``. + Note that n_features* = X.shape[1] + fit_intercept, i.e. it includes + the intercept in counting. + + random_state : {int, RandomState instance, None}, optional (default=None) + If int, random_state is the seed used by the random + number generator; if RandomState instance, random_state is the random + number generator; if None, the random number generator is the + RandomState instance used by `np.random`. Used when ``selection`` == + 'random'. + + copy_X : boolean, optional, (default=True) + If ``True``, X will be copied; else, it may be overwritten. + + verbose : int, optional (default=0) + For the lbfgs solver set verbose to any positive number for verbosity. + + Attributes + ---------- + coef_ : array, shape (n_features,) + Estimated coefficients for the linear predictor (X*coef_+intercept_) in + the GLM. + + intercept_ : float + Intercept (a.k.a. bias) added to linear predictor. + + dispersion_ : float + The dispersion parameter :math:`\\phi` if ``fit_dispersion`` was set. + + n_iter_ : int + Actual number of iterations used in solver. + + Notes + ----- + The fit itself does not need Y to be from an EDM, but only assumes + the first two moments to be :math:`E[Y_i]=\\mu_i=h((Xw)_i)` and + :math:`Var[Y_i]=\\frac{\\phi}{s_i} v(\\mu_i)`. The unit variance function + :math:`v(\\mu_i)` is a property of and given by the specific EDM, see + :ref:`User Guide `. + + The parameters :math:`w` (`coef_` and `intercept_`) are estimated by + minimizing the deviance plus penalty term, which is equivalent to + (penalized) maximum likelihood estimation. 
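+
+    A minimal usage sketch (illustrative; it assumes ``PoissonRegressor`` is
+    importable from ``sklearn.linear_model`` like the other estimators):
+
+        >>> import numpy as np
+        >>> X = np.array([[1.], [2.], [3.], [4.]])
+        >>> y = np.array([1., 2., 4., 7.])
+        >>> reg = PoissonRegressor(alpha=0.1).fit(X, y)
+        >>> y_hat = reg.predict(X)  # strictly positive due to the log link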
+ + For alpha > 0, the feature matrix X should be standardized in order to + penalize features equally strong. + + If the target y is a ratio, appropriate sample weights s should be + provided. + As an example, consider Poisson distributed counts z (integers) and + weights s=exposure (time, money, persons years, ...). Then you fit + y = z/s, i.e. ``PoissonRegressor().fit(X, y, sample_weight=s)``. + The weights are necessary for the right (finite sample) mean. + Consider :math:`\\bar{y} = \\frac{\\sum_i s_i y_i}{\\sum_i s_i}`, + in this case one might say that y has a 'scaled' Poisson distributions. + + References + ---------- + For the coordinate descent implementation: + * Guo-Xun Yuan, Chia-Hua Ho, Chih-Jen Lin + An Improved GLMNET for L1-regularized Logistic Regression, + Journal of Machine Learning Research 13 (2012) 1999-2030 + https://www.csie.ntu.edu.tw/~cjlin/papers/l1_glmnet/long-glmnet.pdf + """ + def __init__(self, alpha=1.0, fit_intercept=True, fit_dispersion=None, + solver='irls', max_iter=100, + tol=1e-4, warm_start=False, start_params='guess', + random_state=None, copy_X=True, check_input=True, verbose=0): + + super().__init__(alpha=alpha, fit_intercept=fit_intercept, + family="poisson", link='log', + fit_dispersion=fit_dispersion, solver=solver, + max_iter=max_iter, tol=tol, warm_start=warm_start, + start_params=start_params, random_state=random_state, + copy_X=copy_X, verbose=verbose) diff --git a/sklearn/linear_model/tests/test_glm.py b/sklearn/linear_model/tests/test_glm.py new file mode 100644 index 0000000000000..1416bdcfad680 --- /dev/null +++ b/sklearn/linear_model/tests/test_glm.py @@ -0,0 +1,710 @@ +# Authors: Christian Lorentzen +# +# License: BSD 3 clause + +import numpy as np +from numpy.testing import assert_allclose +import pytest +import scipy as sp +from scipy import linalg, optimize, sparse + +from sklearn.datasets import make_classification, make_regression +from sklearn.linear_model import GeneralizedLinearRegressor +from sklearn.linear_model._glm import ( + Link, + IdentityLink, + LogLink, + LogitLink, + TweedieDistribution, + NormalDistribution, PoissonDistribution, + GammaDistribution, InverseGaussianDistribution, + GeneralizedHyperbolicSecant, BinomialDistribution, +) +from sklearn.linear_model import ElasticNet, LogisticRegression, Ridge +from sklearn.metrics import mean_absolute_error +from sklearn.exceptions import ConvergenceWarning + +from sklearn.utils.testing import assert_array_equal + +GLM_SOLVERS = ['irls', 'lbfgs', 'newton-cg', 'cd'] + + +@pytest.fixture(scope="module") +def regression_data(): + X, y = make_regression(n_samples=107, + n_features=10, + n_informative=80, noise=0.5, + random_state=2) + return X, y + + +@pytest.mark.parametrize('link', Link.__subclasses__()) +def test_link_properties(link): + """Test link inverse and derivative.""" + rng = np.random.RandomState(42) + x = rng.rand(100)*100 + link = link() # instantiate object + if isinstance(link, LogitLink): + # careful for large x, note expit(36) = 1 + # limit max eta to 15 + x = x / 100 * 15 + assert_allclose(link.link(link.inverse(x)), x) + # if f(g(x)) = x, then f'(g(x)) = 1/g'(x) + assert_allclose(link.derivative(link.inverse(x)), + 1./link.inverse_derivative(x)) + + assert ( + link.inverse_derivative2(x).shape == link.inverse_derivative(x).shape) + + # for LogitLink, in the following x should be between 0 and 1. 
+ # assert_almost_equal(link.inverse_derivative(link.link(x)), + # 1./link.derivative(x), decimal=decimal) + + +@pytest.mark.parametrize( + 'family, expected', + [(NormalDistribution(), [True, True, True]), + (PoissonDistribution(), [False, True, True]), + (TweedieDistribution(power=1.5), [False, True, True]), + (GammaDistribution(), [False, False, True]), + (InverseGaussianDistribution(), [False, False, True]), + (TweedieDistribution(power=4.5), [False, False, True])]) +def test_family_bounds(family, expected): + """Test the valid range of distributions at -1, 0, 1.""" + result = family.in_y_range([-1, 0, 1]) + assert_array_equal(result, expected) + + +def test_tweedie_distribution_power(): + with pytest.raises(ValueError, match="no distribution exists"): + TweedieDistribution(power=0.5) + + with pytest.raises(TypeError, match="must be a real number"): + TweedieDistribution(power=1j) + + with pytest.raises(TypeError, match="must be a real number"): + dist = TweedieDistribution() + dist.power = 1j + + dist = TweedieDistribution() + assert dist._include_lower_bound is False + dist.power = 1 + assert dist._include_lower_bound is True + + +@pytest.mark.parametrize( + 'family, chk_values', + [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]), + (PoissonDistribution(), [0.1, 1.5]), + (GammaDistribution(), [0.1, 1.5]), + (InverseGaussianDistribution(), [0.1, 1.5]), + (TweedieDistribution(power=-2.5), [0.1, 1.5]), + (TweedieDistribution(power=-1), [0.1, 1.5]), + (TweedieDistribution(power=1.5), [0.1, 1.5]), + (TweedieDistribution(power=2.5), [0.1, 1.5]), + (TweedieDistribution(power=-4), [0.1, 1.5]), + (GeneralizedHyperbolicSecant(), [0.1, 1.5])]) +def test_deviance_zero(family, chk_values): + """Test deviance(y,y) = 0 for different families.""" + for x in chk_values: + assert_allclose(family.deviance(x, x), 0, atol=1e-9) + + +@pytest.mark.parametrize( + 'family, link', + [(NormalDistribution(), IdentityLink()), + (PoissonDistribution(), LogLink()), + (GammaDistribution(), LogLink()), + (InverseGaussianDistribution(), LogLink()), + (TweedieDistribution(power=1.5), LogLink()), + (TweedieDistribution(power=4.5), LogLink())], + ids=lambda args: args.__class__.__name__) +def test_fisher_matrix(family, link): + """Test the Fisher matrix numerically. 
+ Trick: Use numerical differentiation with y = mu""" + coef = np.array([-2, 1, 0, 1, 2.5]) + phi = 0.5 + rng = np.random.RandomState(42) + X = rng.randn(10, 5) + lin_pred = np.dot(X, coef) + mu = link.inverse(lin_pred) + weights = rng.randn(10)**2 + 1 + fisher = family._fisher_matrix(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + # check that the Fisher matrix is square and positive definite + assert fisher.ndim == 2 + assert fisher.shape[0] == fisher.shape[1] + assert np.all(np.linalg.eigvals(fisher) >= 0) + + approx = np.array([]).reshape(0, coef.shape[0]) + for i in range(coef.shape[0]): + def f(coef): + return -family._score(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link)[i] + approx = np.vstack( + [approx, sp.optimize.approx_fprime(xk=coef, f=f, epsilon=1e-5)]) + assert_allclose(fisher, approx, rtol=1e-3) + + # check the observed information matrix + oim = family._observed_information(coef=coef, phi=phi, X=X, y=mu, + weights=weights, link=link) + assert oim.ndim == 2 + assert oim.shape == fisher.shape + assert_allclose(oim, fisher) + + +def test_sample_weights_validation(): + """Test the raised errors in the validation of sample_weight.""" + # scalar value but not positive + X = [[1]] + y = [1] + weights = 0 + glm = GeneralizedLinearRegressor(fit_intercept=False) + with pytest.raises(ValueError, match="weights must be non-negative"): + glm.fit(X, y, weights) + + # Positive weights are accepted + glm.fit(X, y, sample_weight=1) + + # 2d array + weights = [[0]] + with pytest.raises(ValueError, match="must be 1D array or scalar"): + glm.fit(X, y, weights) + + # 1d but wrong length + weights = [1, 0] + with pytest.raises(ValueError, + match="weights must have the same length as y"): + glm.fit(X, y, weights) + + # 1d but only zeros (sum not greater than 0) + weights = [0, 0] + X = [[0], [1]] + y = [1, 2] + with pytest.raises(ValueError, + match="must have at least one positive element"): + glm.fit(X, y, weights) + + # 5. 
1d but with a negative value + weights = [2, -1] + with pytest.raises(ValueError, match="weights must be non-negative"): + glm.fit(X, y, weights) + + +@pytest.mark.parametrize('f, fam', + [('normal', NormalDistribution()), + ('poisson', PoissonDistribution()), + ('gamma', GammaDistribution()), + ('inverse.gaussian', InverseGaussianDistribution()), + ('binomial', BinomialDistribution())]) +def test_glm_family_argument(f, fam): + """Test GLM family argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family=f, alpha=0).fit(X, y) + assert isinstance(glm._family_instance, fam.__class__) + + glm = GeneralizedLinearRegressor(family='not a family', + fit_intercept=False) + with pytest.raises(ValueError, match="family must be"): + glm.fit(X, y) + + +@pytest.mark.parametrize('l, link', + [('identity', IdentityLink()), + ('log', LogLink()), + ('logit', LogitLink())]) +def test_glm_link_argument(l, link): + """Test GLM link argument set as string.""" + y = np.array([0.1, 0.5]) # in range of all distributions + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', link=l).fit(X, y) + assert isinstance(glm._link_instance, link.__class__) + + glm = GeneralizedLinearRegressor(family='normal', link='not a link') + with pytest.raises(ValueError, match="link must be"): + glm.fit(X, y) + + +@pytest.mark.parametrize('alpha', ['not a number', -4.2]) +def test_glm_alpha_argument(alpha): + """Test GLM for invalid alpha argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', alpha=alpha) + with pytest.raises(ValueError, + match="Penalty term must be a non-negative"): + glm.fit(X, y) + + +@pytest.mark.parametrize('l1_ratio', ['not a number', -4.2, 1.1, [1]]) +def test_glm_l1_ratio_argument(l1_ratio): + """Test GLM for invalid l1_ratio argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(family='normal', l1_ratio=l1_ratio) + with pytest.raises(ValueError, + match="l1_ratio must be a number in interval.*0, 1"): + glm.fit(X, y) + + +@pytest.mark.parametrize('P1', [['a string', 'a string'], [1, [2]], [1, 2, 3], + [-1]]) +def test_glm_P1_argument(P1): + """Test GLM for invalid P1 argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P1=P1, l1_ratio=0.5, check_input=True) + with pytest.raises((ValueError, TypeError)): + glm.fit(X, y) + + +@pytest.mark.parametrize('P2', ['a string', [1, 2, 3], [[2, 3]], + sparse.csr_matrix([1, 2, 3]), [-1]]) +def test_glm_P2_argument(P2): + """Test GLM for invalid P2 argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(P2=P2, check_input=True) + with pytest.raises(ValueError): + glm.fit(X, y) + + +def test_glm_P2_positive_semidefinite(): + """Test GLM for a positive semi-definite P2 argument.""" + n_samples, n_features = 10, 5 + y = np.arange(n_samples) + X = np.zeros((n_samples, n_features)) + P2 = np.diag([100, 10, 5, 0, -1E-5]) + rng = np.random.RandomState(42) + # construct random orthogonal matrix Q + Q, R = linalg.qr(rng.randn(n_features, n_features)) + P2 = Q.T @ P2 @ Q + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with pytest.raises(ValueError, match="P2 must be positive semi-definite"): + glm.fit(X, y) + + P2 = sparse.csr_matrix(P2) + glm = GeneralizedLinearRegressor(P2=P2, fit_intercept=False, + check_input=True) + with 
pytest.raises(ValueError, match="P2 must be positive semi-definite"): + glm.fit(X, y) + + +@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]]) +def test_glm_fit_intercept_argument(fit_intercept): + """Test GLM for invalid fit_intercept argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept) + with pytest.raises(ValueError, match="fit_intercept must be bool"): + glm.fit(X, y) + + +@pytest.mark.parametrize('solver, l1_ratio', + [('not a solver', 0), (1, 0), ([1], 0), + ('irls', 0.5), ('lbfgs', 0.5), ('newton-cg', 0.5)]) +def test_glm_solver_argument(solver, l1_ratio): + """Test GLM for invalid solver argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(solver=solver, l1_ratio=l1_ratio) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]]) +def test_glm_max_iter_argument(max_iter): + """Test GLM for invalid max_iter argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(max_iter=max_iter) + with pytest.raises(ValueError, match="must be a positive integer"): + glm.fit(X, y) + + +@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]]) +def test_glm_tol_argument(tol): + """Test GLM for invalid tol argument.""" + y = np.array([1, 2]) + X = np.array([[1], [2]]) + glm = GeneralizedLinearRegressor(tol=tol) + with pytest.raises(ValueError, match="stopping criteria must be positive"): + glm.fit(X, y) + + +@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]]) +def test_glm_warm_start_argument(warm_start): + """Test GLM for invalid warm_start argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(warm_start=warm_start) + with pytest.raises(ValueError, match="warm_start must be bool"): + glm.fit(X, y) + + +@pytest.mark.parametrize('start_params', + ['not a start_params', ['zero'], [0, 0, 0], + [[0, 0]], ['a', 'b']]) +def test_glm_start_params_argument(start_params): + """Test GLM for invalid start_params argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(start_params=start_params) + with pytest.raises(ValueError): + glm.fit(X, y) + + +@pytest.mark.parametrize('selection', ['not a selection', 1, 0, ['cyclic']]) +def test_glm_selection_argument(selection): + """Test GLM for invalid selection argument""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(selection=selection) + with pytest.raises(ValueError, match="argument selection must be"): + glm.fit(X, y) + + +@pytest.mark.parametrize('random_state', ['a string', 0.5, [0]]) +def test_glm_random_state_argument(random_state): + """Test GLM for invalid random_state argument.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(random_state=random_state) + with pytest.raises(ValueError, match="cannot be used to seed"): + glm.fit(X, y) + + +@pytest.mark.parametrize('diag_fisher', ['not bool', 1, 0, [True]]) +def test_glm_diag_fisher_argument(diag_fisher): + """Test GLM for invalid diag_fisher arguments.""" + y = np.array([1, 2]) + X = np.array([[1], [1]]) + glm = GeneralizedLinearRegressor(diag_fisher=diag_fisher) + with pytest.raises(ValueError, match="diag_fisher must be bool"): + glm.fit(X, y) + + +@pytest.mark.parametrize('copy_X', ['not bool', 1, 0, [True]]) +def test_glm_copy_X_argument(copy_X): + """Test GLM for invalid copy_X 
argument."""
+    y = np.array([1, 2])
+    X = np.array([[1], [1]])
+    glm = GeneralizedLinearRegressor(copy_X=copy_X)
+    with pytest.raises(ValueError, match="copy_X must be bool"):
+        glm.fit(X, y)
+
+
+@pytest.mark.parametrize('check_input', ['not bool', 1, 0, [True]])
+def test_glm_check_input_argument(check_input):
+    """Test GLM for invalid check_input argument."""
+    y = np.array([1, 2])
+    X = np.array([[1], [1]])
+    glm = GeneralizedLinearRegressor(check_input=check_input)
+    with pytest.raises(ValueError, match="check_input must be bool"):
+        glm.fit(X, y)
+
+
+@pytest.mark.parametrize('solver', GLM_SOLVERS)
+def test_glm_identity_regression(solver):
+    """Test GLM regression with identity link on a simple dataset."""
+    coef = [1., 2.]
+    X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T
+    y = np.dot(X, coef)
+    glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity',
+                                     fit_intercept=False, solver=solver,
+                                     start_params='zero', tol=1e-7)
+    res = glm.fit(X, y)
+    assert_allclose(res.coef_, coef, rtol=1e-6)
+
+
+@pytest.mark.parametrize(
+    'family',
+    [NormalDistribution(), PoissonDistribution(),
+     GammaDistribution(), InverseGaussianDistribution(),
+     TweedieDistribution(power=1.5), TweedieDistribution(power=4.5),
+     GeneralizedHyperbolicSecant()])
+@pytest.mark.parametrize('solver, tol', [('irls', 1e-6),
+                                         ('lbfgs', 1e-6),
+                                         ('newton-cg', 1e-7),
+                                         ('cd', 1e-7)])
+def test_glm_log_regression(family, solver, tol):
+    """Test GLM regression with log link on a simple dataset."""
+    coef = [0.2, -0.1]
+    X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T
+    y = np.exp(np.dot(X, coef))
+    glm = GeneralizedLinearRegressor(
+        alpha=0, family=family, link='log', fit_intercept=False,
+        solver=solver, start_params='guess', tol=tol)
+    res = glm.fit(X, y)
+    assert_allclose(res.coef_, coef, rtol=5e-6)
+
+
+# newton-cg may issue a LineSearchWarning, which we filter out
+@pytest.mark.filterwarnings('ignore:The line search algorithm')
+@pytest.mark.filterwarnings('ignore:Line Search failed')
+@pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)])
+@pytest.mark.parametrize('fit_intercept', [True, False])
+@pytest.mark.parametrize('solver', GLM_SOLVERS)
+def test_normal_ridge_comparison(n_samples, n_features, fit_intercept, solver):
+    """Test ridge regression for Normal distributions.
+
+    Covers both the n_samples > n_features and the n_samples < n_features
+    case.
+
+    Compare to test_ridge in test_ridge.py.
+
+    """
+    alpha = 1.0
+    n_predict = 10
+    X, y, coef = make_regression(n_samples=n_samples+n_predict,
+                                 n_features=n_features,
+                                 n_informative=n_features-2, noise=0.5,
+                                 coef=True, random_state=42)
+    y = y[0:n_samples]
+    X, T = X[0:n_samples], X[n_samples:]
+
+    if n_samples > n_features:
+        ridge_params = {"solver": "svd"}
+    else:
+        ridge_params = {"solver": "sag", "max_iter": 10000, "tol": 1e-9}
+
+    # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2
+    ridge = Ridge(alpha=alpha*n_samples, normalize=False,
+                  fit_intercept=fit_intercept, random_state=42,
+                  **ridge_params)
+    ridge.fit(X, y)
+
+    glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=0, family='normal',
+                                     link='identity',
+                                     fit_intercept=fit_intercept,
+                                     max_iter=300, solver=solver, tol=1e-6,
+                                     check_input=False, random_state=42)
+    glm.fit(X, y)
+    assert glm.coef_.shape == (X.shape[1], )
+    assert_allclose(glm.coef_, ridge.coef_, rtol=5e-6)
+    assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-6)
+    assert_allclose(glm.predict(T), ridge.predict(T), rtol=1e-5)
+
+
+@pytest.mark.parametrize('solver, tol',
+                         [('irls', 1e-7),
+                          ('lbfgs', 1e-7),
+                          ('newton-cg', 1e-7),
+                          ('cd', 1e-7)])
+def test_poisson_ridge(solver, tol):
+    """Test ridge regression with poisson family and LogLink.
+
+    Compare to R's glmnet"""
+    # library("glmnet")
+    # options(digits=10)
+    # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2))
+    # x <- data.matrix(df[,c("a", "b")])
+    # y <- df$y
+    # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson",
+    #               standardize=F, thresh=1e-10, nlambda=10000)
+    # coef(fit, s=1)
+    # (Intercept) -0.12889386979
+    # a            0.29019207995
+    # b            0.03741173122
+    X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T
+    y = np.array([0, 1, 1, 2])
+    rng = np.random.RandomState(42)
+    glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0,
+                                     fit_intercept=True, family='poisson',
+                                     link='log', tol=tol,
+                                     solver=solver, max_iter=300,
+                                     random_state=rng)
+    glm.fit(X, y)
+    assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5)
+    assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-6)
+
+
+@pytest.mark.parametrize('diag_fisher', [False, True])
+def test_normal_enet(diag_fisher):
+    """Test elastic net regression with normal/gaussian family."""
+    alpha, l1_ratio = 0.3, 0.7
+    n_samples, n_features = 20, 2
+    rng = np.random.RandomState(42)
+    X = rng.randn(n_samples, n_features).copy(order='F')
+    beta = rng.randn(n_features)
+    y = 2 + np.dot(X, beta) + rng.randn(n_samples)
+
+    # 1. test normal enet on dense data
+    glm = GeneralizedLinearRegressor(alpha=alpha, l1_ratio=l1_ratio,
+                                     family='normal', link='identity',
+                                     fit_intercept=True, tol=1e-8,
+                                     max_iter=100, selection='cyclic',
+                                     solver='cd', start_params='zero',
+                                     check_input=False,
+                                     diag_fisher=diag_fisher)
+    glm.fit(X, y)
+
+    enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=True,
+                      normalize=False, tol=1e-8, copy_X=True)
+    enet.fit(X, y)
+
+    assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7)
+    assert_allclose(glm.coef_, enet.coef_, rtol=5e-5)
+
+    # 2. test normal enet on sparse data
+    X = sparse.csc_matrix(X)
+    glm.fit(X, y)
+    assert_allclose(glm.intercept_, enet.intercept_, rtol=2e-7)
+    assert_allclose(glm.coef_, enet.coef_, rtol=5e-5)
+
+
+def test_poisson_enet():
+    """Test elastic net regression with poisson family and LogLink.
+ + Compare to R's glmnet""" + # library("glmnet") + # options(digits=10) + # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2)) + # x <- data.matrix(df[,c("a", "b")]) + # y <- df$y + # fit <- glmnet(x=x, y=y, alpha=0.5, intercept=T, family="poisson", + # standardize=F, thresh=1e-10, nlambda=10000) + # coef(fit, s=1) + # (Intercept) -0.03550978409 + # a 0.16936423283 + # b . + glmnet_intercept = -0.03550978409 + glmnet_coef = [0.16936423283, 0.] + X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T + y = np.array([0, 1, 1, 2]) + rng = np.random.RandomState(42) + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-8, + selection='random', random_state=rng, + start_params='guess') + glm.fit(X, y) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=2e-6) + assert_allclose(glm.coef_, glmnet_coef, rtol=2e-7) + + # test results with general optimization procedure + def obj(coef): + pd = PoissonDistribution() + link = LogLink() + N = y.shape[0] + mu = link.inverse(X @ coef[1:] + coef[0]) + alpha, l1_ratio = (1, 0.5) + return 1./(2.*N) * pd.deviance(y, mu) \ + + 0.5 * alpha * (1-l1_ratio) * (coef[1:]**2).sum() \ + + alpha * l1_ratio * np.sum(np.abs(coef[1:])) + res = optimize.minimize(obj, [0, 0, 0], method='nelder-mead', tol=1e-10, + options={'maxiter': 1000, 'disp': False}) + assert_allclose(glm.intercept_, res.x[0], rtol=5e-5) + assert_allclose(glm.coef_, res.x[1:], rtol=1e-5, atol=1e-9) + assert_allclose(obj(np.concatenate(([glm.intercept_], glm.coef_))), + res.fun, rtol=1e-8) + + # same for start_params='zero' and selection='cyclic' + # with reduced precision + glm = GeneralizedLinearRegressor(alpha=1, l1_ratio=0.5, family='poisson', + link='log', solver='cd', tol=1e-5, + selection='cyclic', start_params='zero') + glm.fit(X, y) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) + assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) + + # check warm_start, therefore start with different alpha + glm = GeneralizedLinearRegressor(alpha=0.005, l1_ratio=0.5, + family='poisson', max_iter=300, + link='log', solver='cd', tol=1e-5, + selection='cyclic', start_params='zero') + glm.fit(X, y) + # warm start with original alpha and use of sparse matrices + glm.warm_start = True + glm.alpha = 1 + X = sparse.csr_matrix(X) + glm.fit(X, y) + assert_allclose(glm.intercept_, glmnet_intercept, rtol=1e-4) + assert_allclose(glm.coef_, glmnet_coef, rtol=1e-4) + + +@pytest.mark.parametrize('alpha', [0.01, 0.1, 1, 10]) +def test_binomial_enet(alpha): + """Test elastic net regression with binomial family and LogitLink. + + Compare to LogisticRegression. 
+ """ + l1_ratio = 0.5 + n_samples = 500 + rng = np.random.RandomState(42) + X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=6, + n_informative=5, n_redundant=0, n_repeated=0, + random_state=rng) + log = LogisticRegression( + penalty='elasticnet', random_state=rng, fit_intercept=False, tol=1e-6, + max_iter=1000, l1_ratio=l1_ratio, C=1./(n_samples * alpha), + solver='saga') + log.fit(X, y) + + glm = GeneralizedLinearRegressor( + family=BinomialDistribution(), link=LogitLink(), fit_intercept=False, + alpha=alpha, l1_ratio=l1_ratio, solver='cd', selection='cyclic', + tol=1e-7) + glm.fit(X, y) + assert_allclose(log.intercept_[0], glm.intercept_, rtol=1e-6) + assert_allclose(log.coef_[0, :], glm.coef_, rtol=5e-6) + + +@pytest.mark.parametrize( + "params", + [ + {"solver": "irls", "start_params": "guess"}, + {"solver": "irls", "start_params": "zero"}, + {"solver": "lbfgs", "start_params": "guess"}, + {"solver": "lbfgs", "start_params": "zero"}, + {"solver": "newton-cg"}, + {"solver": "cd", "selection": "cyclic", "diag_fisher": False}, + {"solver": "cd", "selection": "cyclic", "diag_fisher": True}, + {"solver": "cd", "selection": "random", "diag_fisher": False}, + ], + ids=lambda params: ', '.join("%s=%s" % (key, val) + for key, val in params.items()) +) +def test_solver_equivalence(params, regression_data): + X, y = regression_data + est_ref = GeneralizedLinearRegressor(random_state=2) + est_ref.fit(X, y) + + estimator = GeneralizedLinearRegressor(**params) + estimator.set_params(random_state=2) + + estimator.fit(X, y) + + assert_allclose(estimator.intercept_, est_ref.intercept_, rtol=1e-4) + assert_allclose(estimator.coef_, est_ref.coef_, rtol=1e-4) + assert_allclose( + mean_absolute_error(estimator.predict(X), y), + mean_absolute_error(est_ref.predict(X), y), + rtol=1e-4 + ) + + +def test_fit_dispersion(regression_data): + X, y = regression_data + + est1 = GeneralizedLinearRegressor(random_state=2) + est1.fit(X, y) + assert not hasattr(est1, "dispersion_") + + est2 = GeneralizedLinearRegressor(random_state=2, fit_dispersion="chisqr") + est2.fit(X, y) + assert isinstance(est2.dispersion_, float) + + est3 = GeneralizedLinearRegressor( + random_state=2, fit_dispersion="deviance") + est3.fit(X, y) + assert isinstance(est3.dispersion_, float) + + assert_allclose(est2.dispersion_, est3.dispersion_) + + +@pytest.mark.parametrize("solver", GLM_SOLVERS) +def test_convergence_warning(solver, regression_data): + X, y = regression_data + + est = GeneralizedLinearRegressor(solver=solver, random_state=2, + max_iter=1, tol=1e-20) + with pytest.warns(ConvergenceWarning): + est.fit(X, y)
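+
+
+# Illustrative sketch, not part of the original test suite: it exercises the
+# sample-weight convention described in the estimator's Notes, where Poisson
+# counts z observed under exposures s are fitted as ratios y = z / s with
+# sample_weight=s. The data and the test name are made up for illustration.
+def test_poisson_exposure_ratio_sketch():
+    rng = np.random.RandomState(0)
+    X = rng.uniform(size=(50, 2))
+    s = rng.uniform(low=0.5, high=2.0, size=50)  # exposures
+    # counts with expectation s * exp(X w) for an arbitrary true w
+    z = rng.poisson(lam=s * np.exp(X @ np.array([0.5, -0.5])))
+    y = z / s  # frequencies
+    glm = GeneralizedLinearRegressor(alpha=0, family='poisson', link='log')
+    glm.fit(X, y, sample_weight=s)
+    # with a log link, predicted frequencies are strictly positive
+    assert np.all(glm.predict(X) > 0)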