diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index 752b41151fca0..c138f51f6c06f 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -837,6 +837,21 @@ Any estimator using the Huber loss would also be robust to outliers, e.g.
linear_model.RANSACRegressor
linear_model.TheilSenRegressor
+Generalized linear models (GLM) for regression
+----------------------------------------------
+
+These models allow for response variables to have error distributions other
+than a normal distribution:
+
+.. autosummary::
+ :toctree: generated/
+ :template: class.rst
+
+ linear_model.PoissonRegressor
+ linear_model.TweedieRegressor
+ linear_model.GammaRegressor
+
+
Miscellaneous
-------------
diff --git a/doc/modules/glm_data/poisson_gamma_tweedie_distributions.png b/doc/modules/glm_data/poisson_gamma_tweedie_distributions.png
new file mode 100644
index 0000000000000..3b95b724a6623
Binary files /dev/null and b/doc/modules/glm_data/poisson_gamma_tweedie_distributions.png differ
diff --git a/doc/modules/linear_model.rst b/doc/modules/linear_model.rst
index 19205385f311b..fc5f254035a53 100644
--- a/doc/modules/linear_model.rst
+++ b/doc/modules/linear_model.rst
@@ -556,13 +556,13 @@ orthogonal matching pursuit can approximate the optimum solution vector with a
fixed number of non-zero elements:
.. math::
- \underset{\gamma}{\operatorname{arg\,min\,}} ||y - X\gamma||_2^2 \text{ subject to } ||\gamma||_0 \leq n_{\text{nonzero\_coefs}}
+ \underset{w}{\operatorname{arg\,min\,}} ||y - Xw||_2^2 \text{ subject to } ||w||_0 \leq n_{\text{nonzero\_coefs}}
Alternatively, orthogonal matching pursuit can target a specific error instead
of a specific number of non-zero coefficients. This can be expressed as:
.. math::
- \underset{\gamma}{\operatorname{arg\,min\,}} ||\gamma||_0 \text{ subject to } ||y-X\gamma||_2^2 \leq \text{tol}
+ \underset{w}{\operatorname{arg\,min\,}} ||w||_0 \text{ subject to } ||y-Xw||_2^2 \leq \text{tol}
OMP is based on a greedy algorithm that includes at each step the atom most
@@ -906,7 +906,7 @@ with 'log' loss, which might be even faster but requires more tuning.
It is possible to obtain the p-values and confidence intervals for
coefficients in cases of regression without penalization. The `statsmodels
package ` natively supports this.
- Within sklearn, one could use bootstrapping instead as well.
+ Within sklearn, one could use bootstrapping instead as well.
:class:`LogisticRegressionCV` implements Logistic Regression with built-in
@@ -928,6 +928,149 @@ to warm-starting (see :term:`Glossary `).
.. [9] `"Performance Evaluation of Lbfgs vs other solvers"
`_
+.. _Generalized_linear_regression:
+
+Generalized Linear Regression
+=============================
+
+Generalized Linear Models (GLM) extend linear models in two ways
+[10]_. First, the predicted values :math:`\hat{y}` are linked to a linear
+combination of the input variables :math:`X` via an inverse link function
+:math:`h` as
+
+.. math:: \hat{y}(w, X) = h(Xw).
+
+Secondly, the squared loss function is replaced by the unit deviance
+:math:`d` of a distribution in the exponential family (or more precisely, a
+reproductive exponential dispersion model (EDM) [11]_).
+
+The minimization problem becomes:
+
+.. math:: \min_{w} \frac{1}{2 n_{\text{samples}}} \sum_i d(y_i, \hat{y}_i) + \frac{\alpha}{2} ||w||_2^2,
+
+where :math:`\alpha` is the L2 regularization penalty. When sample weights are
+provided, the average becomes a weighted average.
+
+The following table lists some specific EDMs and their unit deviance (all of
+these are instances of the Tweedie family):
+
+================= =============================== ============================================
+Distribution Target Domain Unit Deviance :math:`d(y, \hat{y})`
+================= =============================== ============================================
+Normal :math:`y \in (-\infty, \infty)` :math:`(y-\hat{y})^2`
+Poisson :math:`y \in [0, \infty)` :math:`2(y\log\frac{y}{\hat{y}}-y+\hat{y})`
+Gamma :math:`y \in (0, \infty)` :math:`2(\log\frac{\hat{y}}{y}+\frac{y}{\hat{y}}-1)`
+Inverse Gaussian :math:`y \in (0, \infty)` :math:`\frac{(y-\hat{y})^2}{y\hat{y}^2}`
+================= =============================== ============================================
+
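+As a quick numerical check, the Poisson unit deviance in the table matches
+:func:`sklearn.metrics.mean_poisson_deviance` once averaged over samples (an
+illustrative sketch with made-up values)::
+
+    import numpy as np
+    from sklearn.metrics import mean_poisson_deviance
+
+    y = np.array([1.0, 3.0])
+    y_hat = np.array([2.0, 2.5])
+    # mean of 2 * (y * log(y / y_hat) - y + y_hat) over the two samples
+    manual = np.mean(2 * (y * np.log(y / y_hat) - y + y_hat))
+    assert np.isclose(manual, mean_poisson_deviance(y, y_hat))
+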
+The Probability Density Functions (PDF) of these distributions are illustrated
+in the following figure,
+
+.. figure:: ./glm_data/poisson_gamma_tweedie_distributions.png
+ :align: center
+ :scale: 100%
+
+ PDF of a random variable Y following Poisson, Tweedie (power=1.5) and Gamma
+ distributions with different mean values (:math:`\mu`). Observe the point
+ mass at :math:`Y=0` for the Poisson distribution and the Tweedie (power=1.5)
+ distribution, but not for the Gamma distribution which has a strictly
+ positive target domain.
+
+The choice of the distribution depends on the problem at hand:
+
+* If the target values :math:`y` are counts (non-negative integer valued) or
+ relative frequencies (non-negative), you might use a Poisson deviance
+ with log-link.
+* If the target values are positive valued and skewed, you might try a
+ Gamma deviance with log-link.
+* If the target values seem to be heavier tailed than a Gamma distribution,
+ you might try an Inverse Gaussian deviance (or even higher variance powers
+ of the Tweedie family).
+
+
+Examples of use cases include:
+
+* Agriculture / weather modeling: number of rain events per year (Poisson),
+ amount of rainfall per event (Gamma), total rainfall per year (Tweedie /
+ Compound Poisson Gamma).
+* Risk modeling / insurance policy pricing: number of claim events /
+ policyholder per year (Poisson), cost per event (Gamma), total cost per
+ policyholder per year (Tweedie / Compound Poisson Gamma).
+* Predictive maintenance: number of production interruption events per year:
+ Poisson, duration of interruption: Gamma, total interruption time per year
+ (Tweedie / Compound Poisson Gamma).
+
+
+.. topic:: References:
+
+ .. [10] McCullagh, Peter; Nelder, John (1989). Generalized Linear Models,
+ Second Edition. Boca Raton: Chapman and Hall/CRC. ISBN 0-412-31760-5.
+
+ .. [11] Jørgensen, B. (1992). The theory of exponential dispersion models
+ and analysis of deviance. Monografias de matemática, no. 51. See also
+ `Exponential dispersion model.
+ `_
+
+Usage
+-----
+
+:class:`TweedieRegressor` implements a generalized linear model for the
+Tweedie distribution, which allows modeling any of the above-mentioned
+distributions using the appropriate ``power`` parameter. In particular:
+
+- ``power = 0``: Normal distribution. Specific estimators such as
+ :class:`Ridge`, :class:`ElasticNet` are generally more appropriate in
+ this case.
+- ``power = 1``: Poisson distribution. :class:`PoissonRegressor` is exposed
+ for convenience. However, it is strictly equivalent to
+ `TweedieRegressor(power=1, link='log')`.
+- ``power = 2``: Gamma distribution. :class:`GammaRegressor` is exposed for
+ convenience. However, it is strictly equivalent to
+ `TweedieRegressor(power=2, link='log')`.
+- ``power = 3``: Inverse Gaussian distribution.
+
+The link function is determined by the `link` parameter.
+
+Usage example::
+
+ >>> from sklearn.linear_model import TweedieRegressor
+ >>> reg = TweedieRegressor(power=1, alpha=0.5, link='log')
+ >>> reg.fit([[0, 0], [0, 1], [2, 2]], [0, 1, 2])
+ TweedieRegressor(alpha=0.5, link='log', power=1)
+ >>> reg.coef_
+ array([0.2463..., 0.4337...])
+ >>> reg.intercept_
+ -0.7638...
+
+
+.. topic:: Examples:
+
+ * :ref:`sphx_glr_auto_examples_linear_model_plot_poisson_regression_non_normal_loss.py`
+ * :ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py`
+
+Practical considerations
+------------------------
+
+The feature matrix `X` should be standardized before fitting. This ensures
+that the penalty treats features equally.
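+
+For example, a minimal sketch of such a pipeline (the regularization strength
+below is an arbitrary placeholder)::
+
+    from sklearn.pipeline import make_pipeline
+    from sklearn.preprocessing import StandardScaler
+    from sklearn.linear_model import PoissonRegressor
+
+    model = make_pipeline(StandardScaler(), PoissonRegressor(alpha=1e-3))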
+
+Since the linear predictor :math:`Xw` can be negative and Poisson,
+Gamma and Inverse Gaussian distributions don't support negative values, it
+is necessary to apply an inverse link function that guarantees
+non-negativity. For example, with `link='log'`, the inverse link function
+becomes :math:`h(Xw)=\exp(Xw)`.
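+
+For instance, with the log link, predictions are positive by construction
+(a small sketch on made-up data)::
+
+    from sklearn.linear_model import PoissonRegressor
+
+    reg = PoissonRegressor()  # uses the log link
+    reg.fit([[1.], [2.], [3.]], [0., 1., 2.])
+    # exp(Xw + intercept) is strictly positive for any input:
+    assert (reg.predict([[-5.], [10.]]) > 0).all()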
+
+If you want to model a relative frequency, i.e. counts per exposure (time,
+volume, ...), you can do so by using a Poisson distribution and passing
+:math:`y=\frac{\mathrm{counts}}{\mathrm{exposure}}` as target values
+together with :math:`\mathrm{exposure}` as sample weights. For a concrete
+example see e.g.
+:ref:`sphx_glr_auto_examples_linear_model_plot_tweedie_regression_insurance_claims.py`.
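+
+In code, this amounts to something like the following sketch (the feature
+matrix and the ``counts`` / ``exposure`` arrays below are made-up
+placeholders)::
+
+    import numpy as np
+    from sklearn.linear_model import PoissonRegressor
+
+    X = np.array([[0.], [1.], [2.], [3.]])
+    counts = np.array([0., 1., 3., 6.])
+    exposure = np.array([1.0, 0.5, 1.0, 2.0])
+
+    reg = PoissonRegressor()
+    reg.fit(X, counts / exposure, sample_weight=exposure)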
+
+When performing cross-validation for the `power` parameter of
+`TweedieRegressor`, it is advisable to specify an explicit `scoring` function,
+because the default scorer :meth:`TweedieRegressor.score` is a function of
+`power` itself.
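+
+For example, one can keep the deviance power of the scorer fixed while
+searching over the ``power`` of the estimator (a minimal sketch; the reference
+power and grid values below are arbitrary choices)::
+
+    from sklearn.linear_model import TweedieRegressor
+    from sklearn.metrics import make_scorer, mean_tweedie_deviance
+    from sklearn.model_selection import GridSearchCV
+
+    # deviance at a fixed reference power; lower is better
+    scorer = make_scorer(mean_tweedie_deviance, power=1.5,
+                         greater_is_better=False)
+    search = GridSearchCV(TweedieRegressor(link='log'),
+                          param_grid={"power": [1.1, 1.5, 1.9]},
+                          scoring=scorer)
+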
Stochastic Gradient Descent - SGD
=================================
diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index 368d92a012097..da3588026d35d 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -222,6 +222,13 @@ Changelog
:mod:`sklearn.linear_model`
...........................
+- |MajorFeature| Added generalized linear models (GLM) with non-normal error
+ distributions, including :class:`linear_model.PoissonRegressor`,
+ :class:`linear_model.GammaRegressor` and :class:`linear_model.TweedieRegressor`
+ which use Poisson, Gamma and Tweedie distributions respectively.
+ :pr:`14300` by :user:`Christian Lorentzen `, `Roman Yurchak`_,
+ and `Olivier Grisel`_.
+
- |Feature| Support of `sample_weight` in :class:`linear_model.ElasticNet` and
:class:`linear_model:Lasso` for dense feature matrix `X`.
:pr:`15436` by :user:`Christian Lorentzen `.
diff --git a/examples/linear_model/plot_poisson_regression_non_normal_loss.py b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
new file mode 100644
index 0000000000000..ee863dd4198ba
--- /dev/null
+++ b/examples/linear_model/plot_poisson_regression_non_normal_loss.py
@@ -0,0 +1,455 @@
+"""
+======================================
+Poisson regression and non-normal loss
+======================================
+
+This example illustrates the use of log-linear Poisson regression
+on the `French Motor Third-Party Liability Claims dataset
+`_ from [1]_ and compares
+it with models fitted using the least squares error. In this dataset, each sample
+corresponds to an insurance policy, i.e. a contract within an insurance
+company and an individual (policyholder). Available features include driver
+age, vehicle age, vehicle power, etc.
+
+A few definitions: a *claim* is the request made by a policyholder to the
+insurer to compensate for a loss covered by the insurance. The *exposure* is
+the duration of the insurance coverage of a given policy, in years.
+
+Our goal is to predict the expected number of insurance claims (or frequency)
+following car accidents for a policyholder given the historical data over a
+population of policyholders.
+
+.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor
+ Third-Party Liability Claims (November 8, 2018).
+ `doi:10.2139/ssrn.3164764 `_
+
+"""
+print(__doc__)
+
+# Authors: Christian Lorentzen
+# Roman Yurchak
+# License: BSD 3 clause
+import warnings
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from sklearn.datasets import fetch_openml
+from sklearn.dummy import DummyRegressor
+from sklearn.compose import ColumnTransformer
+from sklearn.linear_model import Ridge, PoissonRegressor
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
+from sklearn.preprocessing import OrdinalEncoder
+from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.utils import gen_even_slices
+from sklearn.metrics import auc
+
+from sklearn.metrics import mean_squared_error, mean_absolute_error
+from sklearn.metrics import mean_poisson_deviance
+
+
+def load_mtpl2(n_samples=100000):
+ """Fetch the French Motor Third-Party Liability Claims dataset.
+
+ Parameters
+ ----------
+ n_samples: int or None, default=100000
+ Number of samples to select (for faster run time). If None, the full
+ dataset with 678013 samples is returned.
+ """
+
+ # freMTPL2freq dataset from https://www.openml.org/d/41214
+ df = fetch_openml(data_id=41214, as_frame=True)['data']
+
+ # unquote string fields
+    for column_name in df.columns[df.dtypes.values == object]:
+ df[column_name] = df[column_name].str.strip("'")
+ if n_samples is not None:
+ return df.iloc[:n_samples]
+ return df
+
+
+##############################################################################
+# Let's load the motor claim dataset. We ignore the severity data for this
+# study for the sake of simplicity.
+#
+# We also subsample the data for the sake of computational cost and running
+# time. Using the full dataset would lead to similar conclusions.
+
+df = load_mtpl2(n_samples=300000)
+
+# Correct for unreasonable observations (that might be data error)
+df["Exposure"] = df["Exposure"].clip(upper=1)
+
+##############################################################################
+# The remaining columns can be used to predict the frequency of claim events.
+# Those columns are very heterogeneous with a mix of categorical and numeric
+# variables with different scales, possibly very unevenly distributed.
+#
+# In order to fit linear models with those predictors it is therefore
+# necessary to perform standard feature transformations as follows:
+
+log_scale_transformer = make_pipeline(
+ FunctionTransformer(np.log, validate=False),
+ StandardScaler()
+)
+
+linear_model_preprocessor = ColumnTransformer(
+ [
+ ("passthrough_numeric", "passthrough",
+ ["BonusMalus"]),
+ ("binned_numeric", KBinsDiscretizer(n_bins=10),
+ ["VehAge", "DrivAge"]),
+ ("log_scaled_numeric", log_scale_transformer,
+ ["Density"]),
+ ("onehot_categorical", OneHotEncoder(),
+ ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
+ ],
+ remainder="drop",
+)
+
+##############################################################################
+# The number of claims (``ClaimNb``) is a positive integer that can be modeled
+# as a Poisson distribution. It is then assumed to be the number of discrete
+# events occurring with a constant rate in a given time interval
+# (``Exposure``, in units of years). Here we model the frequency
+# ``y = ClaimNb / Exposure``, which is still a (scaled) Poisson distribution,
+# and use ``Exposure`` as ``sample_weight``.
+
+df["Frequency"] = df["ClaimNb"] / df["Exposure"]
+
+print(
+ pd.cut(df["Frequency"], [-1e-6, 1e-6, 1, 2, 3, 4, 5]).value_counts()
+)
+
+print("Average Frequency = {}"
+ .format(np.average(df["Frequency"], weights=df["Exposure"])))
+
+print("Percentage of zero claims = {0:%}"
+ .format(df.loc[df["ClaimNb"] == 0, "Exposure"].sum() /
+ df["Exposure"].sum()))
+
+##############################################################################
+# It is worth noting that 92 % of policyholders have zero claims, and if we
+# were to convert this problem into a binary classification task, it would be
+# significantly imbalanced.
+#
+# To evaluate the pertinence of the used metrics, we will consider as a
+# baseline a "dummy" estimator that constantly predicts the mean frequency of
+# the training sample.
+
+df_train, df_test = train_test_split(df, random_state=0)
+
+dummy = make_pipeline(
+ linear_model_preprocessor,
+ DummyRegressor(strategy='mean')
+)
+dummy.fit(df_train, df_train["Frequency"],
+ dummyregressor__sample_weight=df_train["Exposure"])
+
+
+def score_estimator(estimator, df_test):
+ """Score an estimator on the test set."""
+
+ y_pred = estimator.predict(df_test)
+
+ print("MSE: %.3f" %
+ mean_squared_error(df_test["Frequency"], y_pred,
+                             sample_weight=df_test["Exposure"]))
+ print("MAE: %.3f" %
+ mean_absolute_error(df_test["Frequency"], y_pred,
+                              sample_weight=df_test["Exposure"]))
+
+ # ignore non-positive predictions, as they are invalid for
+ # the Poisson deviance
+ mask = y_pred > 0
+ if (~mask).any():
+ warnings.warn("Estimator yields non-positive predictions for {} "
+ "samples out of {}. These will be ignored while "
+ "computing the Poisson deviance"
+ .format((~mask).sum(), mask.shape[0]))
+
+ print("mean Poisson deviance: %.3f" %
+ mean_poisson_deviance(df_test["Frequency"][mask],
+ y_pred[mask],
+                                sample_weight=df_test["Exposure"][mask]))
+
+
+print("Constant mean frequency evaluation:")
+score_estimator(dummy, df_test)
+
+##############################################################################
+# We start by modeling the target variable with the least squares linear
+# regression model,
+
+ridge = make_pipeline(linear_model_preprocessor, Ridge(alpha=1.0))
+ridge.fit(df_train, df_train["Frequency"],
+ ridge__sample_weight=df_train["Exposure"])
+
+##############################################################################
+# The Poisson deviance cannot be computed on non-positive values predicted by
+# the model. For models that do return a few non-positive predictions
+# (e.g. :class:`linear_model.Ridge`) we ignore the corresponding samples,
+# meaning that the obtained Poisson deviance is approximate. An alternative
+# approach could be to use the :class:`compose.TransformedTargetRegressor`
+# meta-estimator to map ``y_pred`` to a strictly positive domain.
+
+print("Ridge evaluation:")
+score_estimator(ridge, df_test)
+
+##############################################################################
+# Next we fit the Poisson regressor on the target variable. We set the
+# regularization strength ``alpha`` to 1 over the number of samples in order to
+# mimic the Ridge regressor whose L2 penalty term scales differently with the
+# number of samples.
+
+poisson = make_pipeline(
+ linear_model_preprocessor,
+ PoissonRegressor(alpha=1/df_train.shape[0], max_iter=1000)
+)
+poisson.fit(df_train, df_train["Frequency"],
+ poissonregressor__sample_weight=df_train["Exposure"])
+
+print("PoissonRegressor evaluation:")
+score_estimator(poisson, df_test)
+
+##############################################################################
+# Finally, we will consider a non-linear model, namely a random forest. Random
+# forests do not require the categorical data to be one-hot encoded: instead,
+# we can encode each category label with an arbitrary integer using
+# :class:`preprocessing.OrdinalEncoder`. With this encoding, the forest will
+# treat the categorical features as ordered features, which might not always
+# be the desired behavior. However, this effect is limited for deep enough trees
+# which are able to recover the categorical nature of the features. The main
+# advantage of the :class:`preprocessing.OrdinalEncoder` over the
+# :class:`preprocessing.OneHotEncoder` is that it will make training faster.
+
+rf_preprocessor = ColumnTransformer(
+ [
+ ("categorical", OrdinalEncoder(),
+ ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
+ ("numeric", "passthrough",
+ ["VehAge", "DrivAge", "BonusMalus", "Density"]),
+ ],
+ remainder="drop",
+)
+rf = make_pipeline(
+ rf_preprocessor,
+ RandomForestRegressor(min_weight_fraction_leaf=0.01, n_jobs=2)
+)
+rf.fit(df_train, df_train["Frequency"].values,
+ randomforestregressor__sample_weight=df_train["Exposure"].values)
+
+
+print("RandomForestRegressor evaluation:")
+score_estimator(rf, df_test)
+
+
+##############################################################################
+# Like the Ridge regression above, the random forest model minimizes the
+# conditional squared error, too. However, because of a higher predictive
+# power, it also results in a smaller Poisson deviance than the Poisson
+# regression model.
+#
+# Evaluating models with a single train / test split is prone to random
+# fluctuations. If computing resources allow, it should be verified that
+# cross-validated performance metrics would lead to similar conclusions.
+#
+# The qualitative difference between these models can also be visualized by
+# comparing the histogram of observed target values with that of predicted
+# values:
+
+fig, axes = plt.subplots(2, 4, figsize=(16, 6), sharey=True)
+fig.subplots_adjust(bottom=0.2)
+n_bins = 20
+for row_idx, label, df in zip(range(2),
+ ["train", "test"],
+ [df_train, df_test]):
+ df["Frequency"].hist(bins=np.linspace(-1, 30, n_bins),
+ ax=axes[row_idx, 0])
+
+ axes[row_idx, 0].set_title("Data")
+ axes[row_idx, 0].set_yscale('log')
+ axes[row_idx, 0].set_xlabel("y (observed Frequency)")
+ axes[row_idx, 0].set_ylim([1e1, 5e5])
+ axes[row_idx, 0].set_ylabel(label + " samples")
+
+ for idx, model in enumerate([ridge, poisson, rf]):
+ y_pred = model.predict(df)
+
+ pd.Series(y_pred).hist(bins=np.linspace(-1, 4, n_bins),
+ ax=axes[row_idx, idx+1])
+ axes[row_idx, idx + 1].set(
+ title=model[-1].__class__.__name__,
+ yscale='log',
+ xlabel="y_pred (predicted expected Frequency)"
+ )
+plt.tight_layout()
+
+##############################################################################
+# The experimental data presents a long tail distribution for ``y``. In all
+# models we predict the expected value, so we will necessarily have fewer
+# extreme values. Additionally, the normal distribution used in ``Ridge`` and
+# ``RandomForestRegressor`` has a constant variance, while for the Poisson
+# distribution used in ``PoissonRegressor``, the variance is proportional to
+# the mean predicted value.
+#
+# Thus, among the considered estimators, ``PoissonRegressor`` is better suited
+# for modeling the long tail distribution of the data as compared to the
+# ``Ridge`` and ``RandomForestRegressor`` estimators.
+#
+# To ensure that estimators yield reasonable predictions for different
+# policyholder types, we can bin test samples according to ``y_pred`` returned
+# by each model. Then for each bin, we compare the mean predicted ``y_pred``,
+# with the mean observed target:
+
+
+def _mean_frequency_by_risk_group(y_true, y_pred, sample_weight=None,
+ n_bins=100):
+ """Compare predictions and observations for bins ordered by y_pred.
+
+ We order the samples by ``y_pred`` and split it in bins.
+ In each bin the observed mean is compared with the predicted mean.
+
+ Parameters
+ ----------
+ y_true: array-like of shape (n_samples,)
+ Ground truth (correct) target values.
+ y_pred: array-like of shape (n_samples,)
+ Estimated target values.
+ sample_weight : array-like of shape (n_samples,)
+ Sample weights.
+ n_bins: int
+ Number of bins to use.
+
+ Returns
+ -------
+ bin_centers: ndarray of shape (n_bins,)
+ bin centers
+ y_true_bin: ndarray of shape (n_bins,)
+        average y_true for each bin
+ y_pred_bin: ndarray of shape (n_bins,)
+ average y_pred for each bin
+ """
+ idx_sort = np.argsort(y_pred)
+ bin_centers = np.arange(0, 1, 1/n_bins) + 0.5/n_bins
+ y_pred_bin = np.zeros(n_bins)
+ y_true_bin = np.zeros(n_bins)
+
+ for n, sl in enumerate(gen_even_slices(len(y_true), n_bins)):
+ weights = sample_weight[idx_sort][sl]
+ y_pred_bin[n] = np.average(
+ y_pred[idx_sort][sl], weights=weights
+ )
+ y_true_bin[n] = np.average(
+ y_true[idx_sort][sl],
+ weights=weights
+ )
+ return bin_centers, y_true_bin, y_pred_bin
+
+
+fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(12, 3.5))
+plt.subplots_adjust(wspace=0.3)
+
+for axi, model in zip(ax, [ridge, poisson, rf]):
+ y_pred = model.predict(df_test)
+
+ q, y_true_seg, y_pred_seg = _mean_frequency_by_risk_group(
+ df_test["Frequency"].values,
+ y_pred,
+ sample_weight=df_test["Exposure"].values,
+ n_bins=10)
+
+ axi.plot(q, y_pred_seg, marker='o', linestyle="-", label="predictions")
+ axi.plot(q, y_true_seg, marker='x', linestyle="--", label="observations")
+ axi.set_xlim(0, 1.0)
+ axi.set_ylim(0, 0.6)
+ axi.set(
+ title=model[-1].__class__.__name__,
+ xlabel='Fraction of samples sorted by y_pred',
+ ylabel='Mean Frequency (y_pred)'
+ )
+ axi.legend()
+plt.tight_layout()
+
+##############################################################################
+# The ``Ridge`` regression model can predict very low expected frequencies
+# that do not match the data. It can therefore severely under-estimate the risk
+# for some policyholders.
+#
+# ``PoissonRegressor`` and ``RandomForestRegressor`` show better consistency
+# between predicted and observed targets, especially for low predicted target
+# values.
+#
+# However, for some business applications, we are not necessarily interested
+# in the ability of the model to predict the expected frequency value, but
+# rather in its ability to identify which policyholder groups are the riskiest
+# and which are the safest. In this case, the model evaluation would cast the
+# problem as a
+# ranking problem rather than a regression problem.
+#
+# To compare the 3 models from this perspective, one can plot the fraction of
+# the number of claims vs the fraction of exposure for test samples ordered by
+# the model predictions, from riskiest to safest according to each model:
+
+
+def _cumulated_claims(y_true, y_pred, exposure):
+ idx_sort = np.argsort(y_pred)[::-1] # from riskiest to safest
+ sorted_exposure = exposure[idx_sort]
+ sorted_frequencies = y_true[idx_sort]
+ cumulated_exposure = np.cumsum(sorted_exposure)
+ cumulated_exposure /= cumulated_exposure[-1]
+ cumulated_claims = np.cumsum(sorted_exposure * sorted_frequencies)
+ cumulated_claims /= cumulated_claims[-1]
+ return cumulated_exposure, cumulated_claims
+
+
+fig, ax = plt.subplots(figsize=(8, 8))
+
+for model in [ridge, poisson, rf]:
+ y_pred = model.predict(df_test)
+ cum_exposure, cum_claims = _cumulated_claims(
+ df_test["Frequency"].values,
+ y_pred,
+ df_test["Exposure"].values)
+ area = auc(cum_exposure, cum_claims)
+ label = "{} (area under curve: {:.3f})".format(
+ model[-1].__class__.__name__, area)
+ ax.plot(cum_exposure, cum_claims, linestyle="-", label=label)
+
+# Oracle model: y_pred == y_test
+cum_exposure, cum_claims = _cumulated_claims(
+ df_test["Frequency"].values,
+ df_test["Frequency"].values,
+ df_test["Exposure"].values)
+area = auc(cum_exposure, cum_claims)
+label = "Oracle (area under curve: {:.3f})".format(area)
+ax.plot(cum_exposure, cum_claims, linestyle="-.", color="gray", label=label)
+
+# Random Baseline
+ax.plot([0, 1], [0, 1], linestyle="--", color="black",
+ label="Random baseline")
+ax.set(
+ title="Cumulated number of claims by model",
+ xlabel='Fraction of exposure (from riskiest to safest)',
+ ylabel='Fraction of number of claims'
+)
+ax.legend(loc="lower right")
+
+##############################################################################
+# This plot reveals that the random forest model is slightly better at ranking
+# policyholders by risk profiles even if the absolute values of the predicted
+# expected frequencies are less well calibrated than for the linear Poisson
+# model.
+#
+# All three models are significantly better than chance but also very far from
+# making perfect predictions.
+#
+# This last point is expected due to the nature of the problem: the occurrence
+# of accidents is mostly dominated by circumstantial causes that are not
+# captured in the columns of the dataset or that are indeed random.
+
+plt.show()
diff --git a/examples/linear_model/plot_tweedie_regression_insurance_claims.py b/examples/linear_model/plot_tweedie_regression_insurance_claims.py
new file mode 100644
index 0000000000000..ccd18c8efff99
--- /dev/null
+++ b/examples/linear_model/plot_tweedie_regression_insurance_claims.py
@@ -0,0 +1,596 @@
+"""
+======================================
+Tweedie regression on insurance claims
+======================================
+
+This example illustrates the use of Poisson, Gamma and Tweedie regression on
+the `French Motor Third-Party Liability Claims dataset
+`_, and is inspired by an R tutorial [1]_.
+
+In this dataset, each sample corresponds to an insurance policy, i.e. a
+contract within an insurance company and an individual (policyholder).
+Available features include driver age, vehicle age, vehicle power, etc.
+
+A few definitions: a *claim* is the request made by a policyholder to the
+insurer to compensate for a loss covered by the insurance. The *claim amount*
+is the amount of money that the insurer must pay. The *exposure* is the
+duration of the insurance coverage of a given policy, in years.
+
+Here our goal is to predict the expected value, i.e. the mean, of the total
+claim amount per exposure unit, also referred to as the pure premium.
+
+There are several possibilities to do that, two of which are:
+
+1. Model the number of claims with a Poisson distribution, and the average
+ claim amount per claim, also known as severity, as a Gamma distribution
+ and multiply the predictions of both in order to get the total claim
+ amount.
+2. Model the total claim amount per exposure directly, typically with a Tweedie
+ distribution of Tweedie power :math:`p \\in (1, 2)`.
+
+In this example we will illustrate both approaches. We start by defining a few
+helper functions for loading the data and visualizing results.
+
+.. [1] A. Noll, R. Salzmann and M.V. Wuthrich, Case Study: French Motor
+ Third-Party Liability Claims (November 8, 2018). `doi:10.2139/ssrn.3164764
+ `_
+
+"""
+print(__doc__)
+
+# Authors: Christian Lorentzen
+# Roman Yurchak
+# Olivier Grisel
+# License: BSD 3 clause
+from functools import partial
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+from sklearn.datasets import fetch_openml
+from sklearn.compose import ColumnTransformer
+from sklearn.linear_model import PoissonRegressor, GammaRegressor
+from sklearn.linear_model import TweedieRegressor
+from sklearn.metrics import mean_tweedie_deviance
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
+from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
+
+from sklearn.metrics import mean_absolute_error, mean_squared_error, auc
+
+
+def load_mtpl2(n_samples=100000):
+ """Fetch the French Motor Third-Party Liability Claims dataset.
+
+ Parameters
+ ----------
+ n_samples: int, default=100000
+ number of samples to select (for faster run time). Full dataset has
+ 678013 samples.
+ """
+ # freMTPL2freq dataset from https://www.openml.org/d/41214
+ df_freq = fetch_openml(data_id=41214, as_frame=True)['data']
+    df_freq['IDpol'] = df_freq['IDpol'].astype(np.int64)
+ df_freq.set_index('IDpol', inplace=True)
+
+ # freMTPL2sev dataset from https://www.openml.org/d/41215
+ df_sev = fetch_openml(data_id=41215, as_frame=True)['data']
+
+ # sum ClaimAmount over identical IDs
+ df_sev = df_sev.groupby('IDpol').sum()
+
+ df = df_freq.join(df_sev, how="left")
+ df["ClaimAmount"].fillna(0, inplace=True)
+
+ # unquote string fields
+    for column_name in df.columns[df.dtypes.values == object]:
+ df[column_name] = df[column_name].str.strip("'")
+ return df.iloc[:n_samples]
+
+
+def plot_obs_pred(df, feature, weight, observed, predicted, y_label=None,
+ title=None, ax=None, fill_legend=False):
+ """Plot observed and predicted - aggregated per feature level.
+
+ Parameters
+ ----------
+ df : DataFrame
+ input data
+ feature: str
+ a column name of df for the feature to be plotted
+ weight : str
+ column name of df with the values of weights or exposure
+ observed : str
+ a column name of df with the observed target
+ predicted : DataFrame
+ a dataframe, with the same index as df, with the predicted target
+ fill_legend : bool, default=False
+ whether to show fill_between legend
+ """
+ # aggregate observed and predicted variables by feature level
+ df_ = df.loc[:, [feature, weight]].copy()
+ df_["observed"] = df[observed] * df[weight]
+ df_["predicted"] = predicted * df[weight]
+ df_ = (
+        df_.groupby([feature])[[weight, "observed", "predicted"]]
+ .sum()
+ .assign(observed=lambda x: x["observed"] / x[weight])
+ .assign(predicted=lambda x: x["predicted"] / x[weight])
+ )
+
+ ax = df_.loc[:, ["observed", "predicted"]].plot(style=".", ax=ax)
+ y_max = df_.loc[:, ["observed", "predicted"]].values.max() * 0.8
+ p2 = ax.fill_between(
+ df_.index,
+ 0,
+ y_max * df_[weight] / df_[weight].values.max(),
+ color="g",
+ alpha=0.1,
+ )
+ if fill_legend:
+ ax.legend([p2], ["{} distribution".format(feature)])
+ ax.set(
+ ylabel=y_label if y_label is not None else None,
+ title=title if title is not None else "Train: Observed vs Predicted",
+ )
+
+
+def score_estimator(
+ estimator, X_train, X_test, df_train, df_test, target, weights,
+ tweedie_powers=None,
+):
+ """Evaluate an estimator on train and test sets with different metrics"""
+
+ metrics = [
+ ("D² explained", None), # Use default scorer if it exists
+ ("mean abs. error", mean_absolute_error),
+ ("mean squared error", mean_squared_error),
+ ]
+ if tweedie_powers:
+ metrics += [(
+ "mean Tweedie dev p={:.4f}".format(power),
+ partial(mean_tweedie_deviance, power=power)
+ ) for power in tweedie_powers]
+
+ res = []
+ for subset_label, X, df in [
+ ("train", X_train, df_train),
+ ("test", X_test, df_test),
+ ]:
+ y, _weights = df[target], df[weights]
+ for score_label, metric in metrics:
+ if isinstance(estimator, tuple) and len(estimator) == 2:
+ # Score the model consisting of the product of frequency and
+ # severity models.
+ est_freq, est_sev = estimator
+ y_pred = est_freq.predict(X) * est_sev.predict(X)
+ else:
+ y_pred = estimator.predict(X)
+
+ if metric is None:
+ if not hasattr(estimator, "score"):
+ continue
+ score = estimator.score(X, y, _weights)
+ else:
+                score = metric(y, y_pred, sample_weight=_weights)
+
+ res.append(
+ {"subset": subset_label, "metric": score_label, "score": score}
+ )
+
+ res = (
+ pd.DataFrame(res)
+ .set_index(["metric", "subset"])
+ .score.unstack(-1)
+ .round(4)
+ .loc[:, ['train', 'test']]
+ )
+ return res
+
+
+##############################################################################
+# Loading datasets, basic feature extraction and target definitions
+# -----------------------------------------------------------------
+#
+# We construct the freMTPL2 dataset by joining the freMTPL2freq table,
+# containing the number of claims (``ClaimNb``), with the freMTPL2sev table,
+# containing the claim amount (``ClaimAmount``) for the same policy ids
+# (``IDpol``).
+
+df = load_mtpl2(n_samples=60000)
+
+# Note: filter out claims with zero amount, as the severity model
+# requires strictly positive target values.
+df.loc[(df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1), "ClaimNb"] = 0
+
+# Correct for unreasonable observations (that might be data error)
+# and a few exceptionally large claim amounts
+df["ClaimNb"] = df["ClaimNb"].clip(upper=4)
+df["Exposure"] = df["Exposure"].clip(upper=1)
+df["ClaimAmount"] = df["ClaimAmount"].clip(upper=200000)
+
+log_scale_transformer = make_pipeline(
+ FunctionTransformer(func=np.log),
+ StandardScaler()
+)
+
+column_trans = ColumnTransformer(
+ [
+ ("binned_numeric", KBinsDiscretizer(n_bins=10),
+ ["VehAge", "DrivAge"]),
+ ("onehot_categorical", OneHotEncoder(),
+ ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
+ ("passthrough_numeric", "passthrough",
+ ["BonusMalus"]),
+ ("log_scaled_numeric", log_scale_transformer,
+ ["Density"]),
+ ],
+ remainder="drop",
+)
+X = column_trans.fit_transform(df)
+
+# Insurance companies are interested in modeling the Pure Premium, that is
+# the expected total claim amount per unit of exposure for each policyholder
+# in their portfolio:
+df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]
+
+# This can be indirectly approximated by a 2-step modeling: the product of the
+# Frequency times the average claim amount per claim:
+df["Frequency"] = df["ClaimNb"] / df["Exposure"]
+df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1)
+
+with pd.option_context("display.max_columns", 15):
+ print(df[df.ClaimAmount > 0].head())
+
+##############################################################################
+#
+# Frequency model -- Poisson distribution
+# ---------------------------------------
+#
+# The number of claims (``ClaimNb``) is a positive integer (0 included).
+# Thus, this target can be modelled by a Poisson distribution.
+# It is then assumed to be the number of discrete events occurring with a
+# constant rate in a given time interval (``Exposure``, in units of years).
+# Here we model the frequency ``y = ClaimNb / Exposure``, which is still a
+# (scaled) Poisson distribution, and use ``Exposure`` as `sample_weight`.
+
+df_train, df_test, X_train, X_test = train_test_split(df, X, random_state=0)
+
+# The parameters of the model are estimated by minimizing the Poisson deviance
+# on the training set via a quasi-Newton solver: l-BFGS. Some of the features
+# are collinear, so we use a weak penalization to avoid numerical issues.
+glm_freq = PoissonRegressor(alpha=1e-3, max_iter=400)
+glm_freq.fit(X_train, df_train["Frequency"],
+ sample_weight=df_train["Exposure"])
+
+scores = score_estimator(
+ glm_freq,
+ X_train,
+ X_test,
+ df_train,
+ df_test,
+ target="Frequency",
+ weights="Exposure",
+)
+print("Evaluation of PoissonRegressor on target Frequency")
+print(scores)
+
+##############################################################################
+# We can visually compare observed and predicted values, aggregated by the
+# drivers age (``DrivAge``), vehicle age (``VehAge``) and the insurance
+# bonus/malus (``BonusMalus``).
+
+fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(16, 8))
+fig.subplots_adjust(hspace=0.3, wspace=0.2)
+
+plot_obs_pred(
+ df=df_train,
+ feature="DrivAge",
+ weight="Exposure",
+ observed="Frequency",
+ predicted=glm_freq.predict(X_train),
+ y_label="Claim Frequency",
+ title="train data",
+ ax=ax[0, 0],
+)
+
+plot_obs_pred(
+ df=df_test,
+ feature="DrivAge",
+ weight="Exposure",
+ observed="Frequency",
+ predicted=glm_freq.predict(X_test),
+ y_label="Claim Frequency",
+ title="test data",
+ ax=ax[0, 1],
+ fill_legend=True
+)
+
+plot_obs_pred(
+ df=df_test,
+ feature="VehAge",
+ weight="Exposure",
+ observed="Frequency",
+ predicted=glm_freq.predict(X_test),
+ y_label="Claim Frequency",
+ title="test data",
+ ax=ax[1, 0],
+ fill_legend=True
+)
+
+plot_obs_pred(
+ df=df_test,
+ feature="BonusMalus",
+ weight="Exposure",
+ observed="Frequency",
+ predicted=glm_freq.predict(X_test),
+ y_label="Claim Frequency",
+ title="test data",
+ ax=ax[1, 1],
+ fill_legend=True
+)
+
+
+##############################################################################
+# According to the observed data, the frequency of accidents is higher for
+# drivers younger than 30 years old, and is positively correlated with the
+# `BonusMalus` variable. Our model is able to mostly correctly model this
+# behaviour.
+#
+# Severity Model - Gamma distribution
+# ------------------------------------
+# The mean claim amount or severity (`AvgClaimAmount`) can be empirically
+# shown to follow approximately a Gamma distribution. We fit a GLM model for
+# the severity with the same features as the frequency model.
+#
+# Note:
+#
+# - We filter out ``ClaimAmount == 0`` as the Gamma distribution has support
+# on :math:`(0, \infty)`, not :math:`[0, \infty)`.
+# - We use ``ClaimNb`` as `sample_weight` to account for policies that contain
+# more than one claim.
+
+mask_train = df_train["ClaimAmount"] > 0
+mask_test = df_test["ClaimAmount"] > 0
+
+glm_sev = GammaRegressor(alpha=10., max_iter=10000)
+
+glm_sev.fit(
+ X_train[mask_train.values],
+ df_train.loc[mask_train, "AvgClaimAmount"],
+ sample_weight=df_train.loc[mask_train, "ClaimNb"],
+)
+
+scores = score_estimator(
+ glm_sev,
+ X_train[mask_train.values],
+ X_test[mask_test.values],
+ df_train[mask_train],
+ df_test[mask_test],
+ target="AvgClaimAmount",
+ weights="ClaimNb",
+)
+print("Evaluation of GammaRegressor on target AvgClaimAmount")
+print(scores)
+
+##############################################################################
+# Here, the scores for the test data call for caution as they are
+# significantly worse than for the training data, indicating overfitting despite
+# the strong regularization.
+#
+# Note that the resulting model is the average claim amount per claim. As
+# such, it is conditional on having at least one claim, and cannot be used to
+# predict the average claim amount per policy in general.
+
+print("Mean AvgClaim Amount per policy: %.2f "
+ % df_train["AvgClaimAmount"].mean())
+print("Mean AvgClaim Amount | NbClaim > 0: %.2f"
+ % df_train["AvgClaimAmount"][df_train["AvgClaimAmount"] > 0].mean())
+print("Predicted Mean AvgClaim Amount | NbClaim > 0: %.2f"
+ % glm_sev.predict(X_train).mean())
+
+
+##############################################################################
+# We can visually compare observed and predicted values, aggregated for
+# the drivers age (``DrivAge``).
+
+fig, ax = plt.subplots(ncols=1, nrows=2, figsize=(16, 6))
+
+plot_obs_pred(
+ df=df_train.loc[mask_train],
+ feature="DrivAge",
+ weight="Exposure",
+ observed="AvgClaimAmount",
+ predicted=glm_sev.predict(X_train[mask_train.values]),
+ y_label="Average Claim Severity",
+ title="train data",
+ ax=ax[0],
+)
+
+plot_obs_pred(
+ df=df_test.loc[mask_test],
+ feature="DrivAge",
+ weight="Exposure",
+ observed="AvgClaimAmount",
+ predicted=glm_sev.predict(X_test[mask_test.values]),
+ y_label="Average Claim Severity",
+ title="test data",
+ ax=ax[1],
+ fill_legend=True
+)
+plt.tight_layout()
+
+##############################################################################
+# Overall, the drivers age (``DrivAge``) has a weak impact on the claim
+# severity, both in observed and predicted data.
+#
+# Pure Premium Modeling via a Product Model vs single TweedieRegressor
+# --------------------------------------------------------------------
+# As mentioned in the introduction, the total claim amount per unit of
+# exposure can be modeled as the product of the prediction of the
+# frequency model by the prediction of the severity model.
+#
+# Alternatively, one can directly model the total loss with a unique
+# Compound Poisson Gamma generalized linear model (with a log link function).
+# This model is a special case of the Tweedie GLM with a "power" parameter
+# :math:`p \in (1, 2)`. Here, we fix the `power` parameter of the Tweedie
+# model a priori to some arbitrary value (1.9) in the valid range. Ideally one
+# would select this value via grid-search by minimizing the negative
+# log-likelihood of the Tweedie model, but unfortunately the current
+# implementation does not allow for this (yet).
+#
+# We will compare the performance of both approaches.
+# To quantify the performance of both models, one can compute
+# the mean deviance of the train and test data assuming a Compound
+# Poisson-Gamma distribution of the total claim amount. This is equivalent to
+# a Tweedie distribution with a `power` parameter between 1 and 2.
+#
+# The :func:`sklearn.metrics.mean_tweedie_deviance` depends on a `power`
+# parameter. As we do not know the true value of the `power` parameter, we here
+# compute the mean deviances for a grid of possible values, and compare the
+# models side by side, i.e. we compare them at identical values of `power`.
+# Ideally, we hope that one model will be consistently better than the other,
+# regardless of `power`.
+
+glm_pure_premium = TweedieRegressor(power=1.9, alpha=.1, max_iter=10000)
+glm_pure_premium.fit(X_train, df_train["PurePremium"],
+ sample_weight=df_train["Exposure"])
+
+tweedie_powers = [1.5, 1.7, 1.8, 1.9, 1.99, 1.999, 1.9999]
+
+scores_product_model = score_estimator(
+ (glm_freq, glm_sev),
+ X_train,
+ X_test,
+ df_train,
+ df_test,
+ target="PurePremium",
+ weights="Exposure",
+ tweedie_powers=tweedie_powers,
+)
+
+scores_glm_pure_premium = score_estimator(
+ glm_pure_premium,
+ X_train,
+ X_test,
+ df_train,
+ df_test,
+ target="PurePremium",
+ weights="Exposure",
+ tweedie_powers=tweedie_powers
+)
+
+scores = pd.concat([scores_product_model, scores_glm_pure_premium],
+ axis=1, sort=True,
+ keys=('Product Model', 'TweedieRegressor'))
+print("Evaluation of the Product Model and the Tweedie Regressor "
+ "on target PurePremium")
+with pd.option_context('display.expand_frame_repr', False):
+ print(scores)
+
+##############################################################################
+# In this example, both modeling approaches yield comparable performance
+# metrics. For implementation reasons, the percentage of explained variance
+# :math:`D^2` is not available for the product model.
+#
+# We can additionally validate these models by comparing observed and
+# predicted total claim amount over the test and train subsets. We see that,
+# on average, both models tend to underestimate the total claim (but this
+# behavior depends on the amount of regularization).
+
+res = []
+for subset_label, X, df in [
+ ("train", X_train, df_train),
+ ("test", X_test, df_test),
+]:
+ exposure = df["Exposure"].values
+ res.append(
+ {
+ "subset": subset_label,
+ "observed": df["ClaimAmount"].values.sum(),
+ "predicted, frequency*severity model": np.sum(
+ exposure * glm_freq.predict(X) * glm_sev.predict(X)
+ ),
+ "predicted, tweedie, power=%.2f"
+ % glm_pure_premium.power: np.sum(
+ exposure * glm_pure_premium.predict(X)),
+ }
+ )
+
+print(pd.DataFrame(res).set_index("subset").T)
+
+##############################################################################
+# Finally, we can compare the two models using a plot of cumulated claims: for
+# each model, the policyholders are ranked from safest to riskiest and the
+# fraction of observed total cumulated claims is plotted on the y axis. This
+# plot is often called the ordered Lorenz curve of the model.
+#
+# The Gini coefficient (based on the area under the curve) can be used as a
+# model selection metric to quantify the ability of the model to rank
+# policyholders. Note that this metric does not reflect the ability of the
+# models to make accurate predictions in terms of absolute value of total
+# claim amounts but only in terms of relative amounts as a ranking metric.
+#
+# Both models are able to rank policyholders by riskiness significantly
+# better than chance although they are also both far from perfect due to the
+# natural difficulty of the prediction problem from few features.
+#
+# Note that the Gini index only characterizes the ranking performance of the
+# model but not its calibration: any monotonic transformation of the
+# predictions leaves the Gini index of the model unchanged.
+#
+# Finally, one should highlight that the Compound Poisson Gamma model that
+# is directly fit on the pure premium is operationally simpler to develop and
+# maintain as it consists of a single scikit-learn estimator instead of a
+# pair of models, each with its own set of hyperparameters.
+
+
+def lorenz_curve(y_true, y_pred, exposure):
+ y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
+ exposure = np.asarray(exposure)
+
+ # order samples by increasing predicted risk:
+ ranking = np.argsort(y_pred)
+ ranked_exposure = exposure[ranking]
+ ranked_pure_premium = y_true[ranking]
+ cumulated_claim_amount = np.cumsum(ranked_pure_premium * ranked_exposure)
+ cumulated_claim_amount /= cumulated_claim_amount[-1]
+ cumulated_samples = np.linspace(0, 1, len(cumulated_claim_amount))
+ return cumulated_samples, cumulated_claim_amount
+
+
+fig, ax = plt.subplots(figsize=(8, 8))
+
+y_pred_product = glm_freq.predict(X_test) * glm_sev.predict(X_test)
+y_pred_total = glm_pure_premium.predict(X_test)
+
+for label, y_pred in [("Frequency * Severity model", y_pred_product),
+ ("Compound Poisson Gamma", y_pred_total)]:
+ ordered_samples, cum_claims = lorenz_curve(
+ df_test["PurePremium"], y_pred, df_test["Exposure"])
+ gini = 1 - 2 * auc(ordered_samples, cum_claims)
+ label += " (Gini index: {:.3f})".format(gini)
+ ax.plot(ordered_samples, cum_claims, linestyle="-", label=label)
+
+# Oracle model: y_pred == y_test
+ordered_samples, cum_claims = lorenz_curve(
+ df_test["PurePremium"], df_test["PurePremium"], df_test["Exposure"])
+gini = 1 - 2 * auc(ordered_samples, cum_claims)
+label = "Oracle (Gini index: {:.3f})".format(gini)
+ax.plot(ordered_samples, cum_claims, linestyle="-.", color="gray",
+ label=label)
+
+# Random baseline
+ax.plot([0, 1], [0, 1], linestyle="--", color="black",
+ label="Random baseline")
+ax.set(
+ title="Lorenz Curves",
+ xlabel=('Fraction of policyholders\n'
+ '(ordered by model from safest to riskiest)'),
+ ylabel='Fraction of total claim amount'
+)
+ax.legend(loc="upper left")
+plt.show()
diff --git a/sklearn/_loss/__init__.py b/sklearn/_loss/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/sklearn/_loss/glm_distribution.py b/sklearn/_loss/glm_distribution.py
new file mode 100644
index 0000000000000..cb20fda1c022d
--- /dev/null
+++ b/sklearn/_loss/glm_distribution.py
@@ -0,0 +1,355 @@
+"""
+Distribution functions used in GLM
+"""
+
+# Author: Christian Lorentzen
+# License: BSD 3 clause
+
+from abc import ABCMeta, abstractmethod
+from collections import namedtuple
+import numbers
+
+import numpy as np
+from scipy.special import xlogy
+
+
+DistributionBoundary = namedtuple("DistributionBoundary",
+ ("value", "inclusive"))
+
+
+class ExponentialDispersionModel(metaclass=ABCMeta):
+ r"""Base class for reproductive Exponential Dispersion Models (EDM).
+
+ The pdf of :math:`Y\sim \mathrm{EDM}(y_\textrm{pred}, \phi)` is given by
+
+ .. math:: p(y| \theta, \phi) = c(y, \phi)
+ \exp\left(\frac{\theta y-A(\theta)}{\phi}\right)
+ = \tilde{c}(y, \phi)
+ \exp\left(-\frac{d(y, y_\textrm{pred})}{2\phi}\right)
+
+ with mean :math:`\mathrm{E}[Y] = A'(\theta) = y_\textrm{pred}`,
+ variance :math:`\mathrm{Var}[Y] = \phi \cdot v(y_\textrm{pred})`,
+ unit variance :math:`v(y_\textrm{pred})` and
+ unit deviance :math:`d(y,y_\textrm{pred})`.
+
+ Methods
+ -------
+ deviance
+ deviance_derivative
+ in_y_range
+ unit_deviance
+ unit_deviance_derivative
+ unit_variance
+
+ References
+ ----------
+ https://en.wikipedia.org/wiki/Exponential_dispersion_model.
+ """
+
+ def in_y_range(self, y):
+ """Returns ``True`` if y is in the valid range of Y~EDM.
+
+ Parameters
+ ----------
+ y : array of shape (n_samples,)
+ Target values.
+ """
+ # Note that currently supported distributions have +inf upper bound
+
+ if not isinstance(self._lower_bound, DistributionBoundary):
+ raise TypeError('_lower_bound attribute must be of type '
+ 'DistributionBoundary')
+
+ if self._lower_bound.inclusive:
+ return np.greater_equal(y, self._lower_bound.value)
+ else:
+ return np.greater(y, self._lower_bound.value)
+
+ @abstractmethod
+ def unit_variance(self, y_pred):
+ r"""Compute the unit variance function.
+
+ The unit variance :math:`v(y_\textrm{pred})` determines the variance as
+ a function of the mean :math:`y_\textrm{pred}` by
+ :math:`\mathrm{Var}[Y_i] = \phi/s_i*v(y_\textrm{pred}_i)`.
+ It can also be derived from the unit deviance
+ :math:`d(y,y_\textrm{pred})` as
+
+        .. math:: v(y_\textrm{pred}) = \frac{2}{
+                  \frac{\partial^2 d(y,y_\textrm{pred})}{
+                  \partial y_\textrm{pred}^2}}\big|_{y=y_\textrm{pred}}
+
+ See also :func:`variance`.
+
+ Parameters
+ ----------
+ y_pred : array of shape (n_samples,)
+ Predicted mean.
+ """
+
+ @abstractmethod
+ def unit_deviance(self, y, y_pred, check_input=False):
+ r"""Compute the unit deviance.
+
+ The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the
+ log-likelihood as
+ :math:`d(y,y_\textrm{pred}) = -2\phi\cdot
+ \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).`
+
+ Parameters
+ ----------
+ y : array of shape (n_samples,)
+ Target values.
+
+ y_pred : array of shape (n_samples,)
+ Predicted mean.
+
+ check_input : bool, default=False
+ If True raise an exception on invalid y or y_pred values, otherwise
+            they will be propagated as NaN.
+
+        Returns
+ -------
+ deviance: array of shape (n_samples,)
+ Computed deviance
+ """
+
+ def unit_deviance_derivative(self, y, y_pred):
+ r"""Compute the derivative of the unit deviance w.r.t. y_pred.
+
+ The derivative of the unit deviance is given by
+        :math:`\frac{\partial}{\partial y_\textrm{pred}}d(y,y_\textrm{pred})
+ = -2\frac{y-y_\textrm{pred}}{v(y_\textrm{pred})}`
+ with unit variance :math:`v(y_\textrm{pred})`.
+
+ Parameters
+ ----------
+ y : array of shape (n_samples,)
+ Target values.
+
+ y_pred : array of shape (n_samples,)
+ Predicted mean.
+ """
+ return -2 * (y - y_pred) / self.unit_variance(y_pred)
+
+ def deviance(self, y, y_pred, weights=1):
+ r"""Compute the deviance.
+
+ The deviance is a weighted sum of the per sample unit deviances,
+ :math:`D = \sum_i s_i \cdot d(y_i, y_\textrm{pred}_i)`
+ with weights :math:`s_i` and unit deviance
+ :math:`d(y,y_\textrm{pred})`.
+ In terms of the log-likelihood it is :math:`D = -2\phi\cdot
+        \left(loglike(y,y_\textrm{pred},\frac{\phi}{s})
+        - loglike(y,y,\frac{\phi}{s})\right)`.
+
+ Parameters
+ ----------
+ y : array of shape (n_samples,)
+ Target values.
+
+ y_pred : array of shape (n_samples,)
+ Predicted mean.
+
+ weights : {int, array of shape (n_samples,)}, default=1
+            Weights or exposure to which the variance is inversely proportional.
+ """
+ return np.sum(weights * self.unit_deviance(y, y_pred))
+
+ def deviance_derivative(self, y, y_pred, weights=1):
+ r"""Compute the derivative of the deviance w.r.t. y_pred.
+
+ It gives :math:`\frac{\partial}{\partial y_\textrm{pred}}
+        D(y, y_\textrm{pred}; weights)`.
+
+ Parameters
+ ----------
+ y : array, shape (n_samples,)
+ Target values.
+
+ y_pred : array, shape (n_samples,)
+ Predicted mean.
+
+ weights : {int, array of shape (n_samples,)}, default=1
+            Weights or exposure to which the variance is inversely proportional.
+ """
+ return weights * self.unit_deviance_derivative(y, y_pred)
+
+
+class TweedieDistribution(ExponentialDispersionModel):
+ r"""A class for the Tweedie distribution.
+
+ A Tweedie distribution with mean :math:`y_\textrm{pred}=\mathrm{E}[Y]`
+    is uniquely defined by its mean-variance relationship
+    :math:`\mathrm{Var}[Y] \propto y_\textrm{pred}^{power}`.
+
+ Special cases are:
+
+ ===== ================
+ Power Distribution
+ ===== ================
+ 0 Normal
+ 1 Poisson
+ (1,2) Compound Poisson
+ 2 Gamma
+ 3 Inverse Gaussian
+    ===== ================
+
+    Parameters
+    ----------
+    power : float, default=0
+            The variance power of the `unit_variance`
+            :math:`v(y_\textrm{pred}) = y_\textrm{pred}^{power}`.
+            For ``0 < power < 1``, no distribution exists.
+    """
+    def __init__(self, power=0):
+        self.power = power
+
+    @property
+    def power(self):
+        return self._power
+
+    @power.setter
+    def power(self, power):
+        # We use a property with a setter to keep the lower bound of the
+        # target domain in sync when the power parameter is updated, e.g. in
+        # a grid search.
+        if not isinstance(power, numbers.Real):
+            raise TypeError('power must be a real number, input was {0}'
+                            .format(power))
+
+        if power <= 0:
+            # Extreme Stable or Normal distribution
+            self._lower_bound = DistributionBoundary(-np.inf, inclusive=False)
+        elif 0 < power < 1:
+            raise ValueError('Tweedie distribution is only defined for '
+                             'power<=0 and power>=1.')
+ elif 1 <= power < 2:
+ # Poisson or Compound Poisson distribution
+ self._lower_bound = DistributionBoundary(0, inclusive=True)
+ elif power >= 2:
+ # Gamma, Positive Stable, Inverse Gaussian distributions
+ self._lower_bound = DistributionBoundary(0, inclusive=False)
+ else: # pragma: no cover
+ # this branch should be unreachable.
+ raise ValueError
+
+ self._power = power
+
+ def unit_variance(self, y_pred):
+        r"""Compute the unit variance of a Tweedie distribution
+ v(y_\textrm{pred})=y_\textrm{pred}**power.
+
+ Parameters
+ ----------
+ y_pred : array of shape (n_samples,)
+ Predicted mean.
+ """
+ return np.power(y_pred, self.power)
+
+ def unit_deviance(self, y, y_pred, check_input=False):
+ r"""Compute the unit deviance.
+
+ The unit_deviance :math:`d(y,y_\textrm{pred})` can be defined by the
+ log-likelihood as
+ :math:`d(y,y_\textrm{pred}) = -2\phi\cdot
+ \left(loglike(y,y_\textrm{pred},\phi) - loglike(y,y,\phi)\right).`
+
+ Parameters
+ ----------
+ y : array of shape (n_samples,)
+ Target values.
+
+ y_pred : array of shape (n_samples,)
+ Predicted mean.
+
+ check_input : bool, default=False
+ If True raise an exception on invalid y or y_pred values, otherwise
+            they will be propagated as NaN.
+
+        Returns
+ -------
+ deviance: array of shape (n_samples,)
+ Computed deviance
+ """
+ p = self.power
+
+ if check_input:
+ message = ("Mean Tweedie deviance error with power={} can only be "
+ "used on ".format(p))
+ if p < 0:
+                # 'Extreme stable', y any real number, y_pred > 0
+ if (y_pred <= 0).any():
+ raise ValueError(message + "strictly positive y_pred.")
+ elif p == 0:
+ # Normal, y and y_pred can be any real number
+ pass
+ elif 0 < p < 1:
+ raise ValueError("Tweedie deviance is only defined for "
+ "power<=0 and power>=1.")
+ elif 1 <= p < 2:
+                # Poisson and Compound Poisson distribution, y >= 0, y_pred > 0
+ if (y < 0).any() or (y_pred <= 0).any():
+ raise ValueError(message + "non-negative y and strictly "
+ "positive y_pred.")
+ elif p >= 2:
+ # Gamma and Extreme stable distribution, y and y_pred > 0
+ if (y <= 0).any() or (y_pred <= 0).any():
+ raise ValueError(message
+ + "strictly positive y and y_pred.")
+ else: # pragma: nocover
+ # Unreachable statement
+ raise ValueError
+
+ if p < 0:
+            # 'Extreme stable', y any real number, y_pred > 0
+ dev = 2 * (np.power(np.maximum(y, 0), 2-p) / ((1-p) * (2-p))
+ - y * np.power(y_pred, 1-p) / (1-p)
+ + np.power(y_pred, 2-p) / (2-p))
+
+ elif p == 0:
+ # Normal distribution, y and y_pred any real number
+ dev = (y - y_pred)**2
+ elif p < 1:
+ raise ValueError("Tweedie deviance is only defined for power<=0 "
+ "and power>=1.")
+ elif p == 1:
+ # Poisson distribution
+ dev = 2 * (xlogy(y, y/y_pred) - y + y_pred)
+ elif p == 2:
+ # Gamma distribution
+ dev = 2 * (np.log(y_pred/y) + y/y_pred - 1)
+ else:
+ dev = 2 * (np.power(y, 2-p) / ((1-p) * (2-p))
+ - y * np.power(y_pred, 1-p) / (1-p)
+ + np.power(y_pred, 2-p) / (2-p))
+ return dev
+
+
+class NormalDistribution(TweedieDistribution):
+ """Class for the Normal (aka Gaussian) distribution"""
+ def __init__(self):
+ super().__init__(power=0)
+
+
+class PoissonDistribution(TweedieDistribution):
+ """Class for the scaled Poisson distribution"""
+ def __init__(self):
+ super().__init__(power=1)
+
+
+class GammaDistribution(TweedieDistribution):
+ """Class for the Gamma distribution"""
+ def __init__(self):
+ super().__init__(power=2)
+
+
+class InverseGaussianDistribution(TweedieDistribution):
+ """Class for the scaled InverseGaussianDistribution distribution"""
+ def __init__(self):
+ super().__init__(power=3)
+
+
+EDM_DISTRIBUTIONS = {
+ 'normal': NormalDistribution,
+ 'poisson': PoissonDistribution,
+ 'gamma': GammaDistribution,
+ 'inverse-gaussian': InverseGaussianDistribution,
+}
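The special cases listed in the table above can be checked directly against the `unit_deviance` branches. A minimal sketch, not part of the patch, assuming the `sklearn._loss.glm_distribution` module layout introduced in this file:

    import numpy as np
    from scipy.special import xlogy

    from sklearn._loss.glm_distribution import TweedieDistribution

    y = np.array([0.5, 1.0, 2.0])
    y_pred = np.array([0.6, 0.9, 2.5])

    # power=0 reduces to the squared error (Normal case)
    normal = TweedieDistribution(power=0)
    assert np.allclose(normal.unit_deviance(y, y_pred), (y - y_pred) ** 2)

    # power=1 reduces to the Poisson deviance 2*(y*log(y/y_pred) - y + y_pred)
    poisson = TweedieDistribution(power=1)
    assert np.allclose(poisson.unit_deviance(y, y_pred),
                       2 * (xlogy(y, y / y_pred) - y + y_pred))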
diff --git a/sklearn/_loss/tests/__init__.py b/sklearn/_loss/tests/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/sklearn/_loss/tests/test_glm_distribution.py b/sklearn/_loss/tests/test_glm_distribution.py
new file mode 100644
index 0000000000000..cb4c5ae07e4d1
--- /dev/null
+++ b/sklearn/_loss/tests/test_glm_distribution.py
@@ -0,0 +1,112 @@
+# Authors: Christian Lorentzen
+#
+# License: BSD 3 clause
+import numpy as np
+from numpy.testing import (
+ assert_allclose,
+ assert_array_equal,
+)
+from scipy.optimize import check_grad
+import pytest
+
+from sklearn._loss.glm_distribution import (
+ TweedieDistribution,
+ NormalDistribution, PoissonDistribution,
+ GammaDistribution, InverseGaussianDistribution,
+ DistributionBoundary
+)
+
+
+@pytest.mark.parametrize(
+ 'family, expected',
+ [(NormalDistribution(), [True, True, True]),
+ (PoissonDistribution(), [False, True, True]),
+ (TweedieDistribution(power=1.5), [False, True, True]),
+ (GammaDistribution(), [False, False, True]),
+ (InverseGaussianDistribution(), [False, False, True]),
+ (TweedieDistribution(power=4.5), [False, False, True])])
+def test_family_bounds(family, expected):
+ """Test the valid range of distributions at -1, 0, 1."""
+ result = family.in_y_range([-1, 0, 1])
+ assert_array_equal(result, expected)
+
+
+def test_invalid_distribution_bound():
+ dist = TweedieDistribution()
+ dist._lower_bound = 0
+ with pytest.raises(TypeError,
+ match="must be of type DistributionBoundary"):
+ dist.in_y_range([-1, 0, 1])
+
+
+def test_tweedie_distribution_power():
+ msg = "distribution is only defined for power<=0 and power>=1"
+ with pytest.raises(ValueError, match=msg):
+ TweedieDistribution(power=0.5)
+
+ with pytest.raises(TypeError, match="must be a real number"):
+ TweedieDistribution(power=1j)
+
+ with pytest.raises(TypeError, match="must be a real number"):
+ dist = TweedieDistribution()
+ dist.power = 1j
+
+ dist = TweedieDistribution()
+ assert isinstance(dist._lower_bound, DistributionBoundary)
+
+ assert dist._lower_bound.inclusive is False
+ dist.power = 1
+ assert dist._lower_bound.value == 0.0
+ assert dist._lower_bound.inclusive is True
+
+
+@pytest.mark.parametrize(
+ 'family, chk_values',
+ [(NormalDistribution(), [-1.5, -0.1, 0.1, 2.5]),
+ (PoissonDistribution(), [0.1, 1.5]),
+ (GammaDistribution(), [0.1, 1.5]),
+ (InverseGaussianDistribution(), [0.1, 1.5]),
+ (TweedieDistribution(power=-2.5), [0.1, 1.5]),
+ (TweedieDistribution(power=-1), [0.1, 1.5]),
+ (TweedieDistribution(power=1.5), [0.1, 1.5]),
+ (TweedieDistribution(power=2.5), [0.1, 1.5]),
+ (TweedieDistribution(power=-4), [0.1, 1.5])])
+def test_deviance_zero(family, chk_values):
+ """Test deviance(y,y) = 0 for different families."""
+ for x in chk_values:
+ assert_allclose(family.deviance(x, x), 0, atol=1e-9)
+
+
+@pytest.mark.parametrize(
+ 'family',
+ [NormalDistribution(),
+ PoissonDistribution(),
+ GammaDistribution(),
+ InverseGaussianDistribution(),
+ TweedieDistribution(power=-2.5),
+ TweedieDistribution(power=-1),
+ TweedieDistribution(power=1.5),
+ TweedieDistribution(power=2.5),
+ TweedieDistribution(power=-4)],
+ ids=lambda x: x.__class__.__name__
+)
+def test_deviance_derivative(family):
+ """Test deviance derivative for different families."""
+ rng = np.random.RandomState(0)
+ y_true = rng.rand(10)
+ # make data positive
+ y_true += np.abs(y_true.min()) + 1e-2
+
+ y_pred = y_true + np.fmax(rng.rand(10), 0.)
+
+ dev = family.deviance(y_true, y_pred)
+ assert isinstance(dev, float)
+ dev_derivative = family.deviance_derivative(y_true, y_pred)
+ assert dev_derivative.shape == y_pred.shape
+
+ err = check_grad(
+ lambda y_pred: family.deviance(y_true, y_pred),
+ lambda y_pred: family.deviance_derivative(y_true, y_pred),
+ y_pred,
+ ) / np.linalg.norm(dev_derivative)
+ assert abs(err) < 1e-6
diff --git a/sklearn/linear_model/__init__.py b/sklearn/linear_model/__init__.py
index 59d0600d508d0..110e0008bccc9 100644
--- a/sklearn/linear_model/__init__.py
+++ b/sklearn/linear_model/__init__.py
@@ -7,7 +7,6 @@
# complete documentation.
from ._base import LinearRegression
-
from ._bayes import BayesianRidge, ARDRegression
from ._least_angle import (Lars, LassoLars, lars_path, lars_path_gram, LarsCV,
LassoLarsCV, LassoLarsIC)
@@ -15,6 +14,8 @@
lasso_path, enet_path, MultiTaskLasso,
MultiTaskElasticNet, MultiTaskElasticNetCV,
MultiTaskLassoCV)
+from ._glm import (PoissonRegressor,
+ GammaRegressor, TweedieRegressor)
from ._huber import HuberRegressor
from ._sgd_fast import Hinge, Log, ModifiedHuber, SquaredLoss, Huber
from ._stochastic_gradient import SGDClassifier, SGDRegressor
@@ -73,4 +74,7 @@
'orthogonal_mp',
'orthogonal_mp_gram',
'ridge_regression',
- 'RANSACRegressor']
+ 'RANSACRegressor',
+ 'PoissonRegressor',
+ 'GammaRegressor',
+ 'TweedieRegressor']
diff --git a/sklearn/linear_model/_glm/__init__.py b/sklearn/linear_model/_glm/__init__.py
new file mode 100644
index 0000000000000..3b5c0d95d6124
--- /dev/null
+++ b/sklearn/linear_model/_glm/__init__.py
@@ -0,0 +1,15 @@
+# License: BSD 3 clause
+
+from .glm import (
+ GeneralizedLinearRegressor,
+ PoissonRegressor,
+ GammaRegressor,
+ TweedieRegressor
+)
+
+__all__ = [
+ "GeneralizedLinearRegressor",
+ "PoissonRegressor",
+ "GammaRegressor",
+ "TweedieRegressor"
+]
diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py
new file mode 100644
index 0000000000000..8607d6a1828ab
--- /dev/null
+++ b/sklearn/linear_model/_glm/glm.py
@@ -0,0 +1,615 @@
+"""
+Generalized Linear Models with Exponential Dispersion Family
+"""
+
+# Author: Christian Lorentzen
+# some parts and tricks stolen from other sklearn files.
+# License: BSD 3 clause
+
+import numbers
+
+import numpy as np
+import scipy.optimize
+
+from ...base import BaseEstimator, RegressorMixin
+from ...utils import check_array, check_X_y
+from ...utils.optimize import _check_optimize_result
+from ...utils.validation import check_is_fitted, _check_sample_weight
+from ..._loss.glm_distribution import (
+ ExponentialDispersionModel,
+ TweedieDistribution,
+ EDM_DISTRIBUTIONS
+)
+from .link import (
+ BaseLink,
+ IdentityLink,
+ LogLink,
+)
+
+
+def _safe_lin_pred(X, coef):
+ """Compute the linear predictor taking care if intercept is present."""
+ if coef.size == X.shape[1] + 1:
+ return X @ coef[1:] + coef[0]
+ else:
+ return X @ coef
+
+
+def _y_pred_deviance_derivative(coef, X, y, weights, family, link):
+ """Compute y_pred and the derivative of the deviance w.r.t coef."""
+ lin_pred = _safe_lin_pred(X, coef)
+ y_pred = link.inverse(lin_pred)
+ d1 = link.inverse_derivative(lin_pred)
+ temp = d1 * family.deviance_derivative(y, y_pred, weights)
+ if coef.size == X.shape[1] + 1:
+ devp = np.concatenate(([temp.sum()], temp @ X))
+ else:
+ devp = temp @ X # same as X.T @ temp
+ return y_pred, devp
+
+
+class GeneralizedLinearRegressor(BaseEstimator, RegressorMixin):
+ """Regression via a penalized Generalized Linear Model (GLM).
+
+ GLMs based on a reproductive Exponential Dispersion Model (EDM) aim at
+ fitting and predicting the mean of the target y as y_pred=h(X*w).
+ Therefore, the fit minimizes the following objective function with L2
+ priors as regularizer::
+
+ 1/(2*sum(s)) * deviance(y, h(X*w); s)
+ + 1/2 * alpha * |w|_2
+
+ with inverse link function h and s=sample_weight.
+ The parameter ``alpha`` corresponds to the lambda parameter in glmnet.
+
+ Read more in the :ref:`User Guide <Generalized_linear_regression>`.
+
+ Parameters
+ ----------
+ alpha : float, default=1
+ Constant that multiplies the penalty term and thus determines the
+ regularization strength. ``alpha = 0`` is equivalent to unpenalized
+ GLMs. In this case, the design matrix `X` must have full column rank
+ (no collinearities).
+
+ fit_intercept : bool, default=True
+ Specifies if a constant (a.k.a. bias or intercept) should be
+ added to the linear predictor (X @ coef + intercept).
+
+ family : {'normal', 'poisson', 'gamma', 'inverse-gaussian'} \
+ or an ExponentialDispersionModel instance, default='normal'
+ The distributional assumption of the GLM, i.e. which distribution from
+ the EDM is assumed; this determines the loss function to be minimized.
+
+ link : {'auto', 'identity', 'log'} or an instance of class BaseLink, \
+ default='auto'
+ The link function of the GLM, i.e. mapping from linear predictor
+ `X @ coef + intercept` to prediction `y_pred`. Option 'auto' sets
+ the link depending on the chosen family as follows:
+
+ - 'identity' for Normal distribution
+ - 'log' for Poisson, Gamma and Inverse Gaussian distributions
+
+ solver : 'lbfgs', default='lbfgs'
+ Algorithm to use in the optimization problem:
+
+ 'lbfgs'
+ Calls scipy's L-BFGS-B optimizer.
+
+ max_iter : int, default=100
+ The maximal number of iterations for the solver.
+
+ tol : float, default=1e-4
+ Stopping criterion. For the lbfgs solver,
+ the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
+ where ``g_j`` is the j-th component of the gradient (derivative) of
+ the objective function.
+
+ warm_start : bool, default=False
+ If set to ``True``, reuse the solution of the previous call to ``fit``
+ as initialization for ``coef_`` and ``intercept_``.
+
+ verbose : int, default=0
+ For the lbfgs solver set verbose to any positive number for verbosity.
+
+ Attributes
+ ----------
+ coef_ : array of shape (n_features,)
+ Estimated coefficients for the linear predictor (`X @ coef_ +
+ intercept_`) in the GLM.
+
+ intercept_ : float
+ Intercept (a.k.a. bias) added to linear predictor.
+
+ n_iter_ : int
+ Actual number of iterations used in the solver.
+ """
+ def __init__(self, *, alpha=1.0,
+ fit_intercept=True, family='normal', link='auto',
+ solver='lbfgs', max_iter=100, tol=1e-4, warm_start=False,
+ verbose=0):
+ self.alpha = alpha
+ self.fit_intercept = fit_intercept
+ self.family = family
+ self.link = link
+ self.solver = solver
+ self.max_iter = max_iter
+ self.tol = tol
+ self.warm_start = warm_start
+ self.verbose = verbose
+
+ def fit(self, X, y, sample_weight=None):
+ """Fit a Generalized Linear Model.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix} of shape (n_samples, n_features)
+ Training data.
+
+ y : array-like of shape (n_samples,)
+ Target values.
+
+ sample_weight : array-like of shape (n_samples,), default=None
+ Sample weights.
+
+ Returns
+ -------
+ self : returns an instance of self.
+ """
+ if isinstance(self.family, ExponentialDispersionModel):
+ self._family_instance = self.family
+ elif self.family in EDM_DISTRIBUTIONS:
+ self._family_instance = EDM_DISTRIBUTIONS[self.family]()
+ else:
+ raise ValueError(
+ "The family must be an instance of class"
+ " ExponentialDispersionModel or an element of"
+ " ['normal', 'poisson', 'gamma', 'inverse-gaussian']"
+ "; got (family={0})".format(self.family))
+
+ # Guarantee that self._link_instance is set to an instance of
+ # class BaseLink
+ if isinstance(self.link, BaseLink):
+ self._link_instance = self.link
+ else:
+ if self.link == 'auto':
+ if isinstance(self._family_instance, TweedieDistribution):
+ if self._family_instance.power <= 0:
+ self._link_instance = IdentityLink()
+ if self._family_instance.power >= 1:
+ self._link_instance = LogLink()
+ else:
+ raise ValueError("No default link known for the "
+ "specified distribution family. Please "
+ "set link manually, i.e. not to 'auto'; "
+ "got (link='auto', family={})"
+ .format(self.family))
+ elif self.link == 'identity':
+ self._link_instance = IdentityLink()
+ elif self.link == 'log':
+ self._link_instance = LogLink()
+ else:
+ raise ValueError(
+ "The link must be an instance of class Link or "
+ "an element of ['auto', 'identity', 'log']; "
+ "got (link={0})".format(self.link))
+
+ if not isinstance(self.alpha, numbers.Number) or self.alpha < 0:
+ raise ValueError("Penalty term must be a non-negative number;"
+ " got (alpha={0})".format(self.alpha))
+ if not isinstance(self.fit_intercept, bool):
+ raise ValueError("The argument fit_intercept must be bool;"
+ " got {0}".format(self.fit_intercept))
+ if self.solver not in ['lbfgs']:
+ raise ValueError("GeneralizedLinearRegressor supports only solvers"
+ "'lbfgs'; got {0}".format(self.solver))
+ solver = self.solver
+ if (not isinstance(self.max_iter, numbers.Integral)
+ or self.max_iter <= 0):
+ raise ValueError("Maximum number of iteration must be a positive "
+ "integer;"
+ " got (max_iter={0!r})".format(self.max_iter))
+ if not isinstance(self.tol, numbers.Number) or self.tol <= 0:
+ raise ValueError("Tolerance for stopping criteria must be "
+ "positive; got (tol={0!r})".format(self.tol))
+ if not isinstance(self.warm_start, bool):
+ raise ValueError("The argument warm_start must be bool;"
+ " got {0}".format(self.warm_start))
+
+ family = self._family_instance
+ link = self._link_instance
+
+ X, y = check_X_y(X, y, accept_sparse=['csc', 'csr'],
+ dtype=[np.float64, np.float32],
+ y_numeric=True, multi_output=False)
+
+ weights = _check_sample_weight(sample_weight, X)
+
+ _, n_features = X.shape
+
+ if not np.all(family.in_y_range(y)):
+ raise ValueError("Some value(s) of y are out of the valid "
+ "range for family {0}"
+ .format(family.__class__.__name__))
+ # TODO: if alpha=0 check that X is not rank deficient
+
+ # rescaling of sample_weight
+ #
+ # IMPORTANT NOTE: Since we want to minimize
+ # 1/(2*sum(sample_weight)) * deviance + L2,
+ # deviance = sum(sample_weight * unit_deviance),
+ # we rescale weights such that sum(weights) = 1 and this becomes
+ # 1/2*deviance + L2 with deviance=sum(weights * unit_deviance)
+ weights = weights / weights.sum()
+
+ if self.warm_start and hasattr(self, 'coef_'):
+ if self.fit_intercept:
+ coef = np.concatenate((np.array([self.intercept_]),
+ self.coef_))
+ else:
+ coef = self.coef_
+ else:
+ if self.fit_intercept:
+ coef = np.zeros(n_features+1)
+ coef[0] = link(np.average(y, weights=weights))
+ else:
+ coef = np.zeros(n_features)
+
+ # algorithms for optimization
+
+ if solver == 'lbfgs':
+ def func(coef, X, y, weights, alpha, family, link):
+ y_pred, devp = _y_pred_deviance_derivative(
+ coef, X, y, weights, family, link
+ )
+ dev = family.deviance(y, y_pred, weights)
+ # offset if coef[0] is intercept
+ offset = 1 if self.fit_intercept else 0
+ coef_scaled = alpha * coef[offset:]
+ obj = 0.5 * dev + 0.5 * (coef[offset:] @ coef_scaled)
+ objp = 0.5 * devp
+ objp[offset:] += coef_scaled
+ return obj, objp
+
+ args = (X, y, weights, self.alpha, family, link)
+
+ opt_res = scipy.optimize.minimize(
+ func, coef, method="L-BFGS-B", jac=True,
+ options={
+ "maxiter": self.max_iter,
+ "iprint": (self.verbose > 0) - 1,
+ "gtol": self.tol,
+ "ftol": 1e3*np.finfo(float).eps,
+ },
+ args=args)
+ self.n_iter_ = _check_optimize_result("lbfgs", opt_res)
+ coef = opt_res.x
+
+ if self.fit_intercept:
+ self.intercept_ = coef[0]
+ self.coef_ = coef[1:]
+ else:
+ # set intercept to zero as the other linear models do
+ self.intercept_ = 0.
+ self.coef_ = coef
+
+ return self
+
+ def _linear_predictor(self, X):
+ """Compute the linear_predictor = `X @ coef_ + intercept_`.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix} of shape (n_samples, n_features)
+ Samples.
+
+ Returns
+ -------
+ y_pred : array of shape (n_samples,)
+ Returns predicted values of linear predictor.
+ """
+ check_is_fitted(self)
+ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
+ dtype=[np.float64, np.float32], ensure_2d=True,
+ allow_nd=False)
+ return X @ self.coef_ + self.intercept_
+
+ def predict(self, X):
+ """Predict using GLM with feature matrix X.
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix} of shape (n_samples, n_features)
+ Samples.
+
+ Returns
+ -------
+ y_pred : array of shape (n_samples,)
+ Returns predicted values.
+ """
+ # check_array is done in _linear_predictor
+ eta = self._linear_predictor(X)
+ y_pred = self._link_instance.inverse(eta)
+ return y_pred
+
+ def score(self, X, y, sample_weight=None):
+ """Compute D^2, the percentage of deviance explained.
+
+ D^2 is a generalization of the coefficient of determination R^2.
+ R^2 uses the squared error while D^2 uses the deviance; note that the
+ two are equal for ``family='normal'``.
+
+ D^2 is defined as
+ :math:`D^2 = 1-\\frac{D(y_{true},y_{pred})}{D_{null}}`,
+ where :math:`D_{null}` is the null deviance, i.e. the deviance of a model
+ with intercept alone, which corresponds to :math:`y_{pred} = \\bar{y}`.
+ The mean :math:`\\bar{y}` is averaged by sample_weight.
+ Best possible score is 1.0 and it can be negative (because the model
+ can be arbitrarily worse).
+
+ Parameters
+ ----------
+ X : {array-like, sparse matrix} of shape (n_samples, n_features)
+ Test samples.
+
+ y : array-like of shape (n_samples,)
+ True values of target.
+
+ sample_weight : array-like of shape (n_samples,), default=None
+ Sample weights.
+
+ Returns
+ -------
+ score : float
+ D^2 of self.predict(X) w.r.t. y.
+ """
+ # Note, default score defined in RegressorMixin is R^2 score.
+ # TODO: make D^2 a score function in module metrics (and thereby get
+ # input validation and so on)
+ weights = _check_sample_weight(sample_weight, X)
+ y_pred = self.predict(X)
+ dev = self._family_instance.deviance(y, y_pred, weights=weights)
+ y_mean = np.average(y, weights=weights)
+ dev_null = self._family_instance.deviance(y, y_mean, weights=weights)
+ return 1 - dev / dev_null
+
+ def _more_tags(self):
+ # create the _family_instance if fit wasn't called yet.
+ if hasattr(self, '_family_instance'):
+ _family_instance = self._family_instance
+ elif isinstance(self.family, ExponentialDispersionModel):
+ _family_instance = self.family
+ elif self.family in EDM_DISTRIBUTIONS:
+ _family_instance = EDM_DISTRIBUTIONS[self.family]()
+ else:
+ raise ValueError
+ return {"requires_positive_y": not _family_instance.in_y_range(-1.0)}
+
+
+class PoissonRegressor(GeneralizedLinearRegressor):
+ """Generalized Linear Model with a Poisson distribution.
+
+ Read more in the :ref:`User Guide <Generalized_linear_regression>`.
+
+ Parameters
+ ----------
+ alpha : float, default=1
+ Constant that multiplies the penalty term and thus determines the
+ regularization strength. ``alpha = 0`` is equivalent to unpenalized
+ GLMs. In this case, the design matrix `X` must have full column rank
+ (no collinearities).
+
+ fit_intercept : bool, default=True
+ Specifies if a constant (a.k.a. bias or intercept) should be
+ added to the linear predictor (X @ coef + intercept).
+
+ max_iter : int, default=100
+ The maximal number of iterations for the solver.
+
+ tol : float, default=1e-4
+ Stopping criterion. For the lbfgs solver,
+ the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
+ where ``g_j`` is the j-th component of the gradient (derivative) of
+ the objective function.
+
+ warm_start : bool, default=False
+ If set to ``True``, reuse the solution of the previous call to ``fit``
+ as initialization for ``coef_`` and ``intercept_`` .
+
+ verbose : int, default=0
+ For the lbfgs solver set verbose to any positive number for verbosity.
+
+ Attributes
+ ----------
+ coef_ : array of shape (n_features,)
+ Estimated coefficients for the linear predictor (`X @ coef_ +
+ intercept_`) in the GLM.
+
+ intercept_ : float
+ Intercept (a.k.a. bias) added to linear predictor.
+
+ n_iter_ : int
+ Actual number of iterations used in the solver.
+ """
+ def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100,
+ tol=1e-4, warm_start=False, verbose=0):
+
+ super().__init__(alpha=alpha, fit_intercept=fit_intercept,
+ family="poisson", link='log', max_iter=max_iter,
+ tol=tol, warm_start=warm_start, verbose=verbose)
+
+ @property
+ def family(self):
+ # Make this attribute read-only to avoid misuse, e.g. in a grid search.
+ return "poisson"
+
+ @family.setter
+ def family(self, value):
+ if value != "poisson":
+ raise ValueError("PoissonRegressor.family must be 'poisson'!")
+
+
+class GammaRegressor(GeneralizedLinearRegressor):
+ """Generalized Linear Model with a Gamma distribution.
+
+ Read more in the :ref:`User Guide <Generalized_linear_regression>`.
+
+ Parameters
+ ----------
+ alpha : float, default=1
+ Constant that multiplies the penalty term and thus determines the
+ regularization strength. ``alpha = 0`` is equivalent to unpenalized
+ GLMs. In this case, the design matrix `X` must have full column rank
+ (no collinearities).
+
+ fit_intercept : bool, default=True
+ Specifies if a constant (a.k.a. bias or intercept) should be
+ added to the linear predictor (X @ coef + intercept).
+
+ max_iter : int, default=100
+ The maximal number of iterations for the solver.
+
+ tol : float, default=1e-4
+ Stopping criterion. For the lbfgs solver,
+ the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
+ where ``g_j`` is the j-th component of the gradient (derivative) of
+ the objective function.
+
+ warm_start : bool, default=False
+ If set to ``True``, reuse the solution of the previous call to ``fit``
+ as initialization for ``coef_`` and ``intercept_`` .
+
+ verbose : int, default=0
+ For the lbfgs solver set verbose to any positive number for verbosity.
+
+ Attributes
+ ----------
+ coef_ : array of shape (n_features,)
+ Estimated coefficients for the linear predictor (`X @ coef_ +
+ intercept_`) in the GLM.
+
+ intercept_ : float
+ Intercept (a.k.a. bias) added to linear predictor.
+
+ n_iter_ : int
+ Actual number of iterations used in the solver.
+ """
+ def __init__(self, *, alpha=1.0, fit_intercept=True, max_iter=100,
+ tol=1e-4, warm_start=False, verbose=0):
+
+ super().__init__(alpha=alpha, fit_intercept=fit_intercept,
+ family="gamma", link='log', max_iter=max_iter,
+ tol=tol, warm_start=warm_start, verbose=verbose)
+
+ @property
+ def family(self):
+ # Make this attribute read-only to avoid misuse, e.g. in a grid search.
+ return "gamma"
+
+ @family.setter
+ def family(self, value):
+ if value != "gamma":
+ raise ValueError("GammaRegressor.family must be 'gamma'!")
+
+
+class TweedieRegressor(GeneralizedLinearRegressor):
+ """Generalized Linear Model with a Tweedie distribution.
+
+ This estimator can be used to model different GLMs depending on the
+ ``power`` parameter, which determines the underlying distribution.
+
+ Read more in the :ref:`User Guide <Generalized_linear_regression>`.
+
+ Parameters
+ ----------
+ power : float, default=0
+ The power determines the underlying target distribution according
+ to the following table:
+
+ +-------+------------------------+
+ | Power | Distribution |
+ +=======+========================+
+ | 0 | Normal |
+ +-------+------------------------+
+ | 1 | Poisson |
+ +-------+------------------------+
+ | (1,2) | Compound Poisson Gamma |
+ +-------+------------------------+
+ | 2 | Gamma |
+ +-------+------------------------+
+ | 3 | Inverse Gaussian |
+ +-------+------------------------+
+
+ For ``0 < power < 1``, no distribution exists.
+
+ alpha : float, default=1
+ Constant that multiplies the penalty term and thus determines the
+ regularization strength. ``alpha = 0`` is equivalent to unpenalized
+ GLMs. In this case, the design matrix `X` must have full column rank
+ (no collinearities).
+
+ link : {'auto', 'identity', 'log'}, default='auto'
+ The link function of the GLM, i.e. mapping from linear predictor
+ `X @ coef + intercept` to prediction `y_pred`. Option 'auto' sets
+ the link depending on the chosen family as follows:
+
+ - 'identity' for Normal distribution
+ - 'log' for Poisson, Gamma and Inverse Gaussian distributions
+
+ fit_intercept : bool, default=True
+ Specifies if a constant (a.k.a. bias or intercept) should be
+ added to the linear predictor (X @ coef + intercept).
+
+ max_iter : int, default=100
+ The maximal number of iterations for the solver.
+
+ tol : float, default=1e-4
+ Stopping criterion. For the lbfgs solver,
+ the iteration will stop when ``max{|g_j|, j = 1, ..., d} <= tol``
+ where ``g_j`` is the j-th component of the gradient (derivative) of
+ the objective function.
+
+ warm_start : bool, default=False
+ If set to ``True``, reuse the solution of the previous call to ``fit``
+ as initialization for ``coef_`` and ``intercept_`` .
+
+ verbose : int, default=0
+ For the lbfgs solver set verbose to any positive number for verbosity.
+
+ Attributes
+ ----------
+ coef_ : array of shape (n_features,)
+ Estimated coefficients for the linear predictor (`X @ coef_ +
+ intercept_`) in the GLM.
+
+ intercept_ : float
+ Intercept (a.k.a. bias) added to linear predictor.
+
+ n_iter_ : int
+ Actual number of iterations used in the solver.
+ """
+ def __init__(self, *, power=0.0, alpha=1.0, fit_intercept=True,
+ link='auto', max_iter=100, tol=1e-4,
+ warm_start=False, verbose=0):
+
+ super().__init__(alpha=alpha, fit_intercept=fit_intercept,
+ family=TweedieDistribution(power=power), link=link,
+ max_iter=max_iter, tol=tol,
+ warm_start=warm_start, verbose=verbose)
+
+ @property
+ def family(self):
+ # We use a property with a setter to make sure that the family is
+ # always a Tweedie distribution, and that self.power and
+ # self.family.power are identical by construction.
+ dist = TweedieDistribution(power=self.power)
+ # TODO: make the returned object immutable
+ return dist
+
+ @family.setter
+ def family(self, value):
+ if isinstance(value, TweedieDistribution):
+ self.power = value.power
+ else:
+ raise TypeError("TweedieRegressor.family must be of type "
+ "TweedieDistribution!")
diff --git a/sklearn/linear_model/_glm/link.py b/sklearn/linear_model/_glm/link.py
new file mode 100644
index 0000000000000..878d8e835bc42
--- /dev/null
+++ b/sklearn/linear_model/_glm/link.py
@@ -0,0 +1,110 @@
+"""
+Link functions used in GLM
+"""
+
+# Author: Christian Lorentzen
+# License: BSD 3 clause
+
+from abc import ABCMeta, abstractmethod
+
+import numpy as np
+from scipy.special import expit, logit
+
+
+class BaseLink(metaclass=ABCMeta):
+ """Abstract base class for Link functions."""
+
+ @abstractmethod
+ def __call__(self, y_pred):
+ """Compute the link function g(y_pred).
+
+ The link function links the mean y_pred=E[Y] to the so called linear
+ predictor (X*w), i.e. g(y_pred) = linear predictor.
+
+ Parameters
+ ----------
+ y_pred : array of shape (n_samples,)
+ Usually the (predicted) mean.
+ """
+
+ @abstractmethod
+ def derivative(self, y_pred):
+ """Compute the derivative of the link g'(y_pred).
+
+ Parameters
+ ----------
+ y_pred : array of shape (n_samples,)
+ Usually the (predicted) mean.
+ """
+
+ @abstractmethod
+ def inverse(self, lin_pred):
+ """Compute the inverse link function h(lin_pred).
+
+ Gives the inverse relationship between linear predictor and the mean
+ y_pred=E[Y], i.e. h(linear predictor) = y_pred.
+
+ Parameters
+ ----------
+ lin_pred : array of shape (n_samples,)
+ Usually the (fitted) linear predictor.
+ """
+
+ @abstractmethod
+ def inverse_derivative(self, lin_pred):
+ """Compute the derivative of the inverse link function h'(lin_pred).
+
+ Parameters
+ ----------
+ lin_pred : array of shape (n_samples,)
+ Usually the (fitted) linear predictor.
+ """
+
+
+class IdentityLink(BaseLink):
+ """The identity link function g(x)=x."""
+
+ def __call__(self, y_pred):
+ return y_pred
+
+ def derivative(self, y_pred):
+ return np.ones_like(y_pred)
+
+ def inverse(self, lin_pred):
+ return lin_pred
+
+ def inverse_derivative(self, lin_pred):
+ return np.ones_like(lin_pred)
+
+
+class LogLink(BaseLink):
+ """The log link function g(x)=log(x)."""
+
+ def __call__(self, y_pred):
+ return np.log(y_pred)
+
+ def derivative(self, y_pred):
+ return 1 / y_pred
+
+ def inverse(self, lin_pred):
+ return np.exp(lin_pred)
+
+ def inverse_derivative(self, lin_pred):
+ return np.exp(lin_pred)
+
+
+class LogitLink(BaseLink):
+ """The logit link function g(x)=logit(x)."""
+
+ def __call__(self, y_pred):
+ return logit(y_pred)
+
+ def derivative(self, y_pred):
+ return 1 / (y_pred * (1 - y_pred))
+
+ def inverse(self, lin_pred):
+ return expit(lin_pred)
+
+ def inverse_derivative(self, lin_pred):
+ ep = expit(lin_pred)
+ return ep * (1 - ep)
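A quick sketch, not part of the patch, of the invariants these link classes are expected to satisfy, namely g(h(x)) = x and g'(h(x)) * h'(x) = 1:

    import numpy as np
    from sklearn.linear_model._glm.link import IdentityLink, LogLink, LogitLink

    lin_pred = np.linspace(-3, 3, 7)
    for link in (IdentityLink(), LogLink(), LogitLink()):
        y_pred = link.inverse(lin_pred)             # h(x)
        assert np.allclose(link(y_pred), lin_pred)  # g(h(x)) = x
        # chain rule: g'(h(x)) * h'(x) = 1
        assert np.allclose(link.derivative(y_pred)
                           * link.inverse_derivative(lin_pred), 1)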
diff --git a/sklearn/linear_model/_glm/tests/__init__.py b/sklearn/linear_model/_glm/tests/__init__.py
new file mode 100644
index 0000000000000..588cf7e93eef0
--- /dev/null
+++ b/sklearn/linear_model/_glm/tests/__init__.py
@@ -0,0 +1 @@
+# License: BSD 3 clause
diff --git a/sklearn/linear_model/_glm/tests/test_glm.py b/sklearn/linear_model/_glm/tests/test_glm.py
new file mode 100644
index 0000000000000..ece8f09c76acd
--- /dev/null
+++ b/sklearn/linear_model/_glm/tests/test_glm.py
@@ -0,0 +1,431 @@
+# Authors: Christian Lorentzen
+#
+# License: BSD 3 clause
+
+import numpy as np
+from numpy.testing import assert_allclose
+import pytest
+import warnings
+
+from sklearn.datasets import make_regression
+from sklearn.linear_model._glm import GeneralizedLinearRegressor
+from sklearn.linear_model import (
+ TweedieRegressor,
+ PoissonRegressor,
+ GammaRegressor
+)
+from sklearn.linear_model._glm.link import (
+ IdentityLink,
+ LogLink,
+)
+from sklearn._loss.glm_distribution import (
+ TweedieDistribution,
+ NormalDistribution, PoissonDistribution,
+ GammaDistribution, InverseGaussianDistribution,
+)
+from sklearn.linear_model import Ridge
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.model_selection import train_test_split
+
+
+@pytest.fixture(scope="module")
+def regression_data():
+ X, y = make_regression(n_samples=107,
+ n_features=10,
+ n_informative=80, noise=0.5,
+ random_state=2)
+ return X, y
+
+
+def test_sample_weights_validation():
+ """Test the raised errors in the validation of sample_weight."""
+ # scalar value but not positive
+ X = [[1]]
+ y = [1]
+ weights = 0
+ glm = GeneralizedLinearRegressor()
+
+ # Positive weights are accepted
+ glm.fit(X, y, sample_weight=1)
+
+ # 2d array
+ weights = [[0]]
+ with pytest.raises(ValueError, match="must be 1D array or scalar"):
+ glm.fit(X, y, weights)
+
+ # 1d but wrong length
+ weights = [1, 0]
+ msg = r"sample_weight.shape == \(2,\), expected \(1,\)!"
+ with pytest.raises(ValueError, match=msg):
+ glm.fit(X, y, weights)
+
+
+@pytest.mark.parametrize('name, instance',
+ [('normal', NormalDistribution()),
+ ('poisson', PoissonDistribution()),
+ ('gamma', GammaDistribution()),
+ ('inverse-gaussian', InverseGaussianDistribution())])
+def test_glm_family_argument(name, instance):
+ """Test GLM family argument set as string."""
+ y = np.array([0.1, 0.5]) # in range of all distributions
+ X = np.array([[1], [2]])
+ glm = GeneralizedLinearRegressor(family=name, alpha=0).fit(X, y)
+ assert isinstance(glm._family_instance, instance.__class__)
+
+ glm = GeneralizedLinearRegressor(family='not a family')
+ with pytest.raises(ValueError, match="family must be"):
+ glm.fit(X, y)
+
+
+@pytest.mark.parametrize('name, instance',
+ [('identity', IdentityLink()),
+ ('log', LogLink())])
+def test_glm_link_argument(name, instance):
+ """Test GLM link argument set as string."""
+ y = np.array([0.1, 0.5]) # in range of all distributions
+ X = np.array([[1], [2]])
+ glm = GeneralizedLinearRegressor(family='normal', link=name).fit(X, y)
+ assert isinstance(glm._link_instance, instance.__class__)
+
+ glm = GeneralizedLinearRegressor(family='normal', link='not a link')
+ with pytest.raises(ValueError, match="link must be"):
+ glm.fit(X, y)
+
+
+@pytest.mark.parametrize('family, expected_link_class', [
+ ('normal', IdentityLink),
+ ('poisson', LogLink),
+ ('gamma', LogLink),
+ ('inverse-gaussian', LogLink),
+])
+def test_glm_link_auto(family, expected_link_class):
+ # Make sure link='auto' delivers the expected link function
+ y = np.array([0.1, 0.5]) # in range of all distributions
+ X = np.array([[1], [2]])
+ glm = GeneralizedLinearRegressor(family=family, link='auto').fit(X, y)
+ assert isinstance(glm._link_instance, expected_link_class)
+
+
+@pytest.mark.parametrize('alpha', ['not a number', -4.2])
+def test_glm_alpha_argument(alpha):
+ """Test GLM for invalid alpha argument."""
+ y = np.array([1, 2])
+ X = np.array([[1], [2]])
+ glm = GeneralizedLinearRegressor(family='normal', alpha=alpha)
+ with pytest.raises(ValueError,
+ match="Penalty term must be a non-negative"):
+ glm.fit(X, y)
+
+
+@pytest.mark.parametrize('fit_intercept', ['not bool', 1, 0, [True]])
+def test_glm_fit_intercept_argument(fit_intercept):
+ """Test GLM for invalid fit_intercept argument."""
+ y = np.array([1, 2])
+ X = np.array([[1], [1]])
+ glm = GeneralizedLinearRegressor(fit_intercept=fit_intercept)
+ with pytest.raises(ValueError, match="fit_intercept must be bool"):
+ glm.fit(X, y)
+
+
+@pytest.mark.parametrize('solver',
+ ['not a solver', 1, [1]])
+def test_glm_solver_argument(solver):
+ """Test GLM for invalid solver argument."""
+ y = np.array([1, 2])
+ X = np.array([[1], [2]])
+ glm = GeneralizedLinearRegressor(solver=solver)
+ with pytest.raises(ValueError):
+ glm.fit(X, y)
+
+
+@pytest.mark.parametrize('max_iter', ['not a number', 0, -1, 5.5, [1]])
+def test_glm_max_iter_argument(max_iter):
+ """Test GLM for invalid max_iter argument."""
+ y = np.array([1, 2])
+ X = np.array([[1], [2]])
+ glm = GeneralizedLinearRegressor(max_iter=max_iter)
+ with pytest.raises(ValueError, match="must be a positive integer"):
+ glm.fit(X, y)
+
+
+@pytest.mark.parametrize('tol', ['not a number', 0, -1.0, [1e-3]])
+def test_glm_tol_argument(tol):
+ """Test GLM for invalid tol argument."""
+ y = np.array([1, 2])
+ X = np.array([[1], [2]])
+ glm = GeneralizedLinearRegressor(tol=tol)
+ with pytest.raises(ValueError, match="stopping criteria must be positive"):
+ glm.fit(X, y)
+
+
+@pytest.mark.parametrize('warm_start', ['not bool', 1, 0, [True]])
+def test_glm_warm_start_argument(warm_start):
+ """Test GLM for invalid warm_start argument."""
+ y = np.array([1, 2])
+ X = np.array([[1], [1]])
+ glm = GeneralizedLinearRegressor(warm_start=warm_start)
+ with pytest.raises(ValueError, match="warm_start must be bool"):
+ glm.fit(X, y)
+
+
+@pytest.mark.parametrize('fit_intercept', [False, True])
+def test_glm_identity_regression(fit_intercept):
+ """Test GLM regression with identity link on a simple dataset."""
+ coef = [1., 2.]
+ X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T
+ y = np.dot(X, coef)
+ glm = GeneralizedLinearRegressor(alpha=0, family='normal', link='identity',
+ fit_intercept=fit_intercept, tol=1e-12)
+ if fit_intercept:
+ glm.fit(X[:, 1:], y)
+ assert_allclose(glm.coef_, coef[1:], rtol=1e-10)
+ assert_allclose(glm.intercept_, coef[0], rtol=1e-10)
+ else:
+ glm.fit(X, y)
+ assert_allclose(glm.coef_, coef, rtol=1e-12)
+
+
+@pytest.mark.parametrize('fit_intercept', [False, True])
+@pytest.mark.parametrize('alpha', [0.0, 1.0])
+@pytest.mark.parametrize('family', ['normal', 'poisson', 'gamma'])
+def test_glm_sample_weight_consistency(fit_intercept, alpha, family):
+ """Test that the impact of sample_weight is consistent"""
+ rng = np.random.RandomState(0)
+ n_samples, n_features = 10, 5
+
+ X = rng.rand(n_samples, n_features)
+ y = rng.rand(n_samples)
+ glm_params = dict(alpha=alpha, family=family, link='auto',
+ fit_intercept=fit_intercept)
+
+ glm = GeneralizedLinearRegressor(**glm_params).fit(X, y)
+ coef = glm.coef_.copy()
+
+ # sample_weight=np.ones(..) should be equivalent to sample_weight=None
+ sample_weight = np.ones(y.shape)
+ glm.fit(X, y, sample_weight=sample_weight)
+ assert_allclose(glm.coef_, coef, rtol=1e-12)
+
+ # sample weights are normalized to sum to 1, so scaling them has no effect
+ sample_weight = 2*np.ones(y.shape)
+ glm.fit(X, y, sample_weight=sample_weight)
+ assert_allclose(glm.coef_, coef, rtol=1e-12)
+
+ # setting one element of sample_weight to 0 is equivalent to removing
+ # the corresponding sample
+ sample_weight = np.ones(y.shape)
+ sample_weight[-1] = 0
+ glm.fit(X, y, sample_weight=sample_weight)
+ coef1 = glm.coef_.copy()
+ glm.fit(X[:-1], y[:-1])
+ assert_allclose(glm.coef_, coef1, rtol=1e-12)
+
+ # check that multiplying sample_weight by 2 is equivalent
+ # to repeating the corresponding samples twice
+ X2 = np.concatenate([X, X[:n_samples//2]], axis=0)
+ y2 = np.concatenate([y, y[:n_samples//2]])
+ sample_weight_1 = np.ones(len(y))
+ sample_weight_1[:n_samples//2] = 2
+
+ glm1 = GeneralizedLinearRegressor(**glm_params).fit(
+ X, y, sample_weight=sample_weight_1
+ )
+
+ glm2 = GeneralizedLinearRegressor(**glm_params).fit(
+ X2, y2, sample_weight=None
+ )
+ assert_allclose(glm1.coef_, glm2.coef_)
+
+
+@pytest.mark.parametrize('fit_intercept', [True, False])
+@pytest.mark.parametrize(
+ 'family',
+ [NormalDistribution(), PoissonDistribution(),
+ GammaDistribution(), InverseGaussianDistribution(),
+ TweedieDistribution(power=1.5), TweedieDistribution(power=4.5)])
+def test_glm_log_regression(fit_intercept, family):
+ """Test GLM regression with log link on a simple dataset."""
+ coef = [0.2, -0.1]
+ X = np.array([[1, 1, 1, 1, 1], [0, 1, 2, 3, 4]]).T
+ y = np.exp(np.dot(X, coef))
+ glm = GeneralizedLinearRegressor(
+ alpha=0, family=family, link='log',
+ fit_intercept=fit_intercept, tol=1e-7)
+ if fit_intercept:
+ res = glm.fit(X[:, 1:], y)
+ assert_allclose(res.coef_, coef[1:], rtol=1e-6)
+ assert_allclose(res.intercept_, coef[0], rtol=1e-6)
+ else:
+ res = glm.fit(X, y)
+ assert_allclose(res.coef_, coef, rtol=2e-6)
+
+
+@pytest.mark.parametrize('fit_intercept', [True, False])
+def test_warm_start(fit_intercept):
+ n_samples, n_features = 110, 10
+ X, y = make_regression(n_samples=n_samples, n_features=n_features,
+ n_informative=n_features-2, noise=0.5,
+ random_state=42)
+
+ glm1 = GeneralizedLinearRegressor(
+ warm_start=False,
+ fit_intercept=fit_intercept,
+ max_iter=1000
+ )
+ glm1.fit(X, y)
+
+ glm2 = GeneralizedLinearRegressor(
+ warm_start=True,
+ fit_intercept=fit_intercept,
+ max_iter=1
+ )
+ # As we intentionally set max_iter=1, L-BFGS-B will issue a
+ # ConvergenceWarning which we here simply ignore.
+ with warnings.catch_warnings():
+ warnings.filterwarnings('ignore', category=ConvergenceWarning)
+ glm2.fit(X, y)
+ assert glm1.score(X, y) > glm2.score(X, y)
+ glm2.set_params(max_iter=1000)
+ glm2.fit(X, y)
+ # The two models are not exactly identical since the lbfgs solver
+ # computes the approximate hessian from previous iterations, which
+ # will not be strictly identical in the case of a warm start.
+ assert_allclose(glm1.coef_, glm2.coef_, rtol=1e-5)
+ assert_allclose(glm1.score(X, y), glm2.score(X, y), rtol=1e-4)
+
+
+@pytest.mark.parametrize('n_samples, n_features', [(100, 10), (10, 100)])
+@pytest.mark.parametrize('fit_intercept', [True, False])
+@pytest.mark.parametrize('sample_weight', [None, True])
+def test_normal_ridge_comparison(n_samples, n_features, fit_intercept,
+ sample_weight, request):
+ """Compare with Ridge regression for Normal distributions."""
+ test_size = 10
+ X, y = make_regression(n_samples=n_samples + test_size,
+ n_features=n_features,
+ n_informative=n_features-2, noise=0.5,
+ random_state=42)
+
+ if n_samples > n_features:
+ ridge_params = {"solver": "svd"}
+ else:
+ ridge_params = {"solver": "saga", "max_iter": 1000000, "tol": 1e-7}
+
+ X_train, X_test, y_train, y_test = train_test_split(
+ X, y, test_size=test_size, random_state=0
+ )
+
+ alpha = 1.0
+ if sample_weight is None:
+ sw_train = None
+ alpha_ridge = alpha * n_samples
+ else:
+ sw_train = np.random.RandomState(0).rand(len(y_train))
+ alpha_ridge = alpha * sw_train.sum()
+
+ # GLM has 1/(2*n) * Loss + 1/2*L2, Ridge has Loss + L2
+ ridge = Ridge(alpha=alpha_ridge, normalize=False,
+ random_state=42, fit_intercept=fit_intercept,
+ **ridge_params)
+ ridge.fit(X_train, y_train, sample_weight=sw_train)
+
+ glm = GeneralizedLinearRegressor(alpha=alpha, family='normal',
+ link='identity',
+ fit_intercept=fit_intercept,
+ max_iter=300,
+ tol=1e-5)
+ glm.fit(X_train, y_train, sample_weight=sw_train)
+ assert glm.coef_.shape == (X.shape[1], )
+ assert_allclose(glm.coef_, ridge.coef_, atol=5e-5)
+ assert_allclose(glm.intercept_, ridge.intercept_, rtol=1e-5)
+ assert_allclose(glm.predict(X_train), ridge.predict(X_train), rtol=2e-4)
+ assert_allclose(glm.predict(X_test), ridge.predict(X_test), rtol=2e-4)
+
+
+def test_poisson_glmnet():
+ """Compare Poisson regression with L2 regularization and LogLink to glmnet
+ """
+ # library("glmnet")
+ # options(digits=10)
+ # df <- data.frame(a=c(-2,-1,1,2), b=c(0,0,1,1), y=c(0,1,1,2))
+ # x <- data.matrix(df[,c("a", "b")])
+ # y <- df$y
+ # fit <- glmnet(x=x, y=y, alpha=0, intercept=T, family="poisson",
+ # standardize=F, thresh=1e-10, nlambda=10000)
+ # coef(fit, s=1)
+ # (Intercept) -0.12889386979
+ # a 0.29019207995
+ # b 0.03741173122
+ X = np.array([[-2, -1, 1, 2], [0, 0, 1, 1]]).T
+ y = np.array([0, 1, 1, 2])
+ glm = GeneralizedLinearRegressor(alpha=1,
+ fit_intercept=True, family='poisson',
+ link='log', tol=1e-7,
+ max_iter=300)
+ glm.fit(X, y)
+ assert_allclose(glm.intercept_, -0.12889386979, rtol=1e-5)
+ assert_allclose(glm.coef_, [0.29019207995, 0.03741173122], rtol=1e-5)
+
+
+def test_convergence_warning(regression_data):
+ X, y = regression_data
+
+ est = GeneralizedLinearRegressor(max_iter=1, tol=1e-20)
+ with pytest.warns(ConvergenceWarning):
+ est.fit(X, y)
+
+
+def test_poisson_regression_family(regression_data):
+ # Make sure the family attribute is read-only to prevent searching over it
+ # e.g. in a grid search
+ est = PoissonRegressor()
+ assert est.family == "poisson"
+
+ msg = "PoissonRegressor.family must be 'poisson'!"
+ with pytest.raises(ValueError, match=msg):
+ est.family = 0
+
+
+def test_gamma_regression_family(regression_data):
+ # Make sure the family attribute is read-only to prevent searching over it
+ # e.g. in a grid search
+ est = GammaRegressor()
+ assert est.family == "gamma"
+
+ msg = "GammaRegressor.family must be 'gamma'!"
+ with pytest.raises(ValueError, match=msg):
+ est.family = 0
+
+
+def test_tweedie_regression_family(regression_data):
+ # Make sure the family attribute is always a TweedieDistribution and that
+ # the power attribute is properly updated
+ power = 2.0
+ est = TweedieRegressor(power=power)
+ assert isinstance(est.family, TweedieDistribution)
+ assert est.family.power == power
+ assert est.power == power
+
+ new_power = 0
+ new_family = TweedieDistribution(power=new_power)
+ est.family = new_family
+ assert isinstance(est.family, TweedieDistribution)
+ assert est.family.power == new_power
+ assert est.power == new_power
+
+ msg = "TweedieRegressor.family must be of type TweedieDistribution!"
+ with pytest.raises(TypeError, match=msg):
+ est.family = None
+
+
+@pytest.mark.parametrize(
+ 'estimator, value',
+ [
+ (PoissonRegressor(), True),
+ (GammaRegressor(), True),
+ (TweedieRegressor(power=1.5), True),
+ (TweedieRegressor(power=0), False)
+ ],
+)
+def test_tags(estimator, value):
+ assert estimator._get_tags()['requires_positive_y'] is value
diff --git a/sklearn/linear_model/_glm/tests/test_link.py b/sklearn/linear_model/_glm/tests/test_link.py
new file mode 100644
index 0000000000000..27ec4ed19bdc2
--- /dev/null
+++ b/sklearn/linear_model/_glm/tests/test_link.py
@@ -0,0 +1,45 @@
+# Authors: Christian Lorentzen
+#
+# License: BSD 3 clause
+import numpy as np
+from numpy.testing import assert_allclose
+import pytest
+from scipy.optimize import check_grad
+
+from sklearn.linear_model._glm.link import (
+ IdentityLink,
+ LogLink,
+ LogitLink,
+)
+
+
+LINK_FUNCTIONS = [IdentityLink, LogLink, LogitLink]
+
+
+@pytest.mark.parametrize('Link', LINK_FUNCTIONS)
+def test_link_properties(Link):
+ """Test link inverse and derivative."""
+ rng = np.random.RandomState(42)
+ x = rng.rand(100) * 100
+ link = Link()
+ if isinstance(link, LogitLink):
+ # careful for large x, note expit(36) = 1
+ # limit max eta to 15
+ x = x / 100 * 15
+ assert_allclose(link(link.inverse(x)), x)
+ # if g(h(x)) = x, then g'(h(x)) = 1/h'(x)
+ # g = link, h = link.inverse
+ assert_allclose(link.derivative(link.inverse(x)),
+ 1 / link.inverse_derivative(x))
+
+
+@pytest.mark.parametrize('Link', LINK_FUNCTIONS)
+def test_link_derivative(Link):
+ link = Link()
+ x = np.random.RandomState(0).rand(1)
+ err = check_grad(link, link.derivative, x) / link.derivative(x)
+ assert abs(err) < 1e-6
+
+ err = (check_grad(link.inverse, link.inverse_derivative, x)
+ / link.derivative(x))
+ assert abs(err) < 1e-6
diff --git a/sklearn/linear_model/setup.py b/sklearn/linear_model/setup.py
index 121b449d673d0..d0c9e8c04c16d 100644
--- a/sklearn/linear_model/setup.py
+++ b/sklearn/linear_model/setup.py
@@ -33,6 +33,8 @@ def configuration(parent_package='', top_path=None):
# add other directories
config.add_subpackage('tests')
+ config.add_subpackage('_glm')
+ config.add_subpackage('_glm/tests')
return config
diff --git a/sklearn/metrics/_regression.py b/sklearn/metrics/_regression.py
index 61120c578094e..6026a5293806a 100644
--- a/sklearn/metrics/_regression.py
+++ b/sklearn/metrics/_regression.py
@@ -22,11 +22,10 @@
# Christian Lorentzen
# License: BSD 3 clause
-
import numpy as np
-from scipy.special import xlogy
import warnings
+from .._loss.glm_distribution import TweedieDistribution
from ..utils.validation import (check_array, check_consistent_length,
_num_samples)
from ..utils.validation import column_or_1d
@@ -669,7 +668,7 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0):
y_pred : array-like of shape (n_samples,)
Estimated target values.
- sample_weight : array-like, shape (n_samples,), optional
+ sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
power : float, default=0
@@ -714,47 +713,8 @@ def mean_tweedie_deviance(y_true, y_pred, sample_weight=None, power=0):
sample_weight = column_or_1d(sample_weight)
sample_weight = sample_weight[:, np.newaxis]
- message = ("Mean Tweedie deviance error with power={} can only be used on "
- .format(power))
- if power < 0:
- # 'Extreme stable', y_true any real number, y_pred > 0
- if (y_pred <= 0).any():
- raise ValueError(message + "strictly positive y_pred.")
- dev = 2 * (np.power(np.maximum(y_true, 0), 2 - power)
- / ((1 - power) * (2 - power))
- - y_true * np.power(y_pred, 1 - power)/(1 - power)
- + np.power(y_pred, 2 - power)/(2 - power))
- elif power == 0:
- # Normal distribution, y_true and y_pred any real number
- dev = (y_true - y_pred)**2
- elif power < 1:
- raise ValueError("Tweedie deviance is only defined for power<=0 and "
- "power>=1.")
- elif power == 1:
- # Poisson distribution, y_true >= 0, y_pred > 0
- if (y_true < 0).any() or (y_pred <= 0).any():
- raise ValueError(message + "non-negative y_true and strictly "
- "positive y_pred.")
- dev = 2 * (xlogy(y_true, y_true/y_pred) - y_true + y_pred)
- elif power == 2:
- # Gamma distribution, y_true and y_pred > 0
- if (y_true <= 0).any() or (y_pred <= 0).any():
- raise ValueError(message + "strictly positive y_true and y_pred.")
- dev = 2 * (np.log(y_pred/y_true) + y_true/y_pred - 1)
- else:
- if power < 2:
- # 1 < p < 2 is Compound Poisson, y_true >= 0, y_pred > 0
- if (y_true < 0).any() or (y_pred <= 0).any():
- raise ValueError(message + "non-negative y_true and strictly "
- "positive y_pred.")
- else:
- if (y_true <= 0).any() or (y_pred <= 0).any():
- raise ValueError(message + "strictly positive y_true and "
- "y_pred.")
-
- dev = 2 * (np.power(y_true, 2 - power)/((1 - power) * (2 - power))
- - y_true * np.power(y_pred, 1 - power)/(1 - power)
- + np.power(y_pred, 2 - power)/(2 - power))
+ dist = TweedieDistribution(power=power)
+ dev = dist.unit_deviance(y_true, y_pred, check_input=True)
return np.average(dev, weights=sample_weight)
@@ -763,7 +723,7 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None):
"""Mean Poisson deviance regression loss.
Poisson deviance is equivalent to the Tweedie deviance with
- the power parameter `p=1`.
+ the power parameter `power=1`.
Read more in the :ref:`User Guide `.
@@ -775,7 +735,7 @@ def mean_poisson_deviance(y_true, y_pred, sample_weight=None):
y_pred : array-like of shape (n_samples,)
Estimated target values. Requires y_pred > 0.
- sample_weight : array-like, shape (n_samples,), optional
+ sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
Returns
@@ -800,7 +760,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None):
"""Mean Gamma deviance regression loss.
Gamma deviance is equivalent to the Tweedie deviance with
- the power parameter `p=2`. It is invariant to scaling of
+ the power parameter `power=2`. It is invariant to scaling of
the target variable, and measures relative errors.
Read more in the :ref:`User Guide `.
@@ -813,7 +773,7 @@ def mean_gamma_deviance(y_true, y_pred, sample_weight=None):
y_pred : array-like of shape (n_samples,)
Estimated target values. Requires y_pred > 0.
- sample_weight : array-like, shape (n_samples,), optional
+ sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
Returns
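Two sanity checks on the refactored metric, grounded in the docstrings above: with `power=0` the Tweedie deviance equals the squared error, and with `power=2` it is invariant to rescaling the target. A sketch, not part of the patch:

    import numpy as np
    from sklearn.metrics import mean_tweedie_deviance, mean_squared_error

    y_true = np.array([1.0, 2.0, 3.0])
    y_pred = np.array([1.5, 1.5, 3.5])

    # power=0: Normal case, identical to the mean squared error
    assert np.isclose(mean_tweedie_deviance(y_true, y_pred, power=0),
                      mean_squared_error(y_true, y_pred))

    # power=2: Gamma case, invariant to rescaling y_true and y_pred
    d2 = mean_tweedie_deviance(y_true, y_pred, power=2)
    assert np.isclose(mean_tweedie_deviance(10 * y_true, 10 * y_pred, power=2), d2)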
diff --git a/sklearn/metrics/tests/test_regression.py b/sklearn/metrics/tests/test_regression.py
index b1bf018c7eba7..06c44b2b6f59e 100644
--- a/sklearn/metrics/tests/test_regression.py
+++ b/sklearn/metrics/tests/test_regression.py
@@ -126,27 +126,27 @@ def test_regression_metrics_at_limits():
mean_tweedie_deviance([0.], [0.], power=power)
assert_almost_equal(mean_tweedie_deviance([0.], [0.], power=0), 0.00, 2)
- msg = "only be used on non-negative y_true and strictly positive y_pred."
+ msg = "only be used on non-negative y and strictly positive y_pred."
with pytest.raises(ValueError, match=msg):
mean_tweedie_deviance([0.], [0.], power=1.0)
power = 1.5
assert_allclose(mean_tweedie_deviance([0.], [1.], power=power),
2 / (2 - power))
- msg = "only be used on non-negative y_true and strictly positive y_pred."
+ msg = "only be used on non-negative y and strictly positive y_pred."
with pytest.raises(ValueError, match=msg):
mean_tweedie_deviance([0.], [0.], power=power)
power = 2.
assert_allclose(mean_tweedie_deviance([1.], [1.], power=power), 0.00,
atol=1e-8)
- msg = "can only be used on strictly positive y_true and y_pred."
+ msg = "can only be used on strictly positive y and y_pred."
with pytest.raises(ValueError, match=msg):
mean_tweedie_deviance([0.], [0.], power=power)
power = 3.
assert_allclose(mean_tweedie_deviance([1.], [1.], power=power),
0.00, atol=1e-8)
- msg = "can only be used on strictly positive y_true and y_pred."
+ msg = "can only be used on strictly positive y and y_pred."
with pytest.raises(ValueError, match=msg):
mean_tweedie_deviance([0.], [0.], power=power)
diff --git a/sklearn/setup.py b/sklearn/setup.py
index cc257c30e6f43..e759cdabc88ee 100644
--- a/sklearn/setup.py
+++ b/sklearn/setup.py
@@ -53,6 +53,8 @@ def configuration(parent_package='', top_path=None):
config.add_subpackage('experimental/tests')
config.add_subpackage('ensemble/_hist_gradient_boosting')
config.add_subpackage('ensemble/_hist_gradient_boosting/tests')
+ config.add_subpackage('_loss/')
+ config.add_subpackage('_loss/tests')
# submodules which have their own setup.py
config.add_subpackage('cluster')