diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index c98b0e14493c6..142f7e50f1903 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -30,6 +30,7 @@
 from .classification import recall_score
 from .classification import zero_one_loss
 from .classification import brier_score_loss
+from .classification import calibration_loss
 
 from . import cluster
 from .cluster import adjusted_mutual_info_score
@@ -120,4 +121,5 @@
     'v_measure_score',
     'zero_one_loss',
     'brier_score_loss',
+    'calibration_loss',
 ]
diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py
index e0055e3476f04..1b6b4455b93cb 100644
--- a/sklearn/metrics/classification.py
+++ b/sklearn/metrics/classification.py
@@ -1993,3 +1993,65 @@ def brier_score_loss(y_true, y_prob, sample_weight=None, pos_label=None):
     y_true = np.array(y_true == pos_label, int)
     y_true = _check_binary_probabilistic_predictions(y_true, y_prob)
     return np.average((y_true - y_prob) ** 2, weights=sample_weight)
+
+
+def calibration_loss(y_true, y_prob, bin_size=2):
+    """Compute the calibration loss over overlapping bins.
+
+    The calibration loss measures how well predicted probabilities match
+    the observed outcomes. This is the CAL measure, based on overlapping
+    binning (Caruana and Niculescu-Mizil, 2004).
+
+    Parameters
+    ----------
+    y_true : array, shape (n_samples,)
+        True targets.
+
+    y_prob : array, shape (n_samples,)
+        Probabilities of the positive class.
+
+    bin_size : int, optional (default=2)
+        Number of samples per overlapping bin.
+
+    Returns
+    -------
+    score : float
+        Calibration loss.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.metrics import calibration_loss
+    >>> y_true = np.array([0, 1, 1, 0])
+    >>> y_prob = np.array([0.1, 0.9, 0.8, 0.3])
+    >>> calibration_loss(y_true, y_prob, bin_size=1)
+    0.175
+    >>> calibration_loss(y_true, y_prob, bin_size=2)
+    0.5333333333333333
+    """
+    y_true = column_or_1d(y_true)
+    y_prob = column_or_1d(y_prob)
+    check_consistent_length(y_true, y_prob)
+
+    n_bins = len(y_true) - bin_size + 1
+    pos_loss = 0.0
+    neg_loss = 0.0
+
+    for bin_start in range(n_bins):
+        bin_end = bin_start + bin_size
+
+        # Mean absolute difference between the predicted probability of the
+        # positive class and its empirical rate within the bin.
+        actual_per_pos_class = y_true[bin_start:bin_end].sum() / bin_size
+        pos_loss += abs(y_prob[bin_start:bin_end]
+                        - actual_per_pos_class).sum()
+
+        # The same comparison from the point of view of the negative class.
+        actual_per_neg_class = (bin_size
+                                - y_true[bin_start:bin_end].sum()) / bin_size
+        neg_loss += abs((1 - y_prob[bin_start:bin_end])
+                        - actual_per_neg_class).sum()
+
+    pos_loss /= n_bins
+    neg_loss /= n_bins
+    return 0.5 * (pos_loss + neg_loss)
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
index cae78e721bc8e..aca45c889815b 100644
--- a/sklearn/metrics/tests/test_classification.py
+++ b/sklearn/metrics/tests/test_classification.py
@@ -28,6 +28,7 @@
 from sklearn.utils.mocking import MockDataFrame
 
 from sklearn.metrics import accuracy_score
 from sklearn.metrics import average_precision_score
+from sklearn.metrics import calibration_loss
 from sklearn.metrics import classification_report
 from sklearn.metrics import cohen_kappa_score
@@ -1635,3 +1636,34 @@ def test_brier_score_loss():
     # calculate even if only single class in y_true (#6980)
     assert_almost_equal(brier_score_loss([0], [0.5]), 0.25)
     assert_almost_equal(brier_score_loss([1], [0.5]), 0.25)
+
+
+def test_calibration_loss():
+    # Check calibration_loss on a small binary example.
+    y_true = np.array([0, 1, 1, 0, 1, 1])
+    y_pred = np.array([0.1, 0.8, 0.9, 0.3, 1.0, 0.95])
+    calibration_loss_val = calibration_loss(y_true, y_pred, bin_size=2)
+    assert_almost_equal(calibration_loss_val, 0.47, decimal=4)
+
+
+def test_balanced_accuracy_score_unseen():
+    assert_warns_message(UserWarning, 'y_pred contains classes not in y_true',
+                         balanced_accuracy_score, [0, 0, 0], [0, 0, 1])
+
+
+@pytest.mark.parametrize('y_true,y_pred',
+                         [
+                          (['a', 'b', 'a', 'b'], ['a', 'a', 'a', 'b']),
+                          (['a', 'b', 'c', 'b'], ['a', 'a', 'a', 'b']),
+                          (['a', 'a', 'a', 'b'], ['a', 'b', 'c', 'b']),
+                         ])
+def test_balanced_accuracy_score(y_true, y_pred):
+    macro_recall = recall_score(y_true, y_pred, average='macro',
+                                labels=np.unique(y_true))
+    with ignore_warnings():
+        # Warnings are tested in test_balanced_accuracy_score_unseen
+        balanced = balanced_accuracy_score(y_true, y_pred)
+    assert balanced == pytest.approx(macro_recall)
+    adjusted = balanced_accuracy_score(y_true, y_pred, adjusted=True)
+    chance = balanced_accuracy_score(y_true, np.full_like(y_true, y_true[0]))
+    assert adjusted == (balanced - chance) / (1 - chance)
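
Review note: as a sanity check on the overlapping-bin CAL computation added in `calibration_loss`, here is a standalone sketch that mirrors the patched logic and reproduces the docstring values. `cal_sketch` is a hypothetical helper used only for illustration; it is not part of the patch and assumes plain NumPy inputs with no validation.

```python
import numpy as np


def cal_sketch(y_true, y_prob, bin_size=2):
    """Standalone sketch mirroring the overlapping-bin CAL loss in the patch."""
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)
    n_bins = len(y_true) - bin_size + 1
    pos_loss = neg_loss = 0.0
    for start in range(n_bins):
        stop = start + bin_size
        # Empirical positive rate inside the sliding window of bin_size samples,
        # compared with the predicted probabilities in the same window.
        pos_rate = y_true[start:stop].sum() / bin_size
        pos_loss += np.abs(y_prob[start:stop] - pos_rate).sum()
        # Mirror-image term for the negative class.
        neg_loss += np.abs((1.0 - y_prob[start:stop]) - (1.0 - pos_rate)).sum()
    return 0.5 * (pos_loss + neg_loss) / n_bins


y_true = np.array([0, 1, 1, 0])
y_prob = np.array([0.1, 0.9, 0.8, 0.3])
print(cal_sketch(y_true, y_prob, bin_size=1))  # ~0.175, matches the docstring example
print(cal_sketch(y_true, y_prob, bin_size=2))  # ~0.5333, matches the docstring example
```

Note that for binary targets the per-bin positive and negative terms are equal (|(1 - p) - (1 - a)| = |p - a|), so averaging the two halves leaves the value unchanged; the sketch keeps both terms only to stay close to the patched implementation.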