[MRG+2] ENH multiclass balanced accuracy #10587


Merged: 17 commits, Jul 27, 2018
2 changes: 2 additions & 0 deletions doc/glossary.rst
@@ -1375,6 +1375,8 @@ functions or non-estimator constructors.
equal weight by giving each sample a weight inversely related
to its class's prevalence in the training data:
``n_samples / (n_classes * np.bincount(y))``.
**Note** however that this rebalancing does not take the weight of
samples in each class into account.
@jnothman (Member, Author) commented on Feb 5, 2018:
Perhaps we should have a "weight-balanced" option for class_weight. It would be interesting to see if that improved imbalanced boosting.

@jnothman (Member, Author) replied:
Apparently my phone wrote "weight-loss card" (!) there. Amended.
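
As a quick illustrative check of the ``n_samples / (n_classes * np.bincount(y))``
formula quoted above (an editorial sketch, not part of this diff), three samples of
class 0 and one of class 1 give a down-weighted majority class and an up-weighted
minority class:

>>> import numpy as np
>>> y = np.array([0, 0, 0, 1])
>>> n_samples, n_classes = len(y), 2
>>> (n_samples / (n_classes * np.bincount(y))).tolist()
[0.6666666666666666, 2.0]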


For multioutput classification, a list of dicts is used to specify
weights for each output. For example, for four-class multilabel
87 changes: 44 additions & 43 deletions doc/modules/model_evaluation.rst
@@ -417,66 +417,67 @@ In the multilabel case with binary label indicators: ::
Balanced accuracy score
-----------------------

The :func:`balanced_accuracy_score` function computes the
`balanced accuracy <https://en.wikipedia.org/wiki/Accuracy_and_precision>`_, which
avoids inflated performance estimates on imbalanced datasets. It is defined as the
arithmetic mean of `sensitivity <https://en.wikipedia.org/wiki/Sensitivity_and_specificity>`_
(true positive rate) and `specificity <https://en.wikipedia.org/wiki/Sensitivity_and_specificity>`_
(true negative rate), or the average of `recall scores <https://en.wikipedia.org/wiki/Precision_and_recall>`_
obtained on either class.

If the classifier performs equally well on either class, this term reduces to the
conventional accuracy (i.e., the number of correct predictions divided by the total
number of predictions). In contrast, if the conventional accuracy is above chance only
because the classifier takes advantage of an imbalanced test set, then the balanced
accuracy, as appropriate, will drop to 50%.

If :math:`\hat{y}_i\in\{0,1\}` is the predicted value of
the :math:`i`-th sample and :math:`y_i\in\{0,1\}` is the corresponding true value,
then the balanced accuracy is defined as
The :func:`balanced_accuracy_score` function computes the `balanced accuracy
<https://en.wikipedia.org/wiki/Accuracy_and_precision>`_, which avoids inflated
performance estimates on imbalanced datasets. It is the macro-average of recall
scores per class or, equivalently, raw accuracy where each sample is weighted
according to the inverse prevalence of its true class.
Thus for balanced datasets, the score is equal to accuracy.
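
For instance, this equivalence can be checked on a small imbalanced multiclass
example (an illustrative sketch, not part of this diff):

>>> import numpy as np
>>> from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score
>>> y_true = np.array([0, 0, 1, 1, 2, 2, 2, 2])
>>> y_pred = np.array([0, 0, 1, 0, 2, 2, 2, 0])
>>> balanced_accuracy_score(y_true, y_pred)
0.75
>>> recall_score(y_true, y_pred, average='macro')   # macro-averaged recall
0.75
>>> weights = 1 / np.bincount(y_true)[y_true]       # inverse prevalence of each sample's class
>>> accuracy_score(y_true, y_pred, sample_weight=weights)
0.75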

In the binary case, balanced accuracy is equal to the arithmetic mean of
`sensitivity <https://en.wikipedia.org/wiki/Sensitivity_and_specificity>`_
(true positive rate) and `specificity
<https://en.wikipedia.org/wiki/Sensitivity_and_specificity>`_ (true negative
rate), or the area under the ROC curve with binary predictions rather than
scores.

If the classifier performs equally well on either class, this term reduces to
the conventional accuracy (i.e., the number of correct predictions divided by
the total number of predictions).

In contrast, if the conventional accuracy is above chance only because the
classifier takes advantage of an imbalanced test set, then the balanced
accuracy, as appropriate, will drop to :math:`\frac{1}{\text{n_classes}}`.

The score ranges from 0 to 1, or when ``adjusted=True`` is used, it is rescaled to
the range :math:`\frac{1}{1 - \text{n_classes}}` to 1, inclusive, with
performance at random scoring 0.
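
For example (an illustrative sketch, not part of this diff), a classifier that
always predicts the majority class of an imbalanced four-class problem still
obtains a high conventional accuracy, but only chance-level balanced accuracy,
and an adjusted score of 0:

>>> from sklearn.metrics import accuracy_score, balanced_accuracy_score
>>> y_true = [0] * 7 + [1, 2, 3]
>>> y_pred = [0] * 10   # always predict the majority class
>>> accuracy_score(y_true, y_pred)
0.7
>>> balanced_accuracy_score(y_true, y_pred)
0.25
>>> balanced_accuracy_score(y_true, y_pred, adjusted=True)
0.0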

If :math:`y_i` is the true value of the :math:`i`-th sample, and :math:`w_i`
is the corresponding sample weight, then we adjust the sample weight to:

.. math::

\texttt{balanced-accuracy}(y, \hat{y}) = \frac{1}{2} \left(\frac{\sum_i 1(\hat{y}_i = 1 \land y_i = 1)}{\sum_i 1(y_i = 1)} + \frac{\sum_i 1(\hat{y}_i = 0 \land y_i = 0)}{\sum_i 1(y_i = 0)}\right)
\hat{w}_i = \frac{w_i}{\sum_j{1(y_j = y_i) w_j}}
@jnothman (Member, Author) commented:

Should I give the equation assuming w_i=1?

A Contributor replied:

I think it's fine if we keep the general formula.


where :math:`1(x)` is the `indicator function <https://en.wikipedia.org/wiki/Indicator_function>`_.
Given predicted :math:`\hat{y}_i` for sample :math:`i`, balanced accuracy is
defined as:

Under this definition, the balanced accuracy coincides with :func:`roc_auc_score`
given binary ``y_true`` and ``y_pred``:
.. math::

>>> import numpy as np
>>> from sklearn.metrics import balanced_accuracy_score, roc_auc_score
>>> y_true = [0, 1, 0, 0, 1, 0]
>>> y_pred = [0, 1, 0, 0, 0, 1]
>>> balanced_accuracy_score(y_true, y_pred)
0.625
>>> roc_auc_score(y_true, y_pred)
0.625
\texttt{balanced-accuracy}(y, \hat{y}, w) = \frac{1}{\sum{\hat{w}_i}} \sum_i 1(\hat{y}_i = y_i) \hat{w}_i
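
As an editorial aside (not part of this diff): with unit sample weights
(:math:`w_i = 1`), :math:`\hat{w}_i = 1 / n_{y_i}`, where :math:`n_k` counts the
samples whose true class is :math:`k`, and :math:`\sum_i \hat{w}_i = \text{n_classes}`,
so the formula above reduces to the macro-average of per-class recall:

.. math::

\texttt{balanced-accuracy}(y, \hat{y}) = \frac{1}{\text{n_classes}} \sum_k \frac{\sum_{i : y_i = k} 1(\hat{y}_i = k)}{n_k}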

(but in general, :func:`roc_auc_score` takes as its second argument non-binary scores).
With ``adjusted=True``, balanced accuracy reports the relative increase from
:math:`\texttt{balanced-accuracy}(y, \mathbf{0}, w) =
\frac{1}{\text{n_classes}}`. In the binary case, this is also known as
`*Youden's J statistic* <https://en.wikipedia.org/wiki/Youden%27s_J_statistic>`_, or *informedness*.
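
A quick illustrative check (not part of this diff) on a small binary example
where sensitivity is 0.5 and specificity is 0.75:

>>> from sklearn.metrics import balanced_accuracy_score
>>> y_true = [0, 1, 0, 0, 1, 0]
>>> y_pred = [0, 1, 0, 0, 0, 1]
>>> balanced_accuracy_score(y_true, y_pred, adjusted=True)   # 0.5 + 0.75 - 1
0.25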

.. note::

Currently this score function is only defined for binary classification problems, you
may need to wrap it by yourself if you want to use it for multilabel problems.
The multiclass definition here seems the most reasonable extension of the
metric used in binary classification, though there is no certain consensus
in the literature:

There is no clear consensus on the definition of a balanced accuracy for the
multiclass setting. Here are some definitions that can be found in the literature:

* Macro-average recall as described in [Mosley2013]_, [Kelleher2015]_ and [Guyon2015]_:
the recall for each class is computed independently and the average is taken over all classes.
In [Guyon2015]_, the macro-average recall is then adjusted to ensure that random predictions
have a score of :math:`0` while perfect predictions have a score of :math:`1`.
One can compute the macro-average recall using ``recall_score(average="macro")`` in :func:`recall_score`.
* Our definition: [Mosley2013]_, [Kelleher2015]_ and [Guyon2015]_, where
[Guyon2015]_ adopts the adjusted version to ensure that random predictions
have a score of :math:`0` and perfect predictions have a score of :math:`1`.
* Class balanced accuracy as described in [Mosley2013]_: the minimum between the precision
and the recall for each class is computed. Those values are then averaged over the total
number of classes to get the balanced accuracy.
* Balanced Accuracy as described in [Urbanowicz2015]_: the average of sensitivity and selectivity
* Balanced Accuracy as described in [Urbanowicz2015]_: the average of sensitivity and specificity
is computed for each class and then averaged over total number of classes.

Note that none of these different definitions are currently implemented within
the :func:`balanced_accuracy_score` function.
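
If, say, the class balanced accuracy of [Mosley2013]_ is wanted, it can be
sketched from existing metrics (an illustrative approximation of the description
above, not a scikit-learn API):

>>> import numpy as np
>>> from sklearn.metrics import precision_score, recall_score
>>> y_true = ['a', 'b', 'c', 'b', 'c']
>>> y_pred = ['a', 'b', 'c', 'a', 'b']
>>> p = precision_score(y_true, y_pred, average=None, labels=['a', 'b', 'c'])
>>> r = recall_score(y_true, y_pred, average=None, labels=['a', 'b', 'c'])
>>> float(np.mean(np.minimum(p, r)))   # average of per-class min(precision, recall)
0.5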

.. topic:: References:

.. [Guyon2015] I. Guyon, K. Bennett, G. Cawley, H.J. Escalante, S. Escalera, T.K. Ho, N. Macià,
5 changes: 3 additions & 2 deletions doc/whats_new/v0.20.rst
@@ -134,8 +134,9 @@ Model evaluation
evaluation of clustering models. :issue:`10827` by :user:`Luis Osa <logc>`.

- Added the :func:`metrics.balanced_accuracy_score` metric and a corresponding
``'balanced_accuracy'`` scorer for binary classification.
:issue:`8066` by :user:`xyguo` and :user:`Aman Dalmia <dalmia>`.
``'balanced_accuracy'`` scorer for binary and multiclass classification.
:issue:`8066` by :user:`xyguo` and :user:`Aman Dalmia <dalmia>`, and
:issue:`10587` by `Joel Nothman`_.

Decomposition, manifold learning and clustering

47 changes: 28 additions & 19 deletions sklearn/metrics/classification.py
@@ -1365,16 +1365,15 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
return r


def balanced_accuracy_score(y_true, y_pred, sample_weight=None):
def balanced_accuracy_score(y_true, y_pred, sample_weight=None,
adjusted=False):
"""Compute the balanced accuracy

The balanced accuracy is used in binary classification problems to deal
with imbalanced datasets. It is defined as the arithmetic mean of
sensitivity (true positive rate) and specificity (true negative rate),
or the average recall obtained on either class. It is also equal to the
ROC AUC score given binary inputs.
The balanced accuracy in binary and multiclass classification problems is
used to deal with imbalanced datasets. It is defined as the average of recall
obtained on each class.

The best value is 1 and the worst value is 0.
The best value is 1 and the worst value is 0 when ``adjusted=False``.

Read more in the :ref:`User Guide <balanced_accuracy_score>`.

@@ -1389,10 +1388,13 @@ def balanced_accuracy_score(y_true, y_pred, sample_weight=None):
sample_weight : array-like of shape = [n_samples], optional
Sample weights.

adjusted : bool, default=False
When true, the result is adjusted for chance, so that random
performance would score 0, and perfect performance scores 1.

Returns
-------
balanced_accuracy : float.
The average of sensitivity and specificity
balanced_accuracy : float

See also
--------
@@ -1404,6 +1406,10 @@ def balanced_accuracy_score(y_true, y_pred, sample_weight=None):
The balanced accuracy and its posterior distribution.
Proceedings of the 20th International Conference on Pattern
Recognition, 3121-24.
.. [2] John. D. Kelleher, Brian Mac Namee, Aoife D'Arcy, (2015).
`Fundamentals of Machine Learning for Predictive Data Analytics:
Algorithms, Worked Examples, and Case Studies
<https://mitpress.mit.edu/books/fundamentals-machine-learning-predictive-data-analytics>`_.

Examples
--------
@@ -1414,16 +1420,19 @@ def balanced_accuracy_score(y_true, y_pred, sample_weight=None):
0.625

"""
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)

    if y_type != 'binary':
        raise ValueError('Balanced accuracy is only meaningful '
                         'for binary classification problems.')
    # simply wrap the ``recall_score`` function
    return recall_score(y_true, y_pred,
                        pos_label=None,
                        average='macro',
                        sample_weight=sample_weight)
    C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight)
    with np.errstate(divide='ignore', invalid='ignore'):
        # per-class recall: diagonal of the confusion matrix over its row sums
        per_class = np.diag(C) / C.sum(axis=1)
    if np.any(np.isnan(per_class)):
        warnings.warn('y_pred contains classes not in y_true')
        per_class = per_class[~np.isnan(per_class)]
    score = np.mean(per_class)
    if adjusted:
        n_classes = len(per_class)
        chance = 1 / n_classes
        score -= chance
        score /= 1 - chance
    return score


def classification_report(y_true, y_pred, labels=None, target_names=None,
28 changes: 26 additions & 2 deletions sklearn/metrics/tests/test_classification.py
@@ -1,11 +1,11 @@
from __future__ import division, print_function

import numpy as np
from scipy import linalg
from functools import partial
from itertools import product
import warnings

import numpy as np
from scipy import linalg
import pytest

from sklearn import datasets
@@ -31,6 +31,7 @@

from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
@@ -1675,3 +1676,26 @@ def test_brier_score_loss():
# calculate even if only single class in y_true (#6980)
assert_almost_equal(brier_score_loss([0], [0.5]), 0.25)
assert_almost_equal(brier_score_loss([1], [0.5]), 0.25)


def test_balanced_accuracy_score_unseen():
    assert_warns_message(UserWarning, 'y_pred contains classes not in y_true',
                         balanced_accuracy_score, [0, 0, 0], [0, 0, 1])


@pytest.mark.parametrize('y_true,y_pred',
                         [
                             (['a', 'b', 'a', 'b'], ['a', 'a', 'a', 'b']),
                             (['a', 'b', 'c', 'b'], ['a', 'a', 'a', 'b']),
                             (['a', 'a', 'a', 'b'], ['a', 'b', 'c', 'b']),
                         ])
def test_balanced_accuracy_score(y_true, y_pred):
    macro_recall = recall_score(y_true, y_pred, average='macro',
                                labels=np.unique(y_true))
    with ignore_warnings():
        # Warnings are tested in test_balanced_accuracy_score_unseen
        balanced = balanced_accuracy_score(y_true, y_pred)
    assert balanced == pytest.approx(macro_recall)
    adjusted = balanced_accuracy_score(y_true, y_pred, adjusted=True)
    chance = balanced_accuracy_score(y_true, np.full_like(y_true, y_true[0]))
    assert adjusted == (balanced - chance) / (1 - chance)
A Member commented:

Any reason we can't use == when adjusted=False?

4 changes: 3 additions & 1 deletion sklearn/metrics/tests/test_common.py
@@ -105,6 +105,8 @@
CLASSIFICATION_METRICS = {
"accuracy_score": accuracy_score,
"balanced_accuracy_score": balanced_accuracy_score,
"adjusted_balanced_accuracy_score": partial(balanced_accuracy_score,
adjusted=True),
"unnormalized_accuracy_score": partial(accuracy_score, normalize=False),
"confusion_matrix": confusion_matrix,
"hamming_loss": hamming_loss,
@@ -217,7 +219,6 @@
# Those metrics don't support multiclass inputs
METRIC_UNDEFINED_MULTICLASS = {
"brier_score_loss",
"balanced_accuracy_score",

"roc_auc_score",
"micro_roc_auc",
@@ -362,6 +363,7 @@
# metric(y_true, y_pred) != metric(y_pred, y_true).
NOT_SYMMETRIC_METRICS = {
"balanced_accuracy_score",
"adjusted_balanced_accuracy_score",
"explained_variance_score",
"r2_score",
"confusion_matrix",