From a66618039050648c79f05e56241242784cebb68a Mon Sep 17 00:00:00 2001 From: Maskani Filali Mohamed Date: Tue, 16 Jan 2018 11:23:47 -0500 Subject: [PATCH 01/93] Add Hand & Till (OvO) and Provost & Domingos (OvR) implementations --- sklearn/metrics/base.py | 129 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 128 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index b8bbab30930b4..79ff07c7d9537 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -13,11 +13,13 @@ # License: BSD 3 clause from __future__ import division +import itertools import numpy as np from ..utils import check_array, check_consistent_length from ..utils.multiclass import type_of_target +from ..preprocessing import LabelBinarizer def _average_binary_score(binary_metric, y_true, y_score, average, @@ -33,7 +35,8 @@ def _average_binary_score(binary_metric, y_true, y_score, average, Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions. - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, + default 'macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -122,3 +125,127 @@ def _average_binary_score(binary_metric, y_true, y_score, average, return np.average(score, weights=average_weight) else: return score + + +def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): + """Uses the binary metric for one-vs-one multiclass classification, + where the score is computed according to the Hand & Till (2001) algorithm. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True multiclass labels. + Assumes labels have been recoded to 0 to n_classes. + + y_score : array, shape = [n_samples, n_classes] + Target scores corresponding to probability estimates of a sample + belonging to a particular class + + average : 'macro' or 'weighted', default='macro' + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the prevalence + of the classes. + + binary_metric : callable, the binary metric function to use. + Accepts the following as input + y_true_target : array, shape = [n_samples_target] + Some sub-array of y_true for a pair of classes designated + positive and negative in the one-vs-one scheme. 
+ y_score_target : array, shape = [n_samples_target] + Scores corresponding to the probability estimates + of a sample belonging to the designated positive class label + + Returns + ------- + score : float + Average the sum of pairwise binary metric scores + """ + n_classes = len(np.unique(y_true)) + n_pairs = n_classes * (n_classes - 1) // 2 + prevalence = np.empty(n_pairs) + pair_scores = np.empty(n_pairs) + + ix = 0 + for a, b in itertools.combinations(range(n_classes), 2): + a_mask = y_true == a + ab_mask = np.logical_or(a_mask, y_true == b) + + prevalence[ix] = np.sum(ab_mask) / len(y_true) + + y_score_filtered = y_score[ab_mask] + + a_true = a_mask[ab_mask] + b_true = np.logical_not(a_true) + + a_true_score = binary_metric( + a_true, y_score_filtered[:, a]) + b_true_score = binary_metric( + b_true, y_score_filtered[:, b]) + binary_avg_score = (a_true_score + b_true_score) / 2 + pair_scores[ix] = binary_avg_score + + ix += 1 + return (np.average(pair_scores, weights=prevalence) + if average == "weighted" else np.average(pair_scores)) + + +def _average_multiclass_ovr_score(binary_metric, y_true, y_score, average): + """Uses the binary metric for one-vs-rest multi-class classification, + where the score is computed according to the Provost & Domingos (2001) + definition of the AUC in multi-class settings (when `average` parameter is + set to `weighted`). + + For each class, the ROC curve is generated and the AUC computed. + The output is the average of the individual AUCs weighted by the prevalence + of the classes in the data. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True multiclass labels. + Assumes labels have been recoded to 0 to n_classes. + + y_score : array, shape = [n_samples, n_classes] + Target scores corresponding to probability estimates of a sample + belonging to a particular class. + + average : 'macro' or 'weighted', default='macro' + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the prevalence + of the classes in the dataset. + + binary_metric : callable, the binary metric function to use. + Accepts the following as input + y_true_target : array, shape = [n_samples_target] + Some sub-array of y_true for a pair of classes designated + positive and negative in the one-vs-one scheme. 
+ y_score_target : array, shape = [n_samples_target] + Scores corresponding to the probability estimates + of a sample belonging to the designated positive class label + + Returns + ------- + score : float + Average of binary metric scores + """ + n_classes = len(np.unique(y_true)) + scores = np.zeros((n_classes,)) + + y_true_multilabel = LabelBinarizer().fit_transform(y_true) + prevalence = np.sum(y_true_multilabel, axis=0) / y_true_multilabel.shape[0] + + for c in range(n_classes): + y_true_c = y_true_multilabel.take([c], axis=1).ravel() + y_score_c = y_score.take([c], axis=1).ravel() + scores[c] = binary_metric(y_true_c, y_score_c) + + return (np.average(scores, weights=prevalence) + if average == "weighted" else np.average(scores)) From 118a7003bc1a2f723a91268245700c1bd96f39b2 Mon Sep 17 00:00:00 2001 From: Maskani Filali Mohamed Date: Tue, 16 Jan 2018 11:28:03 -0500 Subject: [PATCH 02/93] Add multi-class implementation in roc_auc_score method --- sklearn/metrics/ranking.py | 93 +++++++++++++++++++++++++++++--------- 1 file changed, 72 insertions(+), 21 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 1d8d37954b99c..3d7277466a713 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -31,9 +31,10 @@ from ..utils.extmath import stable_cumsum from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning -from ..preprocessing import label_binarize +from ..preprocessing import LabelBinarizer, label_binarize -from .base import _average_binary_score +from .base import _average_binary_score, _average_multiclass_ovo_score, \ + _average_multiclass_ovr_score def auc(x, y, reorder='deprecated'): @@ -157,7 +158,8 @@ def average_precision_score(y_true, y_score, average="macro", class, confidence values, or non-thresholded measure of decisions (as returned by "decision_function" on some classifiers). - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, + default 'macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -217,28 +219,39 @@ def _binary_uninterpolated_average_precision( sample_weight=sample_weight) -def roc_auc_score(y_true, y_score, average="macro", sample_weight=None): - """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) - from prediction scores. - - Note: this implementation is restricted to the binary classification task - or multilabel classification task in label indicator format. +def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", + sample_weight=None): + """Compute Area Under the Curve (AUC) from prediction scores Read more in the :ref:`User Guide `. Parameters ---------- y_true : array, shape = [n_samples] or [n_samples, n_classes] - True binary labels or binary label indicators. + True binary labels in binary label indicators. + The multiclass case expects shape = [n_samples] and labels + with values from 0 to (n_classes-1), inclusive. y_score : array, shape = [n_samples] or [n_samples, n_classes] Target scores, can either be probability estimates of the positive class, confidence values, or non-thresholded measure of decisions - (as returned by "decision_function" on some classifiers). For binary - y_true, y_score is supposed to be the score of the class with greater - label. 
- - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + (as returned by "decision_function" on some classifiers). + The multiclass case expects shape = [n_samples, n_classes] + where the scores correspond to probability estimates. + + multiclass : string, 'ovr' or 'ovo', default 'ovr' + Note: multiclass ROC AUC currently only handles the 'macro' and + 'weighted' averages. + + ``'ovr'``: + Calculate metrics for the multiclass case using the one-vs-rest + approach. + ``'ovo'``: + Calculate metrics for the multiclass case using the one-vs-one + approach. + + average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, + default 'macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -295,13 +308,51 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): return auc(fpr, tpr) y_type = type_of_target(y_true) - if y_type == "binary": - labels = np.unique(y_true) - y_true = label_binarize(y_true, labels)[:, 0] + y_true = check_array(y_true, ensure_2d=False) + y_score = check_array(y_score, ensure_2d=False) - return _average_binary_score( - _binary_roc_auc_score, y_true, y_score, average, - sample_weight=sample_weight) + if y_type == "multiclass" or (y_type == "binary" and + y_score.ndim == 2 and + y_score.shape[1] > 2): + # validation of the input y_score + if not np.allclose(1, y_score.sum(axis=1)): + raise ValueError("Target scores should sum up to 1.0 for all" + "samples.") + # validation for multiclass parameter specifications + average_options = ("macro", "weighted") + if average not in average_options: + raise ValueError("Parameter 'average' must be one of {0} for" + " multiclass problems.".format(average_options)) + multiclass_options = ("ovo", "ovr") + if multiclass not in multiclass_options: + raise ValueError("Parameter multiclass='{0}' is not supported" + " for multiclass ROC AUC. 'multiclass' must be" + " one of {1}.".format( + multiclass, multiclass_options)) + if sample_weight is not None: + # TODO: check if only in ovo case, if yes, do not raise when ovr + raise ValueError("Parameter 'sample_weight' is not supported" + " for multiclass one-vs-one ROC AUC." 
+ " 'sample_weight' must be None in this case.") + + if multiclass == "ovo": + # Hand & Till (2001) implementation + return _average_multiclass_ovo_score( + _binary_roc_auc_score, y_true, y_score, average) + elif multiclass == "ovr" and average == "weighted": + # Provost & Domingos (2001) implementation + return _average_multiclass_ovr_score( + _binary_roc_auc_score, y_true, y_score, average) + else: + y_true = y_true.reshape((-1, 1)) + y_true_multilabel = LabelBinarizer().fit_transform(y_true) + return _average_binary_score( + _binary_roc_auc_score, y_true_multilabel, y_score, average, + sample_weight=sample_weight) + else: + return _average_binary_score( + _binary_roc_auc_score, y_true, y_score, average, + sample_weight=sample_weight) def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): From 3371b1dd41416b20a4166fc84d1ad78806cd7b7b Mon Sep 17 00:00:00 2001 From: Maskani Filali Mohamed Date: Tue, 16 Jan 2018 11:31:03 -0500 Subject: [PATCH 03/93] Add tests for multi-class settings OvO and OvR --- sklearn/metrics/tests/test_ranking.py | 130 ++++++++++++++++++++++++-- 1 file changed, 120 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index a17935ae7de17..f66c39fbe256b 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -442,6 +442,125 @@ def test_deprecated_auc_reorder(): [1, 2], [2, 3], reorder=True) +def test_multi_ovo_auc_toydata(): + # Tests the one-vs-one multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_true = np.array([0, 1, 0, 2]) + n_labels = len(np.unique(y_true)) + y_scores = np.array( + [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) + + # Used to compute the expected output. + # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1], [0.1, 0.3, 0.35]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0], [0.8, 0.4, 0.5]) + average_score_01 = (score_01 + score_10) / 2. + + # Consider labels 0 and 2: + score_02 = roc_auc_score([1, 1, 0], [0.1, 0.35, 0]) + score_20 = roc_auc_score([0, 0, 1], [0.1, 0.15, 0.8]) + average_score_02 = (score_02 + score_20) / 2. + + # Consider labels 1 and 2: + score_12 = roc_auc_score([1, 0], [0.4, 0.2]) + score_21 = roc_auc_score([0, 1], [0.3, 0.8]) + average_score_12 = (score_12 + score_21) / 2. + + # Unweighted, one-vs-one multiclass ROC AUC algorithm + sum_avg_scores = average_score_01 + average_score_02 + average_score_12 + ovo_unweighted_coefficient = 2. / (n_labels * (n_labels - 1)) + ovo_unweighted_score = ovo_unweighted_coefficient * sum_avg_scores + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovo"), + ovo_unweighted_score) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + # Each term is weighted by the prevalence for the positive label. + pair_scores = [average_score_01, average_score_02, average_score_12] + prevalence = [0.75, 0.75, 0.50] + ovo_weighted_score = np.average(pair_scores, weights=prevalence) + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovo", average="weighted"), + ovo_weighted_score) + + +def test_multi_ovr_auc_toydata(): + # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. 
+ y_true = np.array([0, 1, 2, 2]) + y_scores = np.array( + [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]]) + # Compute the expected result by individually computing the 'one-vs-rest' + # ROC AUC scores for classes 0, 1, and 2. + out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0]) + out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1]) + out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2]) + result_unweighted = (out_0 + out_1 + out_2) / 3. + + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovr"), + result_unweighted) + + # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm + # on the same input (Provost & Domingos, 2001) + result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovr", average="weighted"), + result_weighted) + + +def test_multi_auc_score_under_permutation(): + y_score = np.random.rand(100, 3) + y_score[:, 2] += .1 + y_score[:, 1] -= .1 + y_true = np.argmax(y_score, axis=1) + y_true[np.random.randint(len(y_score), size=20)] = np.random.randint( + 2, size=20) + for multiclass in ['ovr', 'ovo']: + for average in ['macro', 'weighted']: + same_score_under_permutation = None + for perm in [[0, 1, 2], [0, 2, 1], [1, 0, 2], + [1, 2, 0], [2, 0, 1], [2, 1, 0]]: + inv_perm = np.zeros(3, dtype=int) + inv_perm[perm] = np.arange(3) + y_score_perm = y_score[:, inv_perm] + y_true_perm = np.take(perm, y_true) + score = roc_auc_score(y_true_perm, y_score_perm, + multiclass=multiclass, average=average) + if same_score_under_permutation is None: + same_score_under_permutation = score + else: + assert_almost_equal(score, same_score_under_permutation) + + +def test_auc_score_multi_error(): + # Test that roc_auc_score function returns an error when trying + # to compute multiclass AUC for parameters where an output + # is not defined. + rng = check_random_state(404) + y_pred = rng.rand(10) + y_true = rng.randint(0, 3, size=10) + average_error_msg = ("Parameter 'average' must be one of " + "('macro', 'weighted') for multiclass problems.") + assert_raise_message(ValueError, average_error_msg, + roc_auc_score, y_true, y_pred, average="sample") + assert_raise_message(ValueError, average_error_msg, + roc_auc_score, y_true, y_pred, average="micro") + multiclass_error_msg = ("Parameter multiclass='invalid' is not " + "supported for multiclass ROC AUC. 'multiclass' " + "must be one of ('ovo', 'ovr').") + assert_raise_message(ValueError, multiclass_error_msg, + roc_auc_score, y_true, y_pred, multiclass="invalid") + sample_weight_error_msg = ("Parameter 'sample_weight' is not supported " + "for multiclass one-vs-one ROC AUC. " + "'sample_weight' must be None in this case.") + assert_raise_message(ValueError, sample_weight_error_msg, + roc_auc_score, y_true, y_pred, + multiclass="ovo", sample_weight=[]) + + def test_auc_score_non_binary_class(): # Test that roc_auc_score function returns an error when trying # to compute AUC for non-binary class values. 
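# ---------------------------------------------------------------------------
# Illustrative aside, not part of the patch: the unweighted one-vs-one
# (Hand & Till, 2001) score that the toy-data test above checks by hand can
# be sketched with plain numpy plus the existing binary roc_auc_score.  The
# helper name `ovo_auc_sketch` is an assumption made for this example only,
# and labels are assumed to be encoded as 0..n_classes-1.
import itertools

import numpy as np
from sklearn.metrics import roc_auc_score


def ovo_auc_sketch(y_true, y_prob):
    """Average the two directed binary AUCs over every unordered class pair."""
    pair_scores = []
    for a, b in itertools.combinations(np.unique(y_true), 2):
        mask = np.isin(y_true, [a, b])             # keep samples of class a or b
        auc_a = roc_auc_score(y_true[mask] == a, y_prob[mask, a])
        auc_b = roc_auc_score(y_true[mask] == b, y_prob[mask, b])
        pair_scores.append((auc_a + auc_b) / 2.0)  # A(a, b) averaged both ways
    return np.mean(pair_scores)

# With the toy data above, ovo_auc_sketch(y_true, y_scores) should match
# roc_auc_score(y_true, y_scores, multiclass="ovo").
# ---------------------------------------------------------------------------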
@@ -457,10 +576,6 @@ def test_auc_score_non_binary_class(): y_true = -np.ones(10, dtype="int") assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) - # y_true contains three different class values - y_true = rng.randint(0, 3, size=10) - assert_raise_message(ValueError, "multiclass format is not supported", - roc_auc_score, y_true, y_pred) clean_warning_registry() with warnings.catch_warnings(record=True): @@ -477,11 +592,6 @@ def test_auc_score_non_binary_class(): assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) - # y_true contains three different class values - y_true = rng.randint(0, 3, size=10) - assert_raise_message(ValueError, "multiclass format is not supported", - roc_auc_score, y_true, y_pred) - def test_binary_clf_curve(): rng = check_random_state(404) @@ -491,6 +601,7 @@ def test_binary_clf_curve(): assert_raise_message(ValueError, msg, precision_recall_curve, y_true, y_pred) + def test_precision_recall_curve(): y_true, _, probas_pred = make_prediction(binary=True) _test_precision_recall_curve(y_true, probas_pred) @@ -675,7 +786,6 @@ def test_score_scale_invariance(): # issue #3864 (and others), where overly aggressive rounding was causing # problems for users with very small y_score values y_true, _, probas_pred = make_prediction(binary=True) - roc_auc = roc_auc_score(y_true, probas_pred) roc_auc_scaled_up = roc_auc_score(y_true, 100 * probas_pred) roc_auc_scaled_down = roc_auc_score(y_true, 1e-6 * probas_pred) From d74ce160a0fcc576e44de28bcea0135fc33fc0fb Mon Sep 17 00:00:00 2001 From: Maskani Filali Mohamed Date: Wed, 17 Jan 2018 15:06:47 -0500 Subject: [PATCH 04/93] Fix binary case roc computation --- sklearn/metrics/ranking.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 3d7277466a713..ea479fd50be2f 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -350,6 +350,10 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): _binary_roc_auc_score, y_true_multilabel, y_score, average, sample_weight=sample_weight) else: + # Binary case + labels = np.unique(y_true) + y_true = label_binarize(y_true, labels)[:, 0] + return _average_binary_score( _binary_roc_auc_score, y_true, y_score, average, sample_weight=sample_weight) From 805d80483d51799ebdcbc3a255d26b34ac0b7324 Mon Sep 17 00:00:00 2001 From: Maskani Filali Mohamed Date: Wed, 17 Jan 2018 15:28:08 -0500 Subject: [PATCH 05/93] Make scores add up to 1.0 --- sklearn/metrics/tests/test_ranking.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index f66c39fbe256b..e21274db545fe 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -513,8 +513,10 @@ def test_multi_ovr_auc_toydata(): def test_multi_auc_score_under_permutation(): y_score = np.random.rand(100, 3) - y_score[:, 2] += .1 - y_score[:, 1] -= .1 + # Normalize the scores for each row + row_sums = y_score.sum(axis=1) + y_score = y_score / row_sums[:, np.newaxis] + # Generate the true labels y_true = np.argmax(y_score, axis=1) y_true[np.random.randint(len(y_score), size=20)] = np.random.randint( 2, size=20) From 2bd693ecad3b0d009994b42d1bea4a0de42203ca Mon Sep 17 00:00:00 2001 From: Maskani Filali Mohamed Date: Wed, 17 Jan 2018 15:45:25 -0500 Subject: [PATCH 06/93] Fix typo --- sklearn/metrics/tests/test_ranking.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index e21274db545fe..018f271de362c 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -547,7 +547,7 @@ def test_auc_score_multi_error(): average_error_msg = ("Parameter 'average' must be one of " "('macro', 'weighted') for multiclass problems.") assert_raise_message(ValueError, average_error_msg, - roc_auc_score, y_true, y_pred, average="sample") + roc_auc_score, y_true, y_pred, average="samples") assert_raise_message(ValueError, average_error_msg, roc_auc_score, y_true, y_pred, average="micro") multiclass_error_msg = ("Parameter multiclass='invalid' is not " From fc54dde81db7416ec85f88709dd54aedaca61dea Mon Sep 17 00:00:00 2001 From: Maskani Filali Mohamed Date: Wed, 17 Jan 2018 17:18:01 -0500 Subject: [PATCH 07/93] Differenciate binary case explicitly to avoid error when multilabel-indicator format --- sklearn/metrics/ranking.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index ea479fd50be2f..6d8d6c05bd5ce 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -349,11 +349,13 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): return _average_binary_score( _binary_roc_auc_score, y_true_multilabel, y_score, average, sample_weight=sample_weight) - else: - # Binary case + elif y_type == "binary": labels = np.unique(y_true) y_true = label_binarize(y_true, labels)[:, 0] - + return _average_binary_score( + _binary_roc_auc_score, y_true, y_score, average, + sample_weight=sample_weight) + else: return _average_binary_score( _binary_roc_auc_score, y_true, y_score, average, sample_weight=sample_weight) From 133a09ab0126f5d21e78fa81757ab97126061802 Mon Sep 17 00:00:00 2001 From: Maskani Filali Mohamed Date: Fri, 19 Jan 2018 12:18:54 -0500 Subject: [PATCH 08/93] Fix prediciton scores --- sklearn/metrics/tests/test_ranking.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 018f271de362c..c83e83e1748fa 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -542,7 +542,9 @@ def test_auc_score_multi_error(): # to compute multiclass AUC for parameters where an output # is not defined. 
rng = check_random_state(404) - y_pred = rng.rand(10) + y_pred = rng.rand(10, 3) + row_sums = y_pred.sum(axis=1) + y_pred = y_pred / row_sums[:, np.newaxis] y_true = rng.randint(0, 3, size=10) average_error_msg = ("Parameter 'average' must be one of " "('macro', 'weighted') for multiclass problems.") From d08f084e21833c1f16a6f21f9ce4e11b6199d589 Mon Sep 17 00:00:00 2001 From: Maskani Filali Mohamed Date: Mon, 26 Mar 2018 10:17:20 -0400 Subject: [PATCH 09/93] Fix test error by setting param dtype=None --- sklearn/metrics/ranking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index a6ff03895bcc1..fcf2a61178439 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -338,7 +338,7 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) y_type = type_of_target(y_true) - y_true = check_array(y_true, ensure_2d=False) + y_true = check_array(y_true, ensure_2d=False, dtype=None) y_score = check_array(y_score, ensure_2d=False) if y_type == "multiclass" or (y_type == "binary" and From 4c7a65633b550318ceffad2291fd289fbf48ba03 Mon Sep 17 00:00:00 2001 From: Maskani Filali Mohamed Date: Mon, 26 Mar 2018 10:22:46 -0400 Subject: [PATCH 10/93] Quick fix --- sklearn/metrics/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 79ff07c7d9537..b1aa73c237b11 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -169,8 +169,7 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): prevalence = np.empty(n_pairs) pair_scores = np.empty(n_pairs) - ix = 0 - for a, b in itertools.combinations(range(n_classes), 2): + for ix, (a, b) in enumerate(itertools.combinations(range(n_classes), 2)): a_mask = y_true == a ab_mask = np.logical_or(a_mask, y_true == b) @@ -188,7 +187,6 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): binary_avg_score = (a_true_score + b_true_score) / 2 pair_scores[ix] = binary_avg_score - ix += 1 return (np.average(pair_scores, weights=prevalence) if average == "weighted" else np.average(pair_scores)) From 4723b00b0bdbdd2313895ab1ce6ff70569cd4816 Mon Sep 17 00:00:00 2001 From: Maskani Filali Mohamed Date: Mon, 26 Mar 2018 11:25:53 -0400 Subject: [PATCH 11/93] Raise error for partial computation in multiclass --- sklearn/metrics/ranking.py | 12 +++++++++++- sklearn/metrics/tests/test_ranking.py | 7 +++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index fcf2a61178439..505149749cb55 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -277,7 +277,9 @@ def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", max_fpr : float > 0 and <= 1, optional If not ``None``, the standardized partial AUC [3]_ over the range - [0, max_fpr] is returned. + [0, max_fpr] is returned. If multiclass task, should be either + equal to ``None`` or ``1.0`` as AUC ROC partial computation currently + not supported in this case. 
Returns ------- @@ -348,6 +350,14 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): if not np.allclose(1, y_score.sum(axis=1)): raise ValueError("Target scores should sum up to 1.0 for all" "samples.") + + # do not support partial ROC computation for multiclass + if max_fpr is not None and max_fpr != 1.: + raise ValueError("Partial AUC computation not available in " + "multiclass setting. Parameter 'max_fpr' must be" + " set to `None`. Received `max_fpr={0}` " + "instead.".format(max_fpr)) + # validation for multiclass parameter specifications average_options = ("macro", "weighted") if average not in average_options: diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index b36824a08e534..a503b0b84078e 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -592,6 +592,13 @@ def test_auc_score_multi_error(): assert_raise_message(ValueError, sample_weight_error_msg, roc_auc_score, y_true, y_pred, multiclass="ovo", sample_weight=[]) + partial_comp_error_msg = ("Partial AUC computation not available in " + "multiclass setting. Parameter 'max_fpr' must " + "be set to `None`. Received `max_fpr=0.5` " + "instead.") + assert_raise_message(ValueError, partial_comp_error_msg, + roc_auc_score, y_true, y_pred, + multiclass="ovo", max_fpr=0.5) def test_auc_score_non_binary_class(): From aa6dd49389070ac8457af5af336a2dcf9fd7ff1e Mon Sep 17 00:00:00 2001 From: Maskani Filali Mohamed Date: Mon, 26 Mar 2018 11:28:07 -0400 Subject: [PATCH 12/93] Fix pep8 --- sklearn/metrics/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index b1aa73c237b11..0d0fba50e8737 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -147,8 +147,8 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): mean. This does not take label imbalance into account. Classes are assumed to be uniformly distributed. ``'weighted'``: - Calculate metrics for each label, taking into account the prevalence - of the classes. + Calculate metrics for each label, taking into account the + prevalence of the classes. binary_metric : callable, the binary metric function to use. Accepts the following as input @@ -217,8 +217,8 @@ def _average_multiclass_ovr_score(binary_metric, y_true, y_score, average): mean. This does not take label imbalance into account. Classes are assumed to be uniformly distributed. ``'weighted'``: - Calculate metrics for each label, taking into account the prevalence - of the classes in the dataset. + Calculate metrics for each label, taking into account the + prevalence of the classes in the dataset. binary_metric : callable, the binary metric function to use. 
Accepts the following as input From 5c094cd75ae361a0627bf9b222dbae1bb95146a1 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 5 Oct 2018 12:26:20 -0400 Subject: [PATCH 13/93] try adding ovo multiclass scores --- sklearn/metrics/tests/test_common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 16e4f5d4c76da..ff95cfff4a0d6 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -204,6 +204,9 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "weighted_roc_auc": partial(roc_auc_score, average="weighted"), "samples_roc_auc": partial(roc_auc_score, average="samples"), "micro_roc_auc": partial(roc_auc_score, average="micro"), + "ovo_roc_auc": partial(roc_auc_score, average="macro", multiclass='ovo'), + "ovo_roc_auc_weighted": partial(roc_auc_score, average="weighted", + multiclass='ovo'), "partial_roc_auc": partial(roc_auc_score, max_fpr=0.5), "average_precision_score": From d0393d7ca68105d3e803f76ec542c3e50e0e6757 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 5 Oct 2018 12:38:49 -0400 Subject: [PATCH 14/93] allow roc_auc and macro_roc_auc for multiclass in test_common --- sklearn/metrics/ranking.py | 5 +++-- sklearn/metrics/tests/test_common.py | 2 -- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 41b68a1dc0e7b..eb2c7057b4407 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -364,8 +364,9 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): y_score.shape[1] > 2): # validation of the input y_score if not np.allclose(1, y_score.sum(axis=1)): - raise ValueError("Target scores should sum up to 1.0 for all" - "samples.") + raise ValueError( + "Target scores need to be probabilities for multiclass " + "roc_auc, i.e. 
they should sum up to 1.0 over classes.") # do not support partial ROC computation for multiclass if max_fpr is not None and max_fpr != 1.: diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index ff95cfff4a0d6..b7fa5e0cb4b3b 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -252,9 +252,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): METRIC_UNDEFINED_MULTICLASS = { "brier_score_loss", - "roc_auc_score", "micro_roc_auc", - "weighted_roc_auc", "samples_roc_auc", "partial_roc_auc", From 4a0ded6f6cf732cc006be3b28c759cacf53a546a Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 5 Oct 2018 15:15:40 -0400 Subject: [PATCH 15/93] add multiclass roc_auc metrics to scores, more common tests --- sklearn/metrics/scorer.py | 10 ++++++++++ sklearn/metrics/tests/test_common.py | 6 ++++++ sklearn/metrics/tests/test_score_objects.py | 3 ++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 2661a379b4e53..d11fa92888f58 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -476,6 +476,13 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, needs_threshold=True) precision_scorer = make_scorer(precision_score) recall_scorer = make_scorer(recall_score) +roc_auc_ovo_scorer = make_scorer(roc_auc_score, needs_threshold=True, + multiclass='ovo') +roc_auc_weighted_scorer = make_scorer(roc_auc_score, average='weighted', + needs_threshold=True) +roc_auc_ovo_weighted_scorer = make_scorer(roc_auc_score, average='weighted', + multiclass='ovo', + needs_threshold=True) # Score function for probabilistic classification neg_log_loss_scorer = make_scorer(log_loss, greater_is_better=False, @@ -503,6 +510,9 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, neg_mean_squared_error=neg_mean_squared_error_scorer, neg_mean_squared_log_error=neg_mean_squared_log_error_scorer, accuracy=accuracy_scorer, roc_auc=roc_auc_scorer, + roc_auc_ovo=roc_auc_ovo_scorer, + roc_auc_weighted=roc_auc_weighted_scorer, + roc_auc_ovo_weighted=roc_auc_ovo_weighted_scorer, balanced_accuracy=balanced_accuracy_scorer, average_precision=average_precision_scorer, neg_log_loss=neg_log_loss_scorer, diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index b7fa5e0cb4b3b..288d8d5f351c3 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -430,6 +430,12 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): # No Sample weight support METRICS_WITHOUT_SAMPLE_WEIGHT = { "median_absolute_error", + # these allow sample_weights in the multi-label case but not multi-class? + # that seems ... odd? 
+ "roc_auc_score", + "weighted_roc_auc", + "ovo_roc_auc", + "ovo_roc_auc_weighted" } diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 9033a2b2d86ee..f1a9e5644217a 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -53,7 +53,8 @@ 'roc_auc', 'average_precision', 'precision', 'precision_weighted', 'precision_macro', 'precision_micro', 'recall', 'recall_weighted', 'recall_macro', 'recall_micro', - 'neg_log_loss', 'log_loss', 'brier_score_loss'] + 'neg_log_loss', 'log_loss', 'brier_score_loss', + 'roc_auc_weighted', 'roc_auc_ovo_weighted', 'roc_auc_ovo'] # All supervised cluster scorers (They behave like classification metric) CLUSTER_SCORERS = ["adjusted_rand_score", From d59955269011371e1e1475f0ae5969af51ffb39d Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 5 Oct 2018 15:28:21 -0400 Subject: [PATCH 16/93] ovr is same as multilabel --- sklearn/metrics/base.py | 61 +------------------------------------- sklearn/metrics/ranking.py | 7 ++--- 2 files changed, 3 insertions(+), 65 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 287f69ccdc5ae..9bbcde7804ab9 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -19,7 +19,6 @@ from ..utils import check_array, check_consistent_length from ..utils.multiclass import type_of_target -from ..preprocessing import LabelBinarizer def _average_binary_score(binary_metric, y_true, y_score, average, @@ -190,62 +189,4 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): pair_scores[ix] = binary_avg_score return (np.average(pair_scores, weights=prevalence) - if average == "weighted" else np.average(pair_scores)) - - -def _average_multiclass_ovr_score(binary_metric, y_true, y_score, average): - """Uses the binary metric for one-vs-rest multi-class classification, - where the score is computed according to the Provost & Domingos (2001) - definition of the AUC in multi-class settings (when `average` parameter is - set to `weighted`). - - For each class, the ROC curve is generated and the AUC computed. - The output is the average of the individual AUCs weighted by the prevalence - of the classes in the data. - - Parameters - ---------- - y_true : array, shape = [n_samples] - True multiclass labels. - Assumes labels have been recoded to 0 to n_classes. - - y_score : array, shape = [n_samples, n_classes] - Target scores corresponding to probability estimates of a sample - belonging to a particular class. - - average : 'macro' or 'weighted', default='macro' - ``'macro'``: - Calculate metrics for each label, and find their unweighted - mean. This does not take label imbalance into account. Classes - are assumed to be uniformly distributed. - ``'weighted'``: - Calculate metrics for each label, taking into account the - prevalence of the classes in the dataset. - - binary_metric : callable, the binary metric function to use. - Accepts the following as input - y_true_target : array, shape = [n_samples_target] - Some sub-array of y_true for a pair of classes designated - positive and negative in the one-vs-one scheme. 
- y_score_target : array, shape = [n_samples_target] - Scores corresponding to the probability estimates - of a sample belonging to the designated positive class label - - Returns - ------- - score : float - Average of binary metric scores - """ - n_classes = len(np.unique(y_true)) - scores = np.zeros((n_classes,)) - - y_true_multilabel = LabelBinarizer().fit_transform(y_true) - prevalence = np.sum(y_true_multilabel, axis=0) / y_true_multilabel.shape[0] - - for c in range(n_classes): - y_true_c = y_true_multilabel.take([c], axis=1).ravel() - y_score_c = y_score.take([c], axis=1).ravel() - scores[c] = binary_metric(y_true_c, y_score_c) - - return (np.average(scores, weights=prevalence) - if average == "weighted" else np.average(scores)) + if average == "weighted" else np.average(pair_scores)) \ No newline at end of file diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index eb2c7057b4407..19cfecc37a236 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -396,11 +396,8 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): # Hand & Till (2001) implementation return _average_multiclass_ovo_score( _binary_roc_auc_score, y_true, y_score, average) - elif multiclass == "ovr" and average == "weighted": - # Provost & Domingos (2001) implementation - return _average_multiclass_ovr_score( - _binary_roc_auc_score, y_true, y_score, average) else: + # ovr is same as multi-label y_true = y_true.reshape((-1, 1)) y_true_multilabel = LabelBinarizer().fit_transform(y_true) return _average_binary_score( @@ -933,7 +930,7 @@ def label_ranking_loss(y_true, y_score, sample_weight=None): unique_inverse[y_true.indices[start:stop]], minlength=len(unique_scores)) all_at_reversed_rank = np.bincount(unique_inverse, - minlength=len(unique_scores)) + minlength=len(unique_scores)) false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank # if the scores are ordered, it's possible to count the number of From 2cc343a9bf1c0a73ce9c0a5bc1cce1b5503c9e91 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 5 Oct 2018 15:29:31 -0400 Subject: [PATCH 17/93] remove non-existant import --- sklearn/metrics/ranking.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 19cfecc37a236..a552a7f18d409 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -35,8 +35,7 @@ from ..exceptions import UndefinedMetricWarning from ..preprocessing import LabelBinarizer, label_binarize -from .base import _average_binary_score, _average_multiclass_ovo_score, \ - _average_multiclass_ovr_score +from .base import _average_binary_score, _average_multiclass_ovo_score def auc(x, y, reorder='deprecated'): From c91a9bd4180b1ec47efc9364037acba83963ec4f Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 12 Dec 2018 21:36:43 -0500 Subject: [PATCH 18/93] RFC: Removes unrelated diffs --- sklearn/metrics/base.py | 5 +- sklearn/metrics/ranking.py | 5 +- sklearn/metrics/tests/test_ranking.py | 131 -------------------------- 3 files changed, 4 insertions(+), 137 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 9bbcde7804ab9..95774c9045114 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -34,8 +34,7 @@ def _average_binary_score(binary_metric, y_true, y_score, average, Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions. 
- average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, - default 'macro' + average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -189,4 +188,4 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): pair_scores[ix] = binary_avg_score return (np.average(pair_scores, weights=prevalence) - if average == "weighted" else np.average(pair_scores)) \ No newline at end of file + if average == "weighted" else np.average(pair_scores)) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 20c4d4b7eab6b..6cc8b75e3b22a 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -159,8 +159,7 @@ def average_precision_score(y_true, y_score, average="macro", pos_label=1, class, confidence values, or non-thresholded measure of decisions (as returned by "decision_function" on some classifiers). - average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, - default 'macro' + average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -934,7 +933,7 @@ def label_ranking_loss(y_true, y_score, sample_weight=None): unique_inverse[y_true.indices[start:stop]], minlength=len(unique_scores)) all_at_reversed_rank = np.bincount(unique_inverse, - minlength=len(unique_scores)) + minlength=len(unique_scores)) false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank # if the scores are ordered, it's possible to count the number of diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 328c26521a9fd..8ef585a436233 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -471,136 +471,6 @@ def test_deprecated_auc_reorder(): [1, 2], [2, 3], reorder=True) -def test_multi_ovo_auc_toydata(): - # Tests the one-vs-one multiclass ROC AUC algorithm - # on a small example, representative of an expected use case. - y_true = np.array([0, 1, 0, 2]) - n_labels = len(np.unique(y_true)) - y_scores = np.array( - [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) - - # Used to compute the expected output. - # Consider labels 0 and 1: - # positive label is 0, negative label is 1 - score_01 = roc_auc_score([1, 0, 1], [0.1, 0.3, 0.35]) - # positive label is 1, negative label is 0 - score_10 = roc_auc_score([0, 1, 0], [0.8, 0.4, 0.5]) - average_score_01 = (score_01 + score_10) / 2. - - # Consider labels 0 and 2: - score_02 = roc_auc_score([1, 1, 0], [0.1, 0.35, 0]) - score_20 = roc_auc_score([0, 0, 1], [0.1, 0.15, 0.8]) - average_score_02 = (score_02 + score_20) / 2. - - # Consider labels 1 and 2: - score_12 = roc_auc_score([1, 0], [0.4, 0.2]) - score_21 = roc_auc_score([0, 1], [0.3, 0.8]) - average_score_12 = (score_12 + score_21) / 2. - - # Unweighted, one-vs-one multiclass ROC AUC algorithm - sum_avg_scores = average_score_01 + average_score_02 + average_score_12 - ovo_unweighted_coefficient = 2. / (n_labels * (n_labels - 1)) - ovo_unweighted_score = ovo_unweighted_coefficient * sum_avg_scores - assert_almost_equal( - roc_auc_score(y_true, y_scores, multiclass="ovo"), - ovo_unweighted_score) - - # Weighted, one-vs-one multiclass ROC AUC algorithm - # Each term is weighted by the prevalence for the positive label. 
- pair_scores = [average_score_01, average_score_02, average_score_12] - prevalence = [0.75, 0.75, 0.50] - ovo_weighted_score = np.average(pair_scores, weights=prevalence) - assert_almost_equal( - roc_auc_score(y_true, y_scores, multiclass="ovo", average="weighted"), - ovo_weighted_score) - - -def test_multi_ovr_auc_toydata(): - # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm - # on a small example, representative of an expected use case. - y_true = np.array([0, 1, 2, 2]) - y_scores = np.array( - [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]]) - # Compute the expected result by individually computing the 'one-vs-rest' - # ROC AUC scores for classes 0, 1, and 2. - out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0]) - out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1]) - out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2]) - result_unweighted = (out_0 + out_1 + out_2) / 3. - - assert_almost_equal( - roc_auc_score(y_true, y_scores, multiclass="ovr"), - result_unweighted) - - # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm - # on the same input (Provost & Domingos, 2001) - result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 - assert_almost_equal( - roc_auc_score(y_true, y_scores, multiclass="ovr", average="weighted"), - result_weighted) - - -def test_multi_auc_score_under_permutation(): - y_score = np.random.rand(100, 3) - # Normalize the scores for each row - row_sums = y_score.sum(axis=1) - y_score = y_score / row_sums[:, np.newaxis] - # Generate the true labels - y_true = np.argmax(y_score, axis=1) - y_true[np.random.randint(len(y_score), size=20)] = np.random.randint( - 2, size=20) - for multiclass in ['ovr', 'ovo']: - for average in ['macro', 'weighted']: - same_score_under_permutation = None - for perm in [[0, 1, 2], [0, 2, 1], [1, 0, 2], - [1, 2, 0], [2, 0, 1], [2, 1, 0]]: - inv_perm = np.zeros(3, dtype=int) - inv_perm[perm] = np.arange(3) - y_score_perm = y_score[:, inv_perm] - y_true_perm = np.take(perm, y_true) - score = roc_auc_score(y_true_perm, y_score_perm, - multiclass=multiclass, average=average) - if same_score_under_permutation is None: - same_score_under_permutation = score - else: - assert_almost_equal(score, same_score_under_permutation) - - -def test_auc_score_multi_error(): - # Test that roc_auc_score function returns an error when trying - # to compute multiclass AUC for parameters where an output - # is not defined. - rng = check_random_state(404) - y_pred = rng.rand(10, 3) - row_sums = y_pred.sum(axis=1) - y_pred = y_pred / row_sums[:, np.newaxis] - y_true = rng.randint(0, 3, size=10) - average_error_msg = ("Parameter 'average' must be one of " - "('macro', 'weighted') for multiclass problems.") - assert_raise_message(ValueError, average_error_msg, - roc_auc_score, y_true, y_pred, average="samples") - assert_raise_message(ValueError, average_error_msg, - roc_auc_score, y_true, y_pred, average="micro") - multiclass_error_msg = ("Parameter multiclass='invalid' is not " - "supported for multiclass ROC AUC. 'multiclass' " - "must be one of ('ovo', 'ovr').") - assert_raise_message(ValueError, multiclass_error_msg, - roc_auc_score, y_true, y_pred, multiclass="invalid") - sample_weight_error_msg = ("Parameter 'sample_weight' is not supported " - "for multiclass one-vs-one ROC AUC. 
" - "'sample_weight' must be None in this case.") - assert_raise_message(ValueError, sample_weight_error_msg, - roc_auc_score, y_true, y_pred, - multiclass="ovo", sample_weight=[]) - partial_comp_error_msg = ("Partial AUC computation not available in " - "multiclass setting. Parameter 'max_fpr' must " - "be set to `None`. Received `max_fpr=0.5` " - "instead.") - assert_raise_message(ValueError, partial_comp_error_msg, - roc_auc_score, y_true, y_pred, - multiclass="ovo", max_fpr=0.5) - - def test_auc_score_non_binary_class(): # Test that roc_auc_score function returns an error when trying # to compute AUC for non-binary class values. @@ -641,7 +511,6 @@ def test_binary_clf_curve(): assert_raise_message(ValueError, msg, precision_recall_curve, y_true, y_pred) - def test_precision_recall_curve(): y_true, _, probas_pred = make_prediction(binary=True) _test_precision_recall_curve(y_true, probas_pred) From e4d24434ec5909828e1e1810f74ec777ede6a424 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 12 Dec 2018 21:40:49 -0500 Subject: [PATCH 19/93] ENH: Optimizes ovo --- sklearn/metrics/base.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 95774c9045114..fc7318c3eb880 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -166,26 +166,28 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): """ n_classes = len(np.unique(y_true)) n_pairs = n_classes * (n_classes - 1) // 2 - prevalence = np.empty(n_pairs) pair_scores = np.empty(n_pairs) + is_weighted = average == "weighted" + if is_weighted: + prevalence = np.empty(n_pairs) + for ix, (a, b) in enumerate(itertools.combinations(range(n_classes), 2)): a_mask = y_true == a ab_mask = np.logical_or(a_mask, y_true == b) - prevalence[ix] = np.sum(ab_mask) / len(y_true) + if is_weighted: + prevalence[ix] = np.sum(ab_mask) / len(y_true) y_score_filtered = y_score[ab_mask] a_true = a_mask[ab_mask] b_true = np.logical_not(a_true) - a_true_score = binary_metric( - a_true, y_score_filtered[:, a]) - b_true_score = binary_metric( - b_true, y_score_filtered[:, b]) + a_true_score = binary_metric(a_true, y_score_filtered[:, a]) + b_true_score = binary_metric(b_true, y_score_filtered[:, b]) binary_avg_score = (a_true_score + b_true_score) / 2 pair_scores[ix] = binary_avg_score return (np.average(pair_scores, weights=prevalence) - if average == "weighted" else np.average(pair_scores)) + if is_weighted else np.average(pair_scores)) From 0f5a0888d32ed6112e30e259824b534d687abe16 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 13 Dec 2018 11:25:36 -0500 Subject: [PATCH 20/93] WIP: Adds tests back --- sklearn/metrics/tests/test_ranking.py | 130 ++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 8ef585a436233..ce09cbf0df97b 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -471,6 +471,136 @@ def test_deprecated_auc_reorder(): [1, 2], [2, 3], reorder=True) +def test_multi_ovo_auc_toydata(): + # Tests the one-vs-one multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_true = np.array([0, 1, 0, 2]) + n_labels = len(np.unique(y_true)) + y_scores = np.array( + [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) + + # Used to compute the expected output. 
+ # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1], [0.1, 0.3, 0.35]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0], [0.8, 0.4, 0.5]) + average_score_01 = (score_01 + score_10) / 2. + + # Consider labels 0 and 2: + score_02 = roc_auc_score([1, 1, 0], [0.1, 0.35, 0]) + score_20 = roc_auc_score([0, 0, 1], [0.1, 0.15, 0.8]) + average_score_02 = (score_02 + score_20) / 2. + + # Consider labels 1 and 2: + score_12 = roc_auc_score([1, 0], [0.4, 0.2]) + score_21 = roc_auc_score([0, 1], [0.3, 0.8]) + average_score_12 = (score_12 + score_21) / 2. + + # Unweighted, one-vs-one multiclass ROC AUC algorithm + sum_avg_scores = average_score_01 + average_score_02 + average_score_12 + ovo_unweighted_coefficient = 2. / (n_labels * (n_labels - 1)) + ovo_unweighted_score = ovo_unweighted_coefficient * sum_avg_scores + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovo"), + ovo_unweighted_score) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + # Each term is weighted by the prevalence for the positive label. + pair_scores = [average_score_01, average_score_02, average_score_12] + prevalence = [0.75, 0.75, 0.50] + ovo_weighted_score = np.average(pair_scores, weights=prevalence) + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovo", average="weighted"), + ovo_weighted_score) + + +def test_multi_ovr_auc_toydata(): + # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_true = np.array([0, 1, 2, 2]) + y_scores = np.array( + [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]]) + # Compute the expected result by individually computing the 'one-vs-rest' + # ROC AUC scores for classes 0, 1, and 2. + out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0]) + out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1]) + out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2]) + result_unweighted = (out_0 + out_1 + out_2) / 3. 
+ + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovr"), + result_unweighted) + + # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm + # on the same input (Provost & Domingos, 2001) + result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovr", average="weighted"), + result_weighted) + + +def test_multi_auc_score_under_permutation(): + y_score = np.random.rand(100, 3) + # Normalize the scores for each row + row_sums = y_score.sum(axis=1) + y_score = y_score / row_sums[:, np.newaxis] + # Generate the true labels + y_true = np.argmax(y_score, axis=1) + y_true[np.random.randint(len(y_score), size=20)] = np.random.randint( + 2, size=20) + for multiclass in ['ovr', 'ovo']: + for average in ['macro', 'weighted']: + same_score_under_permutation = None + for perm in [[0, 1, 2], [0, 2, 1], [1, 0, 2], + [1, 2, 0], [2, 0, 1], [2, 1, 0]]: + inv_perm = np.zeros(3, dtype=int) + inv_perm[perm] = np.arange(3) + y_score_perm = y_score[:, inv_perm] + y_true_perm = np.take(perm, y_true) + score = roc_auc_score(y_true_perm, y_score_perm, + multiclass=multiclass, average=average) + if same_score_under_permutation is None: + same_score_under_permutation = score + else: + assert_almost_equal(score, same_score_under_permutation) + + +def test_auc_score_multi_error(): + # Test that roc_auc_score function returns an error when trying + # to compute multiclass AUC for parameters where an output + # is not defined. + rng = check_random_state(404) + y_pred = rng.rand(10, 3) + row_sums = y_pred.sum(axis=1) + y_pred = y_pred / row_sums[:, np.newaxis] + y_true = rng.randint(0, 3, size=10) + average_error_msg = ("Parameter 'average' must be one of " + "('macro', 'weighted') for multiclass problems.") + assert_raise_message(ValueError, average_error_msg, + roc_auc_score, y_true, y_pred, average="samples") + assert_raise_message(ValueError, average_error_msg, + roc_auc_score, y_true, y_pred, average="micro") + multiclass_error_msg = ("Parameter multiclass='invalid' is not " + "supported for multiclass ROC AUC. 'multiclass' " + "must be one of ('ovo', 'ovr').") + assert_raise_message(ValueError, multiclass_error_msg, + roc_auc_score, y_true, y_pred, multiclass="invalid") + sample_weight_error_msg = ("Parameter 'sample_weight' is not supported " + "for multiclass one-vs-one ROC AUC. " + "'sample_weight' must be None in this case.") + assert_raise_message(ValueError, sample_weight_error_msg, + roc_auc_score, y_true, y_pred, + multiclass="ovo", sample_weight=[]) + partial_comp_error_msg = ("Partial AUC computation not available in " + "multiclass setting. Parameter 'max_fpr' must " + "be set to `None`. Received `max_fpr=0.5` " + "instead.") + assert_raise_message(ValueError, partial_comp_error_msg, + roc_auc_score, y_true, y_pred, + multiclass="ovo", max_fpr=0.5) + + def test_auc_score_non_binary_class(): # Test that roc_auc_score function returns an error when trying # to compute AUC for non-binary class values. 
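A minimal usage sketch of the multiclass API exercised by the tests restored above, assuming a scikit-learn build that includes these patches; the labels and probabilities below are made up for illustration and are not taken from the test suite:

    import numpy as np
    from sklearn.metrics import roc_auc_score

    y_true = np.array([0, 1, 2, 2])
    # The multiclass path requires probability scores: each row sums to 1.0.
    y_prob = np.array([[0.7, 0.2, 0.1],
                       [0.2, 0.6, 0.2],
                       [0.1, 0.2, 0.7],
                       [0.2, 0.2, 0.6]])

    roc_auc_score(y_true, y_prob, multiclass="ovo", average="macro")     # one-vs-one
    roc_auc_score(y_true, y_prob, multiclass="ovr", average="weighted")  # one-vs-rest

Per the validation added earlier in the series, multiclass input only supports 'macro' and 'weighted' averaging and rejects partial AUC (max_fpr).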
From 1de433385496d2cd9dcd819053347ba0361cabca Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 13 Dec 2018 11:27:12 -0500 Subject: [PATCH 21/93] WIP: ovr supports sample_weigth --- sklearn/metrics/ranking.py | 10 ++++------ sklearn/metrics/tests/test_common.py | 5 ++--- sklearn/metrics/tests/test_ranking.py | 1 + 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 6cc8b75e3b22a..32fb1b88eab9b 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -389,13 +389,11 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): " for multiclass ROC AUC. 'multiclass' must be" " one of {1}.".format( multiclass, multiclass_options)) - if sample_weight is not None: - # TODO: check if only in ovo case, if yes, do not raise when ovr - raise ValueError("Parameter 'sample_weight' is not supported" - " for multiclass one-vs-one ROC AUC." - " 'sample_weight' must be None in this case.") - if multiclass == "ovo": + if sample_weight is not None: + raise ValueError("Parameter 'sample_weight' is not supported" + " for multiclass one-vs-one ROC AUC." + " 'sample_weight' must be None in this case.") # Hand & Till (2001) implementation return _average_multiclass_ovo_score( _binary_roc_auc_score, y_true, y_score, average) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 73e9e1967478e..c711344dafe1f 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -441,8 +441,6 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): METRICS_WITHOUT_SAMPLE_WEIGHT = { "median_absolute_error", "max_error", - "roc_auc_score", - "weighted_roc_auc", "ovo_roc_auc", "ovo_roc_auc_weighted" } @@ -1155,7 +1153,8 @@ def test_multiclass_sample_weight_invariance(name): y_score = random_state.random_sample(size=(n_samples, 5)) metric = ALL_METRICS[name] if name in THRESHOLDED_METRICS: - check_sample_weight_invariance(name, metric, y_true, y_score) + y_score_norm = y_score / y_score.sum(1, keepdims=True) + check_sample_weight_invariance(name, metric, y_true, y_score_norm) else: check_sample_weight_invariance(name, metric, y_true, y_pred) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index ce09cbf0df97b..9325b7f51acf8 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -828,6 +828,7 @@ def test_score_scale_invariance(): # issue #3864 (and others), where overly aggressive rounding was causing # problems for users with very small y_score values y_true, _, probas_pred = make_prediction(binary=True) + roc_auc = roc_auc_score(y_true, probas_pred) roc_auc_scaled_up = roc_auc_score(y_true, 100 * probas_pred) roc_auc_scaled_down = roc_auc_score(y_true, 1e-6 * probas_pred) From e169e0de17e289912c288541cc19a006b2b7e933 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 13 Dec 2018 11:40:36 -0500 Subject: [PATCH 22/93] RFC: Rename with weighted prefix --- sklearn/metrics/tests/test_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index c711344dafe1f..0062e077419e8 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -211,7 +211,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "samples_roc_auc": partial(roc_auc_score, average="samples"), "micro_roc_auc": partial(roc_auc_score, average="micro"), 
"ovo_roc_auc": partial(roc_auc_score, average="macro", multiclass='ovo'), - "ovo_roc_auc_weighted": partial(roc_auc_score, average="weighted", + "weighted_ovo_roc_auc": partial(roc_auc_score, average="weighted", multiclass='ovo'), "partial_roc_auc": partial(roc_auc_score, max_fpr=0.5), @@ -442,7 +442,7 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "median_absolute_error", "max_error", "ovo_roc_auc", - "ovo_roc_auc_weighted" + "weighted_ovo_roc_auc" } From 95a117c9dc441d95b46bc7643d05e3f072662406 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 13 Dec 2018 12:30:37 -0500 Subject: [PATCH 23/93] RFC: Moves permutation test to common --- sklearn/metrics/base.py | 4 ++-- sklearn/metrics/tests/test_common.py | 29 +++++++++++++++++++++++++++ sklearn/metrics/tests/test_ranking.py | 26 ------------------------ 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index fc7318c3eb880..988459145dd06 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -13,7 +13,7 @@ # License: BSD 3 clause from __future__ import division -import itertools +from itertools import combinations import numpy as np @@ -172,7 +172,7 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): if is_weighted: prevalence = np.empty(n_pairs) - for ix, (a, b) in enumerate(itertools.combinations(range(n_classes), 2)): + for ix, (a, b) in enumerate(combinations(range(n_classes), 2)): a_mask = y_true == a ab_mask = np.logical_or(a_mask, y_true == b) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 0062e077419e8..310c89b7337a0 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -3,6 +3,7 @@ from functools import partial from itertools import product from itertools import chain +from itertools import permutations import numpy as np import scipy.sparse as sp @@ -17,6 +18,7 @@ from sklearn.utils import shuffle from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_less from sklearn.utils.testing import assert_equal @@ -1205,3 +1207,30 @@ def test_no_averaging_labels(): score_labels = metric(y_true, y_pred, labels=labels, average=None) score = metric(y_true, y_pred, average=None) assert_array_equal(score_labels, score[inverse_labels]) + + +@pytest.mark.parametrize( + 'name', + set(ALL_METRICS) - set(CLASSIFICATION_METRICS) - set(REGRESSION_METRICS) - + METRIC_UNDEFINED_BINARY_MULTICLASS) +def test_multiclass_score_permutation_invariance(name): + y_score = np.random.rand(100, 3) + y_score = y_score / y_score.sum(axis=1, keepdims=True) + y_true = np.argmax(y_score, axis=1) + y_true[np.random.randint(len(y_score), size=20)] = np.random.randint( + 2, size=20) + + metric = ALL_METRICS[name] + current_score = None + for perm in permutations(range(3), 3): + perm = list(perm) + inv_perm = np.zeros(3, dtype=int) + inv_perm[perm] = np.arange(3) + y_score_perm = y_score[:, inv_perm] + y_true_perm = np.take(perm, y_true) + + score = metric(y_true_perm, y_score_perm) + if current_score is None: + current_score = score + else: + assert_almost_equal(score, current_score) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 9325b7f51acf8..dfb7f01b29183 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -540,32 
+540,6 @@ def test_multi_ovr_auc_toydata(): result_weighted) -def test_multi_auc_score_under_permutation(): - y_score = np.random.rand(100, 3) - # Normalize the scores for each row - row_sums = y_score.sum(axis=1) - y_score = y_score / row_sums[:, np.newaxis] - # Generate the true labels - y_true = np.argmax(y_score, axis=1) - y_true[np.random.randint(len(y_score), size=20)] = np.random.randint( - 2, size=20) - for multiclass in ['ovr', 'ovo']: - for average in ['macro', 'weighted']: - same_score_under_permutation = None - for perm in [[0, 1, 2], [0, 2, 1], [1, 0, 2], - [1, 2, 0], [2, 0, 1], [2, 1, 0]]: - inv_perm = np.zeros(3, dtype=int) - inv_perm[perm] = np.arange(3) - y_score_perm = y_score[:, inv_perm] - y_true_perm = np.take(perm, y_true) - score = roc_auc_score(y_true_perm, y_score_perm, - multiclass=multiclass, average=average) - if same_score_under_permutation is None: - same_score_under_permutation = score - else: - assert_almost_equal(score, same_score_under_permutation) - - def test_auc_score_multi_error(): # Test that roc_auc_score function returns an error when trying # to compute multiclass AUC for parameters where an output From 01ba344b606f130b93bd6a3ec64b371acbf7545a Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 13 Dec 2018 12:48:54 -0500 Subject: [PATCH 24/93] RFC: Uses pytest parameters --- sklearn/metrics/tests/test_ranking.py | 44 +++++++++++---------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index dfb7f01b29183..0de12657d5898 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -540,39 +540,29 @@ def test_multi_ovr_auc_toydata(): result_weighted) -def test_auc_score_multi_error(): +@pytest.mark.parametrize("msg, kwargs", [ + (("Parameter 'average' must be one of ('macro', 'weighted') for " + "multiclass problems."), {"average": "samples"}), + (("Parameter 'average' must be one of ('macro', 'weighted') for " + "multiclass problems."), {"average": "micro"}), + (("Parameter 'sample_weight' is not supported for multiclass one-vs-one " + "ROC AUC. 'sample_weight' must be None in this case."), + {"multiclass": "ovo", "sample_weight": []}), + (("Partial AUC computation not available in multiclass setting. " + "Parameter 'max_fpr' must be set to `None`. Received `max_fpr=0.5` " + "instead."), {"multiclass": "ovo", "max_fpr": 0.5}) +]) +def test_auc_score_multi_error(msg, kwargs): # Test that roc_auc_score function returns an error when trying # to compute multiclass AUC for parameters where an output # is not defined. rng = check_random_state(404) y_pred = rng.rand(10, 3) - row_sums = y_pred.sum(axis=1) - y_pred = y_pred / row_sums[:, np.newaxis] + y_pred = y_pred / y_pred.sum(axis=1, keepdims=True) y_true = rng.randint(0, 3, size=10) - average_error_msg = ("Parameter 'average' must be one of " - "('macro', 'weighted') for multiclass problems.") - assert_raise_message(ValueError, average_error_msg, - roc_auc_score, y_true, y_pred, average="samples") - assert_raise_message(ValueError, average_error_msg, - roc_auc_score, y_true, y_pred, average="micro") - multiclass_error_msg = ("Parameter multiclass='invalid' is not " - "supported for multiclass ROC AUC. 
'multiclass' " - "must be one of ('ovo', 'ovr').") - assert_raise_message(ValueError, multiclass_error_msg, - roc_auc_score, y_true, y_pred, multiclass="invalid") - sample_weight_error_msg = ("Parameter 'sample_weight' is not supported " - "for multiclass one-vs-one ROC AUC. " - "'sample_weight' must be None in this case.") - assert_raise_message(ValueError, sample_weight_error_msg, - roc_auc_score, y_true, y_pred, - multiclass="ovo", sample_weight=[]) - partial_comp_error_msg = ("Partial AUC computation not available in " - "multiclass setting. Parameter 'max_fpr' must " - "be set to `None`. Received `max_fpr=0.5` " - "instead.") - assert_raise_message(ValueError, partial_comp_error_msg, - roc_auc_score, y_true, y_pred, - multiclass="ovo", max_fpr=0.5) + with pytest.raises(ValueError) as exc_info: + roc_auc_score(y_true, y_pred, **kwargs) + assert str(exc_info.value) == msg def test_auc_score_non_binary_class(): From 67f2376a61098a81b10eda04bd7426ddfe751e4e Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 13 Dec 2018 12:53:46 -0500 Subject: [PATCH 25/93] RFC: Minimizes diffs --- sklearn/metrics/ranking.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 32fb1b88eab9b..18db819e64a84 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -243,22 +243,27 @@ def _binary_uninterpolated_average_precision( def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", sample_weight=None, max_fpr=None): - """Compute Area Under the Curve (AUC) from prediction scores. + """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) + from prediction scores. + + Note: this implementation is restricted to the binary classification task + or multilabel classification task in label indicator format. Read more in the :ref:`User Guide `. Parameters ---------- y_true : array, shape = [n_samples] or [n_samples, n_classes] - True binary labels in binary label indicators. + True binary labels or binary label indicators. The multiclass case expects shape = [n_samples] and labels with values from 0 to (n_classes-1), inclusive. y_score : array, shape = [n_samples] or [n_samples, n_classes] Target scores, can either be probability estimates of the positive class, confidence values, or non-thresholded measure of decisions - (as returned by "decision_function" on some classifiers). - The multiclass case expects shape = [n_samples, n_classes] + (as returned by "decision_function" on some classifiers). For binary + y_true, y_score is supposed to be the score of the class with greater + label. The multiclass case expects shape = [n_samples, n_classes] where the scores correspond to probability estimates. multiclass : string, 'ovr' or 'ovo', default 'ovr' @@ -272,8 +277,7 @@ def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", Calculate metrics for the multiclass case using the one-vs-one approach. - average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, - default 'macro' + average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] If ``None``, the scores for each class are returned. 
Otherwise, this determines the type of averaging performed on the data: From a4ea7a6c2920b7c6cd4d3f4291bf96fcd00eefc9 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 13 Dec 2018 15:46:31 -0500 Subject: [PATCH 26/93] DOC: Adds narative --- doc/modules/model_evaluation.rst | 54 +++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index ce842033348e2..941a73c774507 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -313,6 +313,7 @@ Some also work in the multilabel case: precision_recall_fscore_support precision_score recall_score + roc_auc_score zero_one_loss And some work with binary and multilabel (but not multiclass) problems: @@ -321,7 +322,6 @@ And some work with binary and multilabel (but not multiclass) problems: :template: function.rst average_precision_score - roc_auc_score In the following sub-sections, we will describe each of those functions, @@ -1287,9 +1287,48 @@ In multi-label classification, the :func:`roc_auc_score` function is extended by averaging over the labels as :ref:`above `. Compared to metrics such as the subset accuracy, the Hamming loss, or the -F1 score, ROC doesn't require optimizing a threshold for each label. The -:func:`roc_auc_score` function can also be used in multi-class classification, -if the predicted outputs have been binarized. +F1 score, ROC doesn't require optimizing a threshold for each label. + +The :func:`roc_auc_score` function can also be used in mult-class +classification. Two averaging strategies are currently supported: the +one-vs-one algorithm computes the average of the pairwise ROC AUC scores, and +the one-vs-rest algorithm computes the average of the ROC AUC scores for each +class against all other classes. In both cases, the predicted labels are +provided in an array with values from 0 to ``n_classes``, and the scores +correspond to the probability estimates that a sample belongs to a particular +class. + +**One-vs-one Algorithm**: Computes the AUC of all possible pairwise +combinations of classes. [HT2001]_ defines a multiclass AUC metric weighted +uniformly: + +.. math:: + + \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k \neq j}^c (\textnormal{AUC}(j | k) + + \textnormal{AUC}(k | j)) + +where :math:`c` is the number of classes and ``\textnormal{AUC}(j | k)`` is the +auc with class :math:`j` as the positive class and class :math:`k` as the +negative class. In general, +:math:`\textnormal{AUC}(j | k) \neq \textnormal{AUC}(k | j))` in the multiclass +case. This algorithm is used by setting the keyword argument ``multiclass`` +to ``'ovo'`` and ``average`` to ``'macro'``. + +The [HT2001]_ multiclass AUC metric can be extended to be weighted by the +prevalence: + +.. math:: + + \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k \neq j}^c p(j \cup k)( + \textnormal{AUC}(j | k) + \textnormal{AUC}(k | j)) + +where :math:`c` is the number of classes. This algorithm is used by setting +the keyword argument ``multiclass`` to ``'ovo'`` and ``average`` to +``'weighted'``. + +**One-vs-rest Algorithm**: Computes the AUC of each class against the rest. +The algorithm is functionally the same as the multilabel case. To enable this +algorithm set the keyword argument ``multiclass`` to ``'ovr'``. In applications where a high false positive rate is not tolerable the parameter ``max_fpr`` of :func:`roc_auc_score` can be used to summarize the ROC curve up @@ -1315,6 +1354,13 @@ to the given limit. 
for an example of using ROC to model species distribution. +.. topic:: References: + + .. [HT2001] Hand, D.J. and Till, R.J., 2001. `A simple generalisation + of the area under the ROC curve for multiple class classification problems. + `_ + Machine learning, 45(2), pp.171-186. + .. _zero_one_loss: Zero one loss From 96f2c2d3aa5d6bf62201db662d48ad92dc14ed47 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 13 Dec 2018 15:46:58 -0500 Subject: [PATCH 27/93] RFC: Lowers line count --- sklearn/metrics/base.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 988459145dd06..ac455f9b9c23e 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -179,15 +179,12 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): if is_weighted: prevalence[ix] = np.sum(ab_mask) / len(y_true) - y_score_filtered = y_score[ab_mask] - a_true = a_mask[ab_mask] b_true = np.logical_not(a_true) - a_true_score = binary_metric(a_true, y_score_filtered[:, a]) - b_true_score = binary_metric(b_true, y_score_filtered[:, b]) - binary_avg_score = (a_true_score + b_true_score) / 2 - pair_scores[ix] = binary_avg_score + a_true_score = binary_metric(a_true, y_score[ab_mask, a]) + b_true_score = binary_metric(b_true, y_score[ab_mask, b]) + pair_scores[ix] = (a_true_score + b_true_score) / 2 return (np.average(pair_scores, weights=prevalence) if is_weighted else np.average(pair_scores)) From 236504dbb086465aa9b69abf85b2a17ac6e1a65d Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 13 Dec 2018 19:24:24 -0500 Subject: [PATCH 28/93] DOC: Fixes latex errors --- doc/modules/model_evaluation.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 941a73c774507..ebc9f737e509f 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1304,13 +1304,13 @@ uniformly: .. math:: - \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k \neq j}^c (\textnormal{AUC}(j | k) + - \textnormal{AUC}(k | j)) + \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k \neq j}^c (\text{AUC}(j | k) + + \text{AUC}(k | j)) -where :math:`c` is the number of classes and ``\textnormal{AUC}(j | k)`` is the +where :math:`c` is the number of classes and :math:`\text{AUC}(j | k)` is the auc with class :math:`j` as the positive class and class :math:`k` as the negative class. In general, -:math:`\textnormal{AUC}(j | k) \neq \textnormal{AUC}(k | j))` in the multiclass +:math:`\text{AUC}(j | k) \neq \text{AUC}(k | j))` in the multiclass case. This algorithm is used by setting the keyword argument ``multiclass`` to ``'ovo'`` and ``average`` to ``'macro'``. @@ -1320,7 +1320,7 @@ prevalence: .. math:: \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k \neq j}^c p(j \cup k)( - \textnormal{AUC}(j | k) + \textnormal{AUC}(k | j)) + \text{AUC}(j | k) + \text{AUC}(k | j)) where :math:`c` is the number of classes. 
This algorithm is used by setting the keyword argument ``multiclass`` to ``'ovo'`` and ``average`` to From 8fbcd354a19478a21a1aa9e792c244aac7b7da1b Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 14 Dec 2018 15:19:17 -0500 Subject: [PATCH 29/93] DOC: Update plot_roc for multiclass --- examples/model_selection/plot_roc.py | 143 ++++++++++++++++++++------- 1 file changed, 105 insertions(+), 38 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 475d7b4aba7a6..dd935c46708a2 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -19,44 +19,32 @@ ------------------- ROC curves are typically used in binary classification to study the output of -a classifier. In order to extend ROC curve and ROC area to multi-class -or multi-label classification, it is necessary to binarize the output. One ROC -curve can be drawn per label, but one can also draw a ROC curve by considering -each element of the label indicator matrix as a binary prediction -(micro-averaging). - -Another evaluation measure for multi-class classification is -macro-averaging, which gives equal weight to the classification of each -label. +a classifier. The ROC curve and ROC area can be extended to multi-class or +multi-label classification by using the One-vs-Rest or One-vs-One scheme. .. note:: See also :func:`sklearn.metrics.roc_auc_score`, :ref:`sphx_glr_auto_examples_model_selection_plot_roc_crossval.py`. - """ -print(__doc__) - -import numpy as np -import matplotlib.pyplot as plt -from itertools import cycle - +############################################################################### +# One-vs-Rest +# ----------- +# The One-vs-Rest scheme compares the each class against all of the other +# classes ("the rest"). +# +# Load iris dataset and train a SVC +# ................................. from sklearn import svm, datasets -from sklearn.metrics import roc_curve, auc from sklearn.model_selection import train_test_split -from sklearn.preprocessing import label_binarize from sklearn.multiclass import OneVsRestClassifier -from scipy import interp +import numpy as np # Import some data to play with iris = datasets.load_iris() X = iris.data y = iris.target -# Binarize the output -y = label_binarize(y, classes=[0, 1, 2]) -n_classes = y.shape[1] - # Add noisy features to make the problem harder random_state = np.random.RandomState(0) n_samples, n_features = X.shape @@ -71,21 +59,44 @@ random_state=random_state)) y_score = classifier.fit(X_train, y_train).decision_function(X_test) +############################################################################### +# Compute the AUC scores +# ...................... +# The ROC area can be approximated by taking the average either weighted +# uniformly or by the priori class distribution. +from sklearn.metrics import roc_auc_score + +y_score_norm = y_score / y_score.sum(1, keepdims=True) +unweighted_roc_auc_ovr = roc_auc_score(y_test, y_score_norm, multiclass="ovr") +weighted_roc_auc_ovr = roc_auc_score( + y_test, y_score_norm, multiclass="ovr", average="weighted") +print("One-vs-Rest ROC AUC scores: {0} (unweighted), {1} (weighted)".format( + unweighted_roc_auc_ovr, weighted_roc_auc_ovr)) + +############################################################################### +# Plotting the ROC curve for virginica +# .................................... 
+import matplotlib.pyplot as plt +from sklearn.preprocessing import label_binarize +from sklearn.metrics import roc_curve, auc + +# Binarize y_test to compute the ROC curve +classes = np.unique(y) +n_classes = len(classes) +y_test_binarized = label_binarize(y_test, classes=classes) + # Compute ROC curve and ROC area for each class fpr = dict() tpr = dict() roc_auc = dict() for i in range(n_classes): - fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) + fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_score[:, i]) roc_auc[i] = auc(fpr[i], tpr[i]) -# Compute micro-average ROC curve and ROC area -fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) +fpr["micro"], tpr["micro"], _ = roc_curve( + y_test_binarized.ravel(), y_score.ravel()) roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) - -############################################################################## -# Plot of a ROC curve for a specific class plt.figure() lw = 2 plt.plot(fpr[2], tpr[2], color='darkorange', @@ -95,15 +106,21 @@ plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') -plt.title('Receiver operating characteristic example') +plt.title('Receiver operating characteristic example for {}' + .format(iris.target_names[2])) plt.legend(loc="lower right") -plt.show() - - -############################################################################## -# Plot ROC curves for the multiclass problem -# Compute macro-average ROC curve and ROC area +############################################################################### +# Plot ROC curves for the multiclass problem using One-vs-Rest +# ............................................................ +# One can draw a ROC curve by considering each element of the label indicator +# matrix as a binary prediction (micro-averaging). +# +# Another evaluation measure for one-vs-rest multi-class classification is +# macro-averaging, which gives equal weight to the classification of each +# label. +from itertools import cycle +from scipy import interp # First aggregate all false positive rates all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)])) @@ -143,6 +160,56 @@ plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') -plt.title('Some extension of Receiver operating characteristic to multi-class') +plt.title('An extension of ROC to multi-class using One-vs-Rest') +plt.legend(loc="lower right") + +############################################################################### +# One-vs-One +# --------------------- +# The One-vs-One scheme compares every pairwise combiantion of classes. +# +# Compute the AUC score +# ..................... +# The ROC area can be approximated by taking the average either weighted +# uniformly or by the priori class distribution. +unweighted_roc_auc_ovo = roc_auc_score(y_test, y_score_norm, multiclass="ovo") +weighted_roc_auc_ovo = roc_auc_score( + y_test, y_score_norm, multiclass="ovo", average="weighted") +print("One-vs-One ROC AUC scores: {0} (unweighted), {1} (weighted)".format( + unweighted_roc_auc_ovo, weighted_roc_auc_ovo)) + +############################################################################### +# Plot ROC curves for the multiclass problem using One-vs-One +# ........................................................... 
+from itertools import combinations + +for a, b in combinations(range(n_classes), 2): + ab_mask = np.logical_or(y_test == a, y_test == b) + + # Compute ROC curve and ROC area with `a` as the positive class + fpr[(a, b)], tpr[(a, b)], _ = roc_curve( + y_test[ab_mask] == a, y_score[ab_mask, a]) + roc_auc[(a, b)] = auc(fpr[(a, b)], tpr[(a, b)]) + + # Compute ROC curve and ROC area with `b` as the positive class + fpr[(b, a)], tpr[(b, a)], _ = roc_curve( + y_test[ab_mask] == b, y_score[ab_mask, b]) + roc_auc[(b, a)] = auc(fpr[(b, a)], tpr[(b, a)]) + +plt.figure() +for a, b in combinations(range(n_classes), 2): + plt.plot( + fpr[(a, b)], tpr[(a, b)], + lw=lw, label='ROC curve: class {0} vs. {1} ' + '(area = {2:0.2f})'.format(a, b, roc_auc[(a, b)])) + plt.plot( + fpr[(b, a)], tpr[(b, a)], + lw=lw, label='ROC curve: class {0} vs. {1} ' + '(area = {2:0.2f})'.format(b, a, roc_auc[(b, a)])) +plt.plot([0, 1], [0, 1], 'k--', lw=lw) +plt.xlim([0.0, 1.0]) +plt.ylim([0.0, 1.05]) +plt.xlabel('False Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('An extension of ROC to multi-class using One-vs-One') plt.legend(loc="lower right") -plt.show() From 5a89add50eaf43ae52ad2885fa6a17110bb8421f Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 14 Dec 2018 15:51:29 -0500 Subject: [PATCH 30/93] DOC: Adds whats new --- doc/whats_new/v0.21.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index a9c553fa06ad6..52e94fcefe422 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -116,6 +116,9 @@ Support for Python 3.4 and below has been officially dropped. :issue:`12334` by :user:`Emmanuel Arias `, `Joel Nothman`_ and `Andreas Müller`_ +- |Feature| Added multiclass support to :func:`metrics.roc_auc_score`. + :issue:`3298` by :user:`Thomas Fan `. + :mod:`sklearn.model_selection` .............................. 
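Both the narrative added to model_evaluation.rst and the new one-vs-one section of plot_roc.py rely on the same pairwise recipe from Hand & Till (2001): restrict the samples to one pair of classes at a time, score each class of the pair as the positive one, average the two AUCs, then average over pairs either uniformly or by pair prevalence. A short sketch of that recipe on the toy data used in the one-vs-one test, written against the established binary ``roc_auc_score`` only::

    from itertools import combinations

    import numpy as np
    from sklearn.metrics import roc_auc_score

    y_true = np.array([0, 1, 0, 2])
    y_scores = np.array([[0.1, 0.8, 0.1],
                         [0.3, 0.4, 0.3],
                         [0.35, 0.5, 0.15],
                         [0.0, 0.2, 0.8]])

    pair_scores, prevalence = [], []
    for a, b in combinations(range(3), 2):
        ab_mask = np.logical_or(y_true == a, y_true == b)
        prevalence.append(ab_mask.mean())
        # AUC(a | b): class a is the positive class, restricted to samples of a or b.
        auc_a = roc_auc_score(y_true[ab_mask] == a, y_scores[ab_mask, a])
        # AUC(b | a): class b is the positive class on the same subset.
        auc_b = roc_auc_score(y_true[ab_mask] == b, y_scores[ab_mask, b])
        pair_scores.append((auc_a + auc_b) / 2)

    macro_ovo = np.mean(pair_scores)                            # average="macro"
    weighted_ovo = np.average(pair_scores, weights=prevalence)  # average="weighted"
    print(macro_ovo, weighted_ovo)

The ``pair_scores`` and ``prevalence`` lists here correspond one-to-one to the arrays built in ``_average_multiclass_ovo_score``.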
From 6ea7aa3546778a7cedfea0f9495b3fc2f65f6adf Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sat, 15 Dec 2018 15:47:54 -0500 Subject: [PATCH 31/93] RFC: Clears up test --- sklearn/metrics/tests/test_common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 310c89b7337a0..136970881258a 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1211,8 +1211,7 @@ def test_no_averaging_labels(): @pytest.mark.parametrize( 'name', - set(ALL_METRICS) - set(CLASSIFICATION_METRICS) - set(REGRESSION_METRICS) - - METRIC_UNDEFINED_BINARY_MULTICLASS) + set(THRESHOLDED_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) def test_multiclass_score_permutation_invariance(name): y_score = np.random.rand(100, 3) y_score = y_score / y_score.sum(axis=1, keepdims=True) From c7e1aa8642b215b849135c6c39a469f976101492 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Mon, 17 Dec 2018 12:29:54 -0500 Subject: [PATCH 32/93] RFC: Small --- sklearn/metrics/tests/test_common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 136970881258a..a20045a544b46 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1222,9 +1222,8 @@ def test_multiclass_score_permutation_invariance(name): metric = ALL_METRICS[name] current_score = None for perm in permutations(range(3), 3): - perm = list(perm) inv_perm = np.zeros(3, dtype=int) - inv_perm[perm] = np.arange(3) + inv_perm[list(perm)] = np.arange(3) y_score_perm = y_score[:, inv_perm] y_true_perm = np.take(perm, y_true) From 75a0d7e967bc8d584f66643abfcf7f1ac02fdac4 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Mon, 17 Dec 2018 13:41:05 -0500 Subject: [PATCH 33/93] RFC: Clears up test --- sklearn/metrics/tests/test_common.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index a20045a544b46..741d05ada3aa6 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1213,22 +1213,22 @@ def test_no_averaging_labels(): 'name', set(THRESHOLDED_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) def test_multiclass_score_permutation_invariance(name): - y_score = np.random.rand(100, 3) + n_samples, n_classes = 100, 3 + random_state = check_random_state(0) + + y_score = random_state.rand(n_samples, n_classes) y_score = y_score / y_score.sum(axis=1, keepdims=True) y_true = np.argmax(y_score, axis=1) - y_true[np.random.randint(len(y_score), size=20)] = np.random.randint( + y_true[np.random.randint(n_samples, size=20)] = np.random.randint( 2, size=20) metric = ALL_METRICS[name] - current_score = None - for perm in permutations(range(3), 3): - inv_perm = np.zeros(3, dtype=int) - inv_perm[list(perm)] = np.arange(3) + current_score = metric(y_true, y_score) + for perm in permutations(range(n_classes), n_classes): + inv_perm = np.zeros(n_classes, dtype=int) + inv_perm[list(perm)] = np.arange(n_classes) y_score_perm = y_score[:, inv_perm] y_true_perm = np.take(perm, y_true) score = metric(y_true_perm, y_score_perm) - if current_score is None: - current_score = score - else: - assert_almost_equal(score, current_score) + assert_almost_equal(current_score, score) From e5290f59565436b8062a3e27c98672364fc3a54d Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Mon, 17 Dec 2018 13:49:05 
-0500 Subject: [PATCH 34/93] RFC: Clears up test --- sklearn/metrics/tests/test_common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 741d05ada3aa6..a06a9afe4f3af 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1223,12 +1223,12 @@ def test_multiclass_score_permutation_invariance(name): 2, size=20) metric = ALL_METRICS[name] - current_score = metric(y_true, y_score) + score = metric(y_true, y_score) for perm in permutations(range(n_classes), n_classes): inv_perm = np.zeros(n_classes, dtype=int) inv_perm[list(perm)] = np.arange(n_classes) y_score_perm = y_score[:, inv_perm] y_true_perm = np.take(perm, y_true) - score = metric(y_true_perm, y_score_perm) - assert_almost_equal(current_score, score) + current_score = metric(y_true_perm, y_score_perm) + assert_almost_equal(score, current_score) From d68c3f0d9f086936f9125a8a0648e29e5e2ddd0b Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Mon, 17 Dec 2018 14:19:10 -0500 Subject: [PATCH 35/93] RFC: Simplifies test --- sklearn/metrics/tests/test_common.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index a06a9afe4f3af..de03afac2b9bf 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1218,9 +1218,7 @@ def test_multiclass_score_permutation_invariance(name): y_score = random_state.rand(n_samples, n_classes) y_score = y_score / y_score.sum(axis=1, keepdims=True) - y_true = np.argmax(y_score, axis=1) - y_true[np.random.randint(n_samples, size=20)] = np.random.randint( - 2, size=20) + y_true = random_state.randint(0, n_classes, size=n_samples) metric = ALL_METRICS[name] score = metric(y_true, y_score) From 3fb95d6e6dbbd9ba3da4925a8603f17bc66715ce Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 20 Dec 2018 11:00:04 -0500 Subject: [PATCH 36/93] TST: Adds ValueError test --- sklearn/metrics/tests/test_ranking.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 0de12657d5898..0f1ac37251fba 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -550,7 +550,9 @@ def test_multi_ovr_auc_toydata(): {"multiclass": "ovo", "sample_weight": []}), (("Partial AUC computation not available in multiclass setting. " "Parameter 'max_fpr' must be set to `None`. Received `max_fpr=0.5` " - "instead."), {"multiclass": "ovo", "max_fpr": 0.5}) + "instead."), {"multiclass": "ovo", "max_fpr": 0.5}), + (("Parameter multiclass='ovp' is not supported for multiclass ROC AUC. 
" + "'multiclass' must be one of ('ovo', 'ovr')."), {"multiclass": "ovp"}) ]) def test_auc_score_multi_error(msg, kwargs): # Test that roc_auc_score function returns an error when trying From 9f3477689308ebc9cf72be00aa73306583fb7c65 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 21 Dec 2018 16:54:36 -0500 Subject: [PATCH 37/93] DOC: Show plots --- examples/model_selection/plot_roc.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index dd935c46708a2..586919943ca97 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -109,6 +109,7 @@ plt.title('Receiver operating characteristic example for {}' .format(iris.target_names[2])) plt.legend(loc="lower right") +plt.show() ############################################################################### # Plot ROC curves for the multiclass problem using One-vs-Rest @@ -162,6 +163,7 @@ plt.ylabel('True Positive Rate') plt.title('An extension of ROC to multi-class using One-vs-Rest') plt.legend(loc="lower right") +plt.show() ############################################################################### # One-vs-One @@ -213,3 +215,4 @@ plt.ylabel('True Positive Rate') plt.title('An extension of ROC to multi-class using One-vs-One') plt.legend(loc="lower right") +plt.show() From 0044f41ea5892fbcf06263600bcb322696d2be9c Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 21 Dec 2018 17:48:02 -0500 Subject: [PATCH 38/93] DOC: Adds names --- doc/whats_new/v0.21.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 60bb74cf26221..4319fc2372078 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -132,7 +132,8 @@ Support for Python 3.4 and below has been officially dropped. Mitrovic `. - |Feature| Added multiclass support to :func:`metrics.roc_auc_score`. - :issue:`3298` by :user:`Thomas Fan `. + :issue:`3298` by :user:`Kathy Chen `, + :user:`Mohamed Maskani `, and :user:`Thomas Fan `. :mod:`sklearn.model_selection` .............................. From a93f06bc9cf8b3c2181bb8ef9ba8c252374c3eb7 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 3 Jan 2019 13:50:31 -0500 Subject: [PATCH 39/93] ENH: Adds support for strings and labels --- sklearn/metrics/base.py | 24 ++++++--- sklearn/metrics/ranking.py | 27 ++++++++-- sklearn/metrics/tests/test_ranking.py | 72 +++++++++++++++++++++++---- 3 files changed, 103 insertions(+), 20 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index ac455f9b9c23e..95f791404121c 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -127,7 +127,8 @@ def _average_binary_score(binary_metric, y_true, y_score, average, return score -def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): +def _average_multiclass_ovo_score( + binary_metric, y_true, y_score, labels=None, average='macro'): """Uses the binary metric for one-vs-one multiclass classification, where the score is computed according to the Hand & Till (2001) algorithm. @@ -135,12 +136,15 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): ---------- y_true : array, shape = [n_samples] True multiclass labels. - Assumes labels have been recoded to 0 to n_classes. 
y_score : array, shape = [n_samples, n_classes] Target scores corresponding to probability estimates of a sample belonging to a particular class + labels : array, shape = [n_classes] or None, optional (default=None) + List of labels to index ``y_score``. If ``None``, + the lexicon order of ``y_true`` is used to index ``y_score``. + average : 'macro' or 'weighted', default='macro' ``'macro'``: Calculate metrics for each label, and find their unweighted @@ -164,7 +168,14 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): score : float Average the sum of pairwise binary metric scores """ - n_classes = len(np.unique(y_true)) + check_consistent_length(y_true, y_score) + + if labels is None: + classes = np.unique(y_true) + else: + classes = np.array(labels) + + n_classes = len(classes) n_pairs = n_classes * (n_classes - 1) // 2 pair_scores = np.empty(n_pairs) @@ -172,7 +183,8 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): if is_weighted: prevalence = np.empty(n_pairs) - for ix, (a, b) in enumerate(combinations(range(n_classes), 2)): + all_combinations = enumerate(combinations(enumerate(classes), 2)) + for ix, ((a_ix, a), (b_ix, b)) in all_combinations: a_mask = y_true == a ab_mask = np.logical_or(a_mask, y_true == b) @@ -182,8 +194,8 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): a_true = a_mask[ab_mask] b_true = np.logical_not(a_true) - a_true_score = binary_metric(a_true, y_score[ab_mask, a]) - b_true_score = binary_metric(b_true, y_score[ab_mask, b]) + a_true_score = binary_metric(a_true, y_score[ab_mask, a_ix]) + b_true_score = binary_metric(b_true, y_score[ab_mask, b_ix]) pair_scores[ix] = (a_true_score + b_true_score) / 2 return (np.average(pair_scores, weights=prevalence) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 18db819e64a84..2e9c5c9afeb17 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -241,7 +241,8 @@ def _binary_uninterpolated_average_precision( average, sample_weight=sample_weight) -def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", +def roc_auc_score(y_true, y_score, labels=None, + multiclass="ovr", average="macro", sample_weight=None, max_fpr=None): """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. @@ -266,7 +267,11 @@ def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", label. The multiclass case expects shape = [n_samples, n_classes] where the scores correspond to probability estimates. - multiclass : string, 'ovr' or 'ovo', default 'ovr' + labels : array, shape = [n_classes] or None, optional (default=None) + List of labels to index ``y_score`` used for multiclass. If ``None``, + the lexicon order of ``y_true`` is used to index ``y_score``. + + multiclass : string, 'ovr' or 'ovo', (default='ovr') Note: multiclass ROC AUC currently only handles the 'macro' and 'weighted' averages. @@ -393,6 +398,17 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): " for multiclass ROC AUC. 
'multiclass' must be" " one of {1}.".format( multiclass, multiclass_options)) + if labels is not None: + classes = np.unique(labels) + if len(classes) != len(labels): + raise ValueError("Parameter 'labels' must be unique") + if len(classes) != y_score.shape[1]: + raise ValueError( + "Parameter 'labels' not equal to the number of columns in " + "'y_score'") + if set(classes) < set(np.unique(y_true)): + raise ValueError( + "'y_true' contains labels not in parameter 'labels'") if multiclass == "ovo": if sample_weight is not None: raise ValueError("Parameter 'sample_weight' is not supported" @@ -400,10 +416,15 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): " 'sample_weight' must be None in this case.") # Hand & Till (2001) implementation return _average_multiclass_ovo_score( - _binary_roc_auc_score, y_true, y_score, average) + _binary_roc_auc_score, y_true, y_score, + labels=labels, average=average) else: # ovr is same as multi-label y_true = y_true.reshape((-1, 1)) + # Order y_true by labels + if labels is not None: + for i, label in enumerate(labels): + y_true[y_true == label] = i y_true_multilabel = LabelBinarizer().fit_transform(y_true) return _average_binary_score( _binary_roc_auc_score, y_true_multilabel, y_score, average, diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 0f1ac37251fba..94624ed98093d 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -471,10 +471,15 @@ def test_deprecated_auc_reorder(): [1, 2], [2, 3], reorder=True) -def test_multi_ovo_auc_toydata(): +@pytest.mark.parametrize( + "y_true, labels", + [(np.array([0, 1, 0, 2]), None), + (["a", "b", "a", "c"], None), + (["c", "b", "c", "a"], ["c", "b", "a"])] +) +def test_multiclass_ovo_roc_auc_toydata(y_true, labels): # Tests the one-vs-one multiclass ROC AUC algorithm # on a small example, representative of an expected use case. - y_true = np.array([0, 1, 0, 2]) n_labels = len(np.unique(y_true)) y_scores = np.array( [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) @@ -502,7 +507,7 @@ def test_multi_ovo_auc_toydata(): ovo_unweighted_coefficient = 2. / (n_labels * (n_labels - 1)) ovo_unweighted_score = ovo_unweighted_coefficient * sum_avg_scores assert_almost_equal( - roc_auc_score(y_true, y_scores, multiclass="ovo"), + roc_auc_score(y_true, y_scores, labels=labels, multiclass="ovo"), ovo_unweighted_score) # Weighted, one-vs-one multiclass ROC AUC algorithm @@ -511,14 +516,22 @@ def test_multi_ovo_auc_toydata(): prevalence = [0.75, 0.75, 0.50] ovo_weighted_score = np.average(pair_scores, weights=prevalence) assert_almost_equal( - roc_auc_score(y_true, y_scores, multiclass="ovo", average="weighted"), - ovo_weighted_score) + roc_auc_score( + y_true, + y_scores, + labels=labels, + multiclass="ovo", + average="weighted"), ovo_weighted_score) -def test_multi_ovr_auc_toydata(): +@pytest.mark.parametrize( + "y_true, labels", + [(np.array([0, 1, 2, 2]), None), + (["a", "b", "c", "c"], None), + (["c", "b", "a", "a"], ["c", "b", "a"])]) +def test_multiclass_ovr_roc_auc_toydata(y_true, labels): # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm # on a small example, representative of an expected use case. 
- y_true = np.array([0, 1, 2, 2]) y_scores = np.array( [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]]) # Compute the expected result by individually computing the 'one-vs-rest' @@ -529,15 +542,52 @@ def test_multi_ovr_auc_toydata(): result_unweighted = (out_0 + out_1 + out_2) / 3. assert_almost_equal( - roc_auc_score(y_true, y_scores, multiclass="ovr"), + roc_auc_score(y_true, y_scores, multiclass="ovr", labels=labels), result_unweighted) # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm # on the same input (Provost & Domingos, 2001) result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 assert_almost_equal( - roc_auc_score(y_true, y_scores, multiclass="ovr", average="weighted"), - result_weighted) + roc_auc_score( + y_true, + y_scores, + multiclass="ovr", + labels=labels, + average="weighted"), result_weighted) + + +@pytest.mark.parametrize( + "msg, y_true, labels", + [("Parameter 'labels' must be unique", np.array([0, 1, 2, 2]), [0, 2, 0]), + ("Parameter 'labels' must be unique", np.array(["a", "b", "c", "c"]), + ["a", "a", "b"]), + ("Parameter 'labels' not equal to the number of columns in 'y_score'", + np.array([0, 1, 2, 2]), [0, 1]), + ("Parameter 'labels' not equal to the number of columns in 'y_score'", + np.array(["a", "b", "c", "c"]), ["a", "b"]), + ("Parameter 'labels' not equal to the number of columns in 'y_score'", + np.array([0, 1, 2, 2]), [0, 1, 2, 3]), + ("Parameter 'labels' not equal to the number of columns in 'y_score'", + np.array(["a", "b", "c", "c"]), ["a", "b", "c", "d"]), + ("'y_true' contains labels not in parameter 'labels'", + np.array(["a", "b", "c", "e"]), ["a", "b", "c"]), + ("'y_true' contains labels not in parameter 'labels'", + np.array([0, 1, 2, 3]), [0, 1, 2]), + ("'y_true' contains labels not in parameter 'labels'", + np.array(["a", "b", "c", "d"]), ["a", "b", "c"]), + ("'y_true' contains labels not in parameter 'labels'", + np.array([0, 1, 2, 3]), [0, 1, 2])]) +@pytest.mark.parametrize("multiclass", ["ovo", "ovr"]) +def test_roc_auc_score_multiclass_labels_error( + msg, y_true, labels, multiclass): + y_scores = np.array( + [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) + + with pytest.raises(ValueError) as exc_info: + roc_auc_score(y_true, y_scores, labels=labels, multiclass=multiclass) + + assert str(exc_info.value) == msg @pytest.mark.parametrize("msg, kwargs", [ @@ -554,7 +604,7 @@ def test_multi_ovr_auc_toydata(): (("Parameter multiclass='ovp' is not supported for multiclass ROC AUC. " "'multiclass' must be one of ('ovo', 'ovr')."), {"multiclass": "ovp"}) ]) -def test_auc_score_multi_error(msg, kwargs): +def test_roc_auc_score_multiclass_error(msg, kwargs): # Test that roc_auc_score function returns an error when trying # to compute multiclass AUC for parameters where an output # is not defined. 
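The ``labels`` parameter exercised by the tests above pins down which column of ``y_score`` belongs to which class when the class labels are strings or are not in lexicon order; internally the implementation re-encodes ``y_true`` into column indices following that order before scoring. A minimal sketch of that mapping on the string-label case from the tests, using only NumPy and the binary ``roc_auc_score`` (the multiclass keywords themselves require these patches)::

    import numpy as np
    from sklearn.metrics import roc_auc_score

    # Column i of y_scores holds the scores for labels[i]; y_true uses the
    # string labels directly, as in the parametrized test cases above.
    labels = ["c", "b", "a"]
    y_true = np.array(["c", "b", "c", "a"])
    y_scores = np.array([[0.1, 0.8, 0.1],
                         [0.3, 0.4, 0.3],
                         [0.35, 0.5, 0.15],
                         [0.0, 0.2, 0.8]])

    # Re-encode y_true into column indices following the order given by `labels`.
    y_encoded = np.empty(len(y_true), dtype=int)
    for i, label in enumerate(labels):
        y_encoded[y_true == label] = i  # -> [0, 1, 0, 2]

    # With that encoding, each class's one-vs-rest AUC is the binary AUC of
    # "belongs to this class" against the matching score column.
    per_class_auc = [roc_auc_score(y_encoded == i, y_scores[:, i])
                     for i in range(len(labels))]
    print(per_class_auc)

When ``labels`` is None, the lexicon order of ``np.unique(y_true)`` supplies the same mapping, as described in the updated docstring.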
From 07a6f8aff7c67da1534792c1e1c6ec1fe545c7d9 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 3 Jan 2019 14:25:33 -0500 Subject: [PATCH 40/93] RFC: Encodes y_true before passing to auc score --- sklearn/metrics/base.py | 14 ++++---------- sklearn/metrics/ranking.py | 13 +++++++++---- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 95f791404121c..48e1db0f9045e 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -170,12 +170,7 @@ def _average_multiclass_ovo_score( """ check_consistent_length(y_true, y_score) - if labels is None: - classes = np.unique(y_true) - else: - classes = np.array(labels) - - n_classes = len(classes) + n_classes = len(np.unique(y_true)) n_pairs = n_classes * (n_classes - 1) // 2 pair_scores = np.empty(n_pairs) @@ -183,8 +178,7 @@ def _average_multiclass_ovo_score( if is_weighted: prevalence = np.empty(n_pairs) - all_combinations = enumerate(combinations(enumerate(classes), 2)) - for ix, ((a_ix, a), (b_ix, b)) in all_combinations: + for ix, (a, b) in enumerate(combinations(range(n_classes), 2)): a_mask = y_true == a ab_mask = np.logical_or(a_mask, y_true == b) @@ -194,8 +188,8 @@ def _average_multiclass_ovo_score( a_true = a_mask[ab_mask] b_true = np.logical_not(a_true) - a_true_score = binary_metric(a_true, y_score[ab_mask, a_ix]) - b_true_score = binary_metric(b_true, y_score[ab_mask, b_ix]) + a_true_score = binary_metric(a_true, y_score[ab_mask, a]) + b_true_score = binary_metric(b_true, y_score[ab_mask, b]) pair_scores[ix] = (a_true_score + b_true_score) / 2 return (np.average(pair_scores, weights=prevalence) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 2e9c5c9afeb17..767956beab311 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -406,7 +406,7 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): raise ValueError( "Parameter 'labels' not equal to the number of columns in " "'y_score'") - if set(classes) < set(np.unique(y_true)): + if set(np.unique(y_true)) > set(classes): raise ValueError( "'y_true' contains labels not in parameter 'labels'") if multiclass == "ovo": @@ -415,12 +415,17 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): " for multiclass one-vs-one ROC AUC." 
" 'sample_weight' must be None in this case.") # Hand & Till (2001) implementation + if labels is not None: + y_true_multiclass = np.empty_like(y_true, dtype=np.int32) + for i, label in enumerate(labels): + y_true_multiclass[y_true == label] = i + y_true = y_true_multiclass + else: + _, y_true = np.unique(y_true, return_inverse=True) return _average_multiclass_ovo_score( - _binary_roc_auc_score, y_true, y_score, - labels=labels, average=average) + _binary_roc_auc_score, y_true, y_score, average=average) else: # ovr is same as multi-label - y_true = y_true.reshape((-1, 1)) # Order y_true by labels if labels is not None: for i, label in enumerate(labels): From d006581a747da40b9ccb977d37c30a71dbcd4cd7 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 3 Jan 2019 14:30:12 -0500 Subject: [PATCH 41/93] RFC: Adds ovr scorer --- sklearn/metrics/scorer.py | 10 +++------- sklearn/metrics/tests/test_score_objects.py | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index f031634f8fcd6..13c85e7cd7b44 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -492,11 +492,8 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, recall_scorer = make_scorer(recall_score) roc_auc_ovo_scorer = make_scorer(roc_auc_score, needs_threshold=True, multiclass='ovo') -roc_auc_weighted_scorer = make_scorer(roc_auc_score, average='weighted', - needs_threshold=True) -roc_auc_ovo_weighted_scorer = make_scorer(roc_auc_score, average='weighted', - multiclass='ovo', - needs_threshold=True) +roc_auc_ovr_scorer = make_scorer(roc_auc_score, needs_threshold=True, + multiclass='ovr') # Score function for probabilistic classification neg_log_loss_scorer = make_scorer(log_loss, greater_is_better=False, @@ -526,8 +523,7 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, neg_mean_squared_log_error=neg_mean_squared_log_error_scorer, accuracy=accuracy_scorer, roc_auc=roc_auc_scorer, roc_auc_ovo=roc_auc_ovo_scorer, - roc_auc_weighted=roc_auc_weighted_scorer, - roc_auc_ovo_weighted=roc_auc_ovo_weighted_scorer, + roc_auc_ovr=roc_auc_ovr_scorer, balanced_accuracy=balanced_accuracy_scorer, average_precision=average_precision_scorer, neg_log_loss=neg_log_loss_scorer, diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 8247dfa3ab10d..449b0f0e0abc9 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -53,7 +53,7 @@ 'precision_weighted', 'precision_macro', 'precision_micro', 'recall', 'recall_weighted', 'recall_macro', 'recall_micro', 'neg_log_loss', 'log_loss', 'brier_score_loss', - 'roc_auc_weighted', 'roc_auc_ovo_weighted', 'roc_auc_ovo'] + 'roc_auc_ovr', 'roc_auc_ovo'] # All supervised cluster scorers (They behave like classification metric) CLUSTER_SCORERS = ["adjusted_rand_score", From 3a0961e2c7f2362d5d86832d34200ff9a7fba112 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 3 Jan 2019 14:34:06 -0500 Subject: [PATCH 42/93] DOC: Adds roc_auc_score to multiclass docs --- doc/modules/model_evaluation.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 5e6d2da20f1b0..9f1d14beb4a70 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -295,6 +295,7 @@ Others also work in the multiclass case: confusion_matrix hinge_loss matthews_corrcoef + roc_auc_score Some also work in the multilabel 
case: From 6e6d99826acc92f2271ebecd5dddd8c35b5bb0f2 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 15 Jan 2019 09:27:39 -0500 Subject: [PATCH 43/93] RFC: Rename variable --- sklearn/metrics/ranking.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 767956beab311..075f53b048cfd 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -399,14 +399,14 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): " one of {1}.".format( multiclass, multiclass_options)) if labels is not None: - classes = np.unique(labels) - if len(classes) != len(labels): + unique_labels = np.unique(labels) + if len(unique_labels) != len(labels): raise ValueError("Parameter 'labels' must be unique") - if len(classes) != y_score.shape[1]: + if len(unique_labels) != y_score.shape[1]: raise ValueError( "Parameter 'labels' not equal to the number of columns in " "'y_score'") - if set(np.unique(y_true)) > set(classes): + if set(np.unique(y_true)) > set(unique_labels): raise ValueError( "'y_true' contains labels not in parameter 'labels'") if multiclass == "ovo": From 9058b24cb665d6e207668e74f933e279072a62d1 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 15 Jan 2019 09:30:55 -0500 Subject: [PATCH 44/93] RFC: Rewords error msg --- sklearn/metrics/ranking.py | 4 ++-- sklearn/metrics/tests/test_ranking.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 075f53b048cfd..ed1dfed0c3a12 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -404,8 +404,8 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): raise ValueError("Parameter 'labels' must be unique") if len(unique_labels) != y_score.shape[1]: raise ValueError( - "Parameter 'labels' not equal to the number of columns in " - "'y_score'") + "Number of given labels not equal to the number of " + "columns in 'y_score'") if set(np.unique(y_true)) > set(unique_labels): raise ValueError( "'y_true' contains labels not in parameter 'labels'") diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 94624ed98093d..d195799bf31bf 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -562,13 +562,13 @@ def test_multiclass_ovr_roc_auc_toydata(y_true, labels): [("Parameter 'labels' must be unique", np.array([0, 1, 2, 2]), [0, 2, 0]), ("Parameter 'labels' must be unique", np.array(["a", "b", "c", "c"]), ["a", "a", "b"]), - ("Parameter 'labels' not equal to the number of columns in 'y_score'", + ("Number of given labels not equal to the number of columns in 'y_score'", np.array([0, 1, 2, 2]), [0, 1]), - ("Parameter 'labels' not equal to the number of columns in 'y_score'", + ("Number of given labels not equal to the number of columns in 'y_score'", np.array(["a", "b", "c", "c"]), ["a", "b"]), - ("Parameter 'labels' not equal to the number of columns in 'y_score'", + ("Number of given labels not equal to the number of columns in 'y_score'", np.array([0, 1, 2, 2]), [0, 1, 2, 3]), - ("Parameter 'labels' not equal to the number of columns in 'y_score'", + ("Number of given labels not equal to the number of columns in 'y_score'", np.array(["a", "b", "c", "c"]), ["a", "b", "c", "d"]), ("'y_true' contains labels not in parameter 'labels'", np.array(["a", "b", "c", "e"]), ["a", "b", "c"]), From d26d5286f772341a743d60dae767b00c454cd39e Mon Sep 17 00:00:00 2001 From: 
Thomas Fan Date: Sat, 19 Jan 2019 15:14:09 -0500 Subject: [PATCH 45/93] RFC: Increases efficiency --- sklearn/metrics/ranking.py | 11 +++++++---- sklearn/metrics/tests/test_ranking.py | 3 ++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index ed1dfed0c3a12..89076fe2b83be 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -33,7 +33,7 @@ from ..utils.extmath import stable_cumsum from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning -from ..preprocessing import LabelBinarizer, label_binarize +from ..preprocessing import LabelBinarizer, label_binarize, LabelEncoder from .base import _average_binary_score, _average_multiclass_ovo_score @@ -427,10 +427,13 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): else: # ovr is same as multi-label # Order y_true by labels + lb = LabelBinarizer() if labels is not None: - for i, label in enumerate(labels): - y_true[y_true == label] = i - y_true_multilabel = LabelBinarizer().fit_transform(y_true) + lb.fit(labels) + lb.classes_ = labels + else: + lb.fit(y_true) + y_true_multilabel = lb.transform(y_true) return _average_binary_score( _binary_roc_auc_score, y_true_multilabel, y_score, average, sample_weight=sample_weight) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index d195799bf31bf..695c13dbf9afd 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -528,7 +528,8 @@ def test_multiclass_ovo_roc_auc_toydata(y_true, labels): "y_true, labels", [(np.array([0, 1, 2, 2]), None), (["a", "b", "c", "c"], None), - (["c", "b", "a", "a"], ["c", "b", "a"])]) + (["c", "b", "a", "a"], ["c", "b", "a"]), + (["c", "a", "b", "b"], ["c", "a", "b"])]) def test_multiclass_ovr_roc_auc_toydata(y_true, labels): # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm # on a small example, representative of an expected use case. From 6978943cca4d9c7e335a2c27dbe53e1498d56638 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sat, 19 Jan 2019 15:28:53 -0500 Subject: [PATCH 46/93] STY: Flake8 --- sklearn/metrics/ranking.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 89076fe2b83be..8263d690931b8 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -33,7 +33,7 @@ from ..utils.extmath import stable_cumsum from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning -from ..preprocessing import LabelBinarizer, label_binarize, LabelEncoder +from ..preprocessing import LabelBinarizer, label_binarize from .base import _average_binary_score, _average_multiclass_ovo_score @@ -414,7 +414,6 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): raise ValueError("Parameter 'sample_weight' is not supported" " for multiclass one-vs-one ROC AUC." 
" 'sample_weight' must be None in this case.") - # Hand & Till (2001) implementation if labels is not None: y_true_multiclass = np.empty_like(y_true, dtype=np.int32) for i, label in enumerate(labels): @@ -422,6 +421,8 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): y_true = y_true_multiclass else: _, y_true = np.unique(y_true, return_inverse=True) + + # Hand & Till (2001) implementation return _average_multiclass_ovo_score( _binary_roc_auc_score, y_true, y_score, average=average) else: From acedbaa2398e9510e8e4b403c11b88cdf6ba488c Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sat, 19 Jan 2019 15:40:54 -0500 Subject: [PATCH 47/93] RFC --- sklearn/metrics/ranking.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 8263d690931b8..158d419f82a80 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -418,13 +418,15 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): y_true_multiclass = np.empty_like(y_true, dtype=np.int32) for i, label in enumerate(labels): y_true_multiclass[y_true == label] = i - y_true = y_true_multiclass else: - _, y_true = np.unique(y_true, return_inverse=True) + _, y_true_multiclass = np.unique(y_true, return_inverse=True) # Hand & Till (2001) implementation return _average_multiclass_ovo_score( - _binary_roc_auc_score, y_true, y_score, average=average) + _binary_roc_auc_score, + y_true_multiclass, + y_score, + average=average) else: # ovr is same as multi-label # Order y_true by labels From f39a0671032618e7b9611ba0d5bf9fc655a741bf Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 30 Jan 2019 12:56:33 -0500 Subject: [PATCH 48/93] ENH: Uses object form of plt --- examples/model_selection/plot_roc.py | 56 ++++++++++++++-------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 586919943ca97..f73f8e1f620be 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -97,19 +97,19 @@ y_test_binarized.ravel(), y_score.ravel()) roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) -plt.figure() +fig, ax = plt.subplots() lw = 2 -plt.plot(fpr[2], tpr[2], color='darkorange', - lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2]) -plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') -plt.xlim([0.0, 1.0]) -plt.ylim([0.0, 1.05]) -plt.xlabel('False Positive Rate') -plt.ylabel('True Positive Rate') -plt.title('Receiver operating characteristic example for {}' +ax.plot(fpr[2], tpr[2], color='darkorange', + lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2]) +ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') +ax.set_xlim([0.0, 1.0]) +ax.set_ylim([0.0, 1.05]) +ax.set_xlabel('False Positive Rate') +ax.set_ylabel('True Positive Rate') +ax.set_title('Receiver operating characteristic example for {}' .format(iris.target_names[2])) -plt.legend(loc="lower right") -plt.show() +ax.legend(loc="lower right") +fig.show() ############################################################################### # Plot ROC curves for the multiclass problem using One-vs-Rest @@ -139,16 +139,16 @@ roc_auc["macro"] = auc(fpr["macro"], tpr["macro"]) # Plot all ROC curves -plt.figure() -plt.plot(fpr["micro"], tpr["micro"], - label='micro-average ROC curve (area = {0:0.2f})' - ''.format(roc_auc["micro"]), - color='deeppink', linestyle=':', linewidth=4) +fig, ax = plt.subplots() +ax.plot(fpr["micro"], tpr["micro"], + 
label='micro-average ROC curve (area = {0:0.2f})' + ''.format(roc_auc["micro"]), + color='deeppink', linestyle=':', linewidth=4) -plt.plot(fpr["macro"], tpr["macro"], - label='macro-average ROC curve (area = {0:0.2f})' - ''.format(roc_auc["macro"]), - color='navy', linestyle=':', linewidth=4) +ax.plot(fpr["macro"], tpr["macro"], + label='macro-average ROC curve (area = {0:0.2f})' + ''.format(roc_auc["macro"]), + color='navy', linestyle=':', linewidth=4) colors = cycle(['aqua', 'darkorange', 'cornflowerblue']) for i, color in zip(range(n_classes), colors): @@ -156,14 +156,14 @@ label='ROC curve of class {0} (area = {1:0.2f})' ''.format(i, roc_auc[i])) -plt.plot([0, 1], [0, 1], 'k--', lw=lw) -plt.xlim([0.0, 1.0]) -plt.ylim([0.0, 1.05]) -plt.xlabel('False Positive Rate') -plt.ylabel('True Positive Rate') -plt.title('An extension of ROC to multi-class using One-vs-Rest') -plt.legend(loc="lower right") -plt.show() +ax.plot([0, 1], [0, 1], 'k--', lw=lw) +ax.set_xlim([0.0, 1.0]) +ax.set_ylim([0.0, 1.05]) +ax.set_xlabel('False Positive Rate') +ax.set_ylabel('True Positive Rate') +ax.set_title('An extension of ROC to multi-class using One-vs-Rest') +ax.legend(loc="lower right") +fig.show() ############################################################################### # One-vs-One From 765c71bc776e0bd1833bf986d101e2f43bbcf345 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 30 Jan 2019 13:29:57 -0500 Subject: [PATCH 49/93] STY: Flake8 --- examples/model_selection/plot_roc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index f73f8e1f620be..70551112cbbfe 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -106,8 +106,8 @@ ax.set_ylim([0.0, 1.05]) ax.set_xlabel('False Positive Rate') ax.set_ylabel('True Positive Rate') -ax.set_title('Receiver operating characteristic example for {}' - .format(iris.target_names[2])) +ax.set_title('Receiver operating characteristic example for {}'.format( + iris.target_names[2])) ax.legend(loc="lower right") fig.show() From 139f3afbe66802ce649aee984b9a3e754a1a1945 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 5 Feb 2019 12:39:25 -0500 Subject: [PATCH 50/93] DOC Address comments --- doc/modules/model_evaluation.rst | 8 +++---- sklearn/metrics/base.py | 31 ++++++++++++++------------- sklearn/metrics/ranking.py | 4 ++-- sklearn/metrics/tests/test_ranking.py | 4 +--- 4 files changed, 23 insertions(+), 24 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index b8a98cee2a156..12b4c58d2b2d9 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1309,7 +1309,7 @@ extended by averaging over the labels as :ref:`above `. Compared to metrics such as the subset accuracy, the Hamming loss, or the F1 score, ROC doesn't require optimizing a threshold for each label. -The :func:`roc_auc_score` function can also be used in mult-class +The :func:`roc_auc_score` function can also be used in multi-class classification. Two averaging strategies are currently supported: the one-vs-one algorithm computes the average of the pairwise ROC AUC scores, and the one-vs-rest algorithm computes the average of the ROC AUC scores for each @@ -1318,13 +1318,13 @@ provided in an array with values from 0 to ``n_classes``, and the scores correspond to the probability estimates that a sample belongs to a particular class. 
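A minimal usage sketch of the API documented in the paragraph above (illustrative numbers, not taken from the docs; the ``multiclass`` and ``average`` keywords and the probability layout are the ones introduced in this PR):

    import numpy as np
    from sklearn.metrics import roc_auc_score

    y_true = np.array([0, 1, 2, 2])
    y_prob = np.array([[0.7, 0.2, 0.1],
                       [0.2, 0.6, 0.2],
                       [0.1, 0.3, 0.6],
                       [0.2, 0.2, 0.6]])  # each row sums to 1.0

    roc_auc_score(y_true, y_prob, multiclass="ovo", average="macro")     # Hand & Till (2001)
    roc_auc_score(y_true, y_prob, multiclass="ovr", average="weighted")  # Provost & Domingos (2001)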
-**One-vs-one Algorithm**: Computes the AUC of all possible pairwise +**One-vs-one Algorithm**: Computes the average AUC of all possible pairwise combinations of classes. [HT2001]_ defines a multiclass AUC metric weighted uniformly: .. math:: - \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k \neq j}^c (\text{AUC}(j | k) + + \frac{2}{c(c-1)}\sum_{j=1}^{c}\sum_{k > j}^c (\text{AUC}(j | k) + \text{AUC}(k | j)) where :math:`c` is the number of classes and :math:`\text{AUC}(j | k)` is the @@ -1339,7 +1339,7 @@ prevalence: .. math:: - \frac{1}{c(c-1)}\sum_{j=1}^{c}\sum_{k \neq j}^c p(j \cup k)( + \frac{2}{c(c-1)}\sum_{j=1}^{c}\sum_{k > j}^c p(j \cup k)( \text{AUC}(j | k) + \text{AUC}(k | j)) where :math:`c` is the number of classes. This algorithm is used by setting diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 982173f265eb6..74cff612808d0 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -127,24 +127,24 @@ def _average_binary_score(binary_metric, y_true, y_score, average, def _average_multiclass_ovo_score( - binary_metric, y_true, y_score, labels=None, average='macro'): - """Uses the binary metric for one-vs-one multiclass classification, + binary_metric, y_true, y_score, average='macro'): + """Average one-versus-one scores for multiclass classification. + + Uses the binary metric for one-vs-one multiclass classification, where the score is computed according to the Hand & Till (2001) algorithm. Parameters ---------- - y_true : array, shape = [n_samples] + y_true : array-like, shape = (n_samples, ) True multiclass labels. - y_score : array, shape = [n_samples, n_classes] + y_score : array-like, shape = (n_samples, n_classes) Target scores corresponding to probability estimates of a sample belonging to a particular class - labels : array, shape = [n_classes] or None, optional (default=None) - List of labels to index ``y_score``. If ``None``, - the lexicon order of ``y_true`` is used to index ``y_score``. - - average : 'macro' or 'weighted', default='macro' + average : 'macro' or 'weighted', optional (default='macro') + Determines the type of averaging performed on the pairwise binary + metric scores ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. Classes @@ -153,8 +153,8 @@ def _average_multiclass_ovo_score( Calculate metrics for each label, taking into account the prevalence of the classes. - binary_metric : callable, the binary metric function to use. - Accepts the following as input + binary_metric : callable + The binary metric function to use that accepts the following as input y_true_target : array, shape = [n_samples_target] Some sub-array of y_true for a pair of classes designated positive and negative in the one-vs-one scheme. 
@@ -165,11 +165,11 @@ def _average_multiclass_ovo_score( Returns ------- score : float - Average the sum of pairwise binary metric scores + Average of the pairwise binary metric scores """ check_consistent_length(y_true, y_score) - n_classes = len(np.unique(y_true)) + n_classes = np.unique(y_true).shape[0] n_pairs = n_classes * (n_classes - 1) // 2 pair_scores = np.empty(n_pairs) @@ -179,13 +179,14 @@ def _average_multiclass_ovo_score( for ix, (a, b) in enumerate(combinations(range(n_classes), 2)): a_mask = y_true == a - ab_mask = np.logical_or(a_mask, y_true == b) + b_mask = y_true == b + ab_mask = np.logical_or(a_mask, b_mask) if is_weighted: prevalence[ix] = np.sum(ab_mask) / len(y_true) a_true = a_mask[ab_mask] - b_true = np.logical_not(a_true) + b_true = b_mask[ab_mask] a_true_score = binary_metric(a_true, y_score[ab_mask, a]) b_true_score = binary_metric(b_true, y_score[ab_mask, b]) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index baca3599fe9cd..90d14ecda60de 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -256,7 +256,7 @@ def roc_auc_score(y_true, y_score, labels=None, y_true : array, shape = [n_samples] or [n_samples, n_classes] True binary labels or binary label indicators. The multiclass case expects shape = [n_samples] and labels - with values from 0 to (n_classes-1), inclusive. + with values in ``range(n_classes)``. y_score : array, shape = [n_samples] or [n_samples, n_classes] Target scores, can either be probability estimates of the positive @@ -270,7 +270,7 @@ def roc_auc_score(y_true, y_score, labels=None, List of labels to index ``y_score`` used for multiclass. If ``None``, the lexicon order of ``y_true`` is used to index ``y_score``. - multiclass : string, 'ovr' or 'ovo', (default='ovr') + multiclass : string, 'ovr' or 'ovo', optional(default='ovr') Note: multiclass ROC AUC currently only handles the 'macro' and 'weighted' averages. diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 00a28a375292a..0ec9968d7fa77 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -584,11 +584,9 @@ def test_roc_auc_score_multiclass_labels_error( y_scores = np.array( [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) - with pytest.raises(ValueError) as exc_info: + with pytest.raises(ValueError, match=msg): roc_auc_score(y_true, y_scores, labels=labels, multiclass=multiclass) - assert str(exc_info.value) == msg - @pytest.mark.parametrize("msg, kwargs", [ (("Parameter 'average' must be one of ('macro', 'weighted') for " From 99f5498f6fd7268f256a112856c7d33ffe8d6540 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 5 Feb 2019 13:30:12 -0500 Subject: [PATCH 51/93] RFC: Uses pytest.rases(match=...) 
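An aside on the commit below (not part of the patch): ``pytest.raises(match=...)`` treats the expected message as a regular expression checked with ``re.search``, which is why this commit escapes the literal parentheses and periods in the expected messages. A minimal sketch:

    import pytest

    def test_error_message_matching():
        msg = r"must be one of \('macro', 'weighted'\) for multiclass problems\."
        with pytest.raises(ValueError, match=msg):
            raise ValueError("Parameter 'average' must be one of "
                             "('macro', 'weighted') for multiclass problems.")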
--- sklearn/metrics/tests/test_ranking.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 0ec9968d7fa77..b11642bd0e013 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -589,18 +589,18 @@ def test_roc_auc_score_multiclass_labels_error( @pytest.mark.parametrize("msg, kwargs", [ - (("Parameter 'average' must be one of ('macro', 'weighted') for " - "multiclass problems."), {"average": "samples"}), - (("Parameter 'average' must be one of ('macro', 'weighted') for " - "multiclass problems."), {"average": "micro"}), - (("Parameter 'sample_weight' is not supported for multiclass one-vs-one " - "ROC AUC. 'sample_weight' must be None in this case."), + ((r"Parameter 'average' must be one of \('macro', 'weighted'\) for " + r"multiclass problems\."), {"average": "samples"}), + ((r"Parameter 'average' must be one of \('macro', 'weighted'\) for " + r"multiclass problems\."), {"average": "micro"}), + ((r"Parameter 'sample_weight' is not supported for multiclass one-vs-one " + r"ROC AUC. 'sample_weight' must be None in this case\."), {"multiclass": "ovo", "sample_weight": []}), - (("Partial AUC computation not available in multiclass setting. " - "Parameter 'max_fpr' must be set to `None`. Received `max_fpr=0.5` " - "instead."), {"multiclass": "ovo", "max_fpr": 0.5}), - (("Parameter multiclass='ovp' is not supported for multiclass ROC AUC. " - "'multiclass' must be one of ('ovo', 'ovr')."), {"multiclass": "ovp"}) + ((r"Partial AUC computation not available in multiclass setting\. " + r"Parameter 'max_fpr' must be set to `None`. Received `max_fpr=0.5` " + r"instead\."), {"multiclass": "ovo", "max_fpr": 0.5}), + ((r"Parameter multiclass='ovp' is not supported for multiclass ROC AUC\. " + r"'multiclass' must be one of \('ovo', 'ovr'\)\."), {"multiclass": "ovp"}) ]) def test_roc_auc_score_multiclass_error(msg, kwargs): # Test that roc_auc_score function returns an error when trying @@ -610,9 +610,8 @@ def test_roc_auc_score_multiclass_error(msg, kwargs): y_pred = rng.rand(10, 3) y_pred = y_pred / y_pred.sum(axis=1, keepdims=True) y_true = rng.randint(0, 3, size=10) - with pytest.raises(ValueError) as exc_info: + with pytest.raises(ValueError, match=msg): roc_auc_score(y_true, y_pred, **kwargs) - assert str(exc_info.value) == msg def test_auc_score_non_binary_class(): From 8b4dd6e91a9cda0623eb75893ceb8e6c970926ec Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 5 Feb 2019 15:04:25 -0500 Subject: [PATCH 52/93] RFC Address comments --- examples/model_selection/plot_roc.py | 12 +++++----- sklearn/metrics/base.py | 26 +++++++++++----------- sklearn/metrics/ranking.py | 17 +++++++------- sklearn/metrics/tests/test_common.py | 6 ++--- sklearn/metrics/tests/test_ranking.py | 32 +++++++++++++-------------- 5 files changed, 46 insertions(+), 47 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 70551112cbbfe..df649bdb88fe8 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -15,9 +15,6 @@ The "steepness" of ROC curves is also important, since it is ideal to maximize the true positive rate while minimizing the false positive rate. -Multiclass settings -------------------- - ROC curves are typically used in binary classification to study the output of a classifier. 
The ROC curve and ROC area can be extended to multi-class or multi-label classification by using the One-vs-Rest or One-vs-One scheme. @@ -27,6 +24,7 @@ See also :func:`sklearn.metrics.roc_auc_score`, :ref:`sphx_glr_auto_examples_model_selection_plot_roc_crossval.py`. """ +print(__doc__) ############################################################################### # One-vs-Rest # ----------- @@ -76,6 +74,9 @@ ############################################################################### # Plotting the ROC curve for virginica # .................................... +# One can draw a ROC curve by considering each element of the label indicator +# matrix as a binary prediction (micro-averaging). In the following, we +# draw the ROC curve for virginica. import matplotlib.pyplot as plt from sklearn.preprocessing import label_binarize from sklearn.metrics import roc_curve, auc @@ -114,9 +115,6 @@ ############################################################################### # Plot ROC curves for the multiclass problem using One-vs-Rest # ............................................................ -# One can draw a ROC curve by considering each element of the label indicator -# matrix as a binary prediction (micro-averaging). -# # Another evaluation measure for one-vs-rest multi-class classification is # macro-averaging, which gives equal weight to the classification of each # label. @@ -168,7 +166,7 @@ ############################################################################### # One-vs-One # --------------------- -# The One-vs-One scheme compares every pairwise combiantion of classes. +# The One-vs-One scheme compares every unique pairwise combination of classes. # # Compute the AUC score # ..................... diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 74cff612808d0..d0864e6cdf0b0 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -135,6 +135,15 @@ def _average_multiclass_ovo_score( Parameters ---------- + binary_metric : callable + The binary metric function to use that accepts the following as input + y_true_target : array, shape = [n_samples_target] + Some sub-array of y_true for a pair of classes designated + positive and negative in the one-vs-one scheme. + y_score_target : array, shape = [n_samples_target] + Scores corresponding to the probability estimates + of a sample belonging to the designated positive class label + y_true : array-like, shape = (n_samples, ) True multiclass labels. @@ -153,15 +162,6 @@ def _average_multiclass_ovo_score( Calculate metrics for each label, taking into account the prevalence of the classes. - binary_metric : callable - The binary metric function to use that accepts the following as input - y_true_target : array, shape = [n_samples_target] - Some sub-array of y_true for a pair of classes designated - positive and negative in the one-vs-one scheme. 
- y_score_target : array, shape = [n_samples_target] - Scores corresponding to the probability estimates - of a sample belonging to the designated positive class label - Returns ------- score : float @@ -174,9 +174,10 @@ def _average_multiclass_ovo_score( pair_scores = np.empty(n_pairs) is_weighted = average == "weighted" - if is_weighted: - prevalence = np.empty(n_pairs) + prevalence = np.empty(n_pairs) if is_weighted else None + # Compute scores treating a as positive class and b as negative class, + # then b as positive class and a as negative class for ix, (a, b) in enumerate(combinations(range(n_classes), 2)): a_mask = y_true == a b_mask = y_true == b @@ -192,5 +193,4 @@ def _average_multiclass_ovo_score( b_true_score = binary_metric(b_true, y_score[ab_mask, b]) pair_scores[ix] = (a_true_score + b_true_score) / 2 - return (np.average(pair_scores, weights=prevalence) - if is_weighted else np.average(pair_scores)) + return np.average(pair_scores, weights=prevalence) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 90d14ecda60de..cfc5a6939f4c3 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -271,9 +271,7 @@ def roc_auc_score(y_true, y_score, labels=None, the lexicon order of ``y_true`` is used to index ``y_score``. multiclass : string, 'ovr' or 'ovo', optional(default='ovr') - Note: multiclass ROC AUC currently only handles the 'macro' and - 'weighted' averages. - + Determines the type of multiclass configuration to use. ``'ovr'``: Calculate metrics for the multiclass case using the one-vs-rest approach. @@ -284,6 +282,8 @@ def roc_auc_score(y_true, y_score, labels=None, average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: + Note: multiclass ROC AUC currently only handles the 'macro' and + 'weighted' averages. ``'micro'``: Calculate metrics globally by considering each element of the label @@ -304,9 +304,9 @@ def roc_auc_score(y_true, y_score, labels=None, max_fpr : float > 0 and <= 1, optional If not ``None``, the standardized partial AUC [3]_ over the range - [0, max_fpr] is returned. If multiclass task, should be either - equal to ``None`` or ``1.0`` as AUC ROC partial computation currently - not supported in this case. + [0, max_fpr] is returned. For the multiclass case, ``max_fpr``, + should be either equal to ``None`` or ``1.0`` as AUC ROC partial + computation currently is not supported for multiclass. 
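Editorial sketch, not part of this docstring: with the validation introduced in this PR, the multiclass path accepts only the 'macro' and 'weighted' averages and rejects partial AUC. Assuming ``y_true`` and ``y_prob`` are multiclass labels and row-normalized probabilities as in the earlier sketch:

    roc_auc_score(y_true, y_prob, multiclass="ovo", average="macro")   # supported
    roc_auc_score(y_true, y_prob, multiclass="ovo", average="micro")   # raises ValueError
    roc_auc_score(y_true, y_prob, multiclass="ovr", max_fpr=0.5)       # raises ValueError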
Returns ------- @@ -403,8 +403,9 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): raise ValueError("Parameter 'labels' must be unique") if len(unique_labels) != y_score.shape[1]: raise ValueError( - "Number of given labels not equal to the number of " - "columns in 'y_score'") + "Number of given labels, {0}, not equal to the number " + "of columns in 'y_score', {1}".format( + len(unique_labels), y_score.shape[1])) if set(np.unique(y_true)) > set(unique_labels): raise ValueError( "'y_true' contains labels not in parameter 'labels'") diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 2f8877dc4904b..ddf0d5a754629 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1266,9 +1266,9 @@ def test_multiclass_score_permutation_invariance(name): metric = ALL_METRICS[name] score = metric(y_true, y_score) for perm in permutations(range(n_classes), n_classes): - inv_perm = np.zeros(n_classes, dtype=int) - inv_perm[list(perm)] = np.arange(n_classes) - y_score_perm = y_score[:, inv_perm] + inverse_perm = np.zeros(n_classes, dtype=int) + inverse_perm[list(perm)] = np.arange(n_classes) + y_score_perm = y_score[:, inverse_perm] y_true_perm = np.take(perm, y_true) current_score = metric(y_true_perm, y_score_perm) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index b11642bd0e013..ab212d29d4132 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -472,14 +472,13 @@ def test_deprecated_auc_reorder(): @pytest.mark.parametrize( "y_true, labels", - [(np.array([0, 1, 0, 2]), None), - (["a", "b", "a", "c"], None), - (["c", "b", "c", "a"], ["c", "b", "a"])] + # [(np.array([0, 1, 0, 2]), None), + # (["a", "b", "a", "c"], None), + [(["c", "b", "c", "a"], ["c", "b", "a"])] ) def test_multiclass_ovo_roc_auc_toydata(y_true, labels): # Tests the one-vs-one multiclass ROC AUC algorithm # on a small example, representative of an expected use case. - n_labels = len(np.unique(y_true)) y_scores = np.array( [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) @@ -489,22 +488,21 @@ def test_multiclass_ovo_roc_auc_toydata(y_true, labels): score_01 = roc_auc_score([1, 0, 1], [0.1, 0.3, 0.35]) # positive label is 1, negative label is 0 score_10 = roc_auc_score([0, 1, 0], [0.8, 0.4, 0.5]) - average_score_01 = (score_01 + score_10) / 2. + average_score_01 = (score_01 + score_10) / 2 # Consider labels 0 and 2: score_02 = roc_auc_score([1, 1, 0], [0.1, 0.35, 0]) score_20 = roc_auc_score([0, 0, 1], [0.1, 0.15, 0.8]) - average_score_02 = (score_02 + score_20) / 2. + average_score_02 = (score_02 + score_20) / 2 # Consider labels 1 and 2: score_12 = roc_auc_score([1, 0], [0.4, 0.2]) score_21 = roc_auc_score([0, 1], [0.3, 0.8]) - average_score_12 = (score_12 + score_21) / 2. + average_score_12 = (score_12 + score_21) / 2 # Unweighted, one-vs-one multiclass ROC AUC algorithm - sum_avg_scores = average_score_01 + average_score_02 + average_score_12 - ovo_unweighted_coefficient = 2. 
/ (n_labels * (n_labels - 1)) - ovo_unweighted_score = ovo_unweighted_coefficient * sum_avg_scores + ovo_unweighted_score = ( + average_score_01 + average_score_02 + average_score_12) / 3 assert_almost_equal( roc_auc_score(y_true, y_scores, labels=labels, multiclass="ovo"), ovo_unweighted_score) @@ -562,18 +560,20 @@ def test_multiclass_ovr_roc_auc_toydata(y_true, labels): [("Parameter 'labels' must be unique", np.array([0, 1, 2, 2]), [0, 2, 0]), ("Parameter 'labels' must be unique", np.array(["a", "b", "c", "c"]), ["a", "a", "b"]), - ("Number of given labels not equal to the number of columns in 'y_score'", + ("Number of given labels, 2, not equal to the number of columns in " + "'y_score', 3", np.array([0, 1, 2, 2]), [0, 1]), - ("Number of given labels not equal to the number of columns in 'y_score'", + ("Number of given labels, 2, not equal to the number of columns in " + "'y_score', 3", np.array(["a", "b", "c", "c"]), ["a", "b"]), - ("Number of given labels not equal to the number of columns in 'y_score'", + ("Number of given labels, 4, not equal to the number of columns in " + "'y_score', 3", np.array([0, 1, 2, 2]), [0, 1, 2, 3]), - ("Number of given labels not equal to the number of columns in 'y_score'", + ("Number of given labels, 4, not equal to the number of columns in " + "'y_score', 3", np.array(["a", "b", "c", "c"]), ["a", "b", "c", "d"]), ("'y_true' contains labels not in parameter 'labels'", np.array(["a", "b", "c", "e"]), ["a", "b", "c"]), - ("'y_true' contains labels not in parameter 'labels'", - np.array([0, 1, 2, 3]), [0, 1, 2]), ("'y_true' contains labels not in parameter 'labels'", np.array(["a", "b", "c", "d"]), ["a", "b", "c"]), ("'y_true' contains labels not in parameter 'labels'", From bd032f9355113de41254cfd0f61177d214addc9a Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 5 Feb 2019 15:07:28 -0500 Subject: [PATCH 53/93] REV Add tests back --- sklearn/metrics/tests/test_ranking.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index ab212d29d4132..9d56135afc198 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -472,9 +472,9 @@ def test_deprecated_auc_reorder(): @pytest.mark.parametrize( "y_true, labels", - # [(np.array([0, 1, 0, 2]), None), - # (["a", "b", "a", "c"], None), - [(["c", "b", "c", "a"], ["c", "b", "a"])] + [(np.array([0, 1, 0, 2]), None), + (["a", "b", "a", "c"], None), + (["c", "b", "c", "a"], ["c", "b", "a"])] ) def test_multiclass_ovo_roc_auc_toydata(y_true, labels): # Tests the one-vs-one multiclass ROC AUC algorithm From 36fe5837c3311365514b539919a5b72092cbd08c Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 5 Feb 2019 15:22:53 -0500 Subject: [PATCH 54/93] RFC Uses better name --- sklearn/metrics/tests/test_ranking.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 9d56135afc198..2e66c071b20ec 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -607,11 +607,11 @@ def test_roc_auc_score_multiclass_error(msg, kwargs): # to compute multiclass AUC for parameters where an output # is not defined. 
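    # Editorial note, not in the original test: the rows of the random scores
    # generated below are normalized to sum to 1 so that the input passes the
    # multiclass "scores must be probabilities" validation and only the
    # parametrized error message under test is raised.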
rng = check_random_state(404) - y_pred = rng.rand(10, 3) - y_pred = y_pred / y_pred.sum(axis=1, keepdims=True) - y_true = rng.randint(0, 3, size=10) + y_score = rng.rand(20, 3) + y_score = y_score / y_score.sum(axis=1, keepdims=True) + y_true = rng.randint(0, 3, size=20) with pytest.raises(ValueError, match=msg): - roc_auc_score(y_true, y_pred, **kwargs) + roc_auc_score(y_true, y_score, **kwargs) def test_auc_score_non_binary_class(): From 21203e4e48f3bb6ea7dcd12d49383133ded5e324 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 5 Feb 2019 15:38:35 -0500 Subject: [PATCH 55/93] DOC --- examples/model_selection/plot_roc.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index df649bdb88fe8..5fa1d462008c3 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -75,8 +75,8 @@ # Plotting the ROC curve for virginica # .................................... # One can draw a ROC curve by considering each element of the label indicator -# matrix as a binary prediction (micro-averaging). In the following, we -# draw the ROC curve for virginica. +# matrix as a binary prediction (micro-averaging). In the following, the ROC +# curve for virginica is drawn. import matplotlib.pyplot as plt from sklearn.preprocessing import label_binarize from sklearn.metrics import roc_curve, auc @@ -181,6 +181,7 @@ ############################################################################### # Plot ROC curves for the multiclass problem using One-vs-One # ........................................................... +# The ROC curve for every pair of classes are drawn. from itertools import combinations for a, b in combinations(range(n_classes), 2): @@ -196,21 +197,21 @@ y_test[ab_mask] == b, y_score[ab_mask, b]) roc_auc[(b, a)] = auc(fpr[(b, a)], tpr[(b, a)]) -plt.figure() +fig, ax = plt.subplots() for a, b in combinations(range(n_classes), 2): - plt.plot( + ax.plot( fpr[(a, b)], tpr[(a, b)], lw=lw, label='ROC curve: class {0} vs. {1} ' '(area = {2:0.2f})'.format(a, b, roc_auc[(a, b)])) - plt.plot( + ax.plot( fpr[(b, a)], tpr[(b, a)], lw=lw, label='ROC curve: class {0} vs. 
{1} ' '(area = {2:0.2f})'.format(b, a, roc_auc[(b, a)])) -plt.plot([0, 1], [0, 1], 'k--', lw=lw) -plt.xlim([0.0, 1.0]) -plt.ylim([0.0, 1.05]) -plt.xlabel('False Positive Rate') -plt.ylabel('True Positive Rate') -plt.title('An extension of ROC to multi-class using One-vs-One') -plt.legend(loc="lower right") -plt.show() +ax.plot([0, 1], [0, 1], 'k--', lw=lw) +ax.set_xlim([0.0, 1.0]) +ax.set_ylim([0.0, 1.05]) +ax.set_xlabel('False Positive Rate') +ax.set_ylabel('True Positive Rate') +ax.set_title('An extension of ROC to multi-class using One-vs-One') +ax.legend(loc="lower right") +fig.show() From 43bd6bb8db795612a8a0f9fc6e2da4f4b39d108b Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 5 Feb 2019 15:49:20 -0500 Subject: [PATCH 56/93] DOC --- examples/model_selection/plot_roc.py | 2 +- sklearn/metrics/tests/test_ranking.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 5fa1d462008c3..7c40c313edd6a 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -124,7 +124,7 @@ # First aggregate all false positive rates all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)])) -# Then interpolate all ROC curves at this points +# Then interpolate all ROC curves at these points mean_tpr = np.zeros_like(all_fpr) for i in range(n_classes): mean_tpr += interp(all_fpr, fpr[i], tpr[i]) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 2e66c071b20ec..a24a2e370d4ea 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -600,7 +600,8 @@ def test_roc_auc_score_multiclass_labels_error( r"Parameter 'max_fpr' must be set to `None`. Received `max_fpr=0.5` " r"instead\."), {"multiclass": "ovo", "max_fpr": 0.5}), ((r"Parameter multiclass='ovp' is not supported for multiclass ROC AUC\. " - r"'multiclass' must be one of \('ovo', 'ovr'\)\."), {"multiclass": "ovp"}) + r"'multiclass' must be one of \('ovo', 'ovr'\)\."), + {"multiclass": "ovp"}) ]) def test_roc_auc_score_multiclass_error(msg, kwargs): # Test that roc_auc_score function returns an error when trying From 941d810490535ea97299c1348e94abefe5776bb6 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 5 Feb 2019 16:04:30 -0500 Subject: [PATCH 57/93] RFC Uses average --- sklearn/metrics/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index d0864e6cdf0b0..1ce1450eb74cc 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -184,7 +184,7 @@ def _average_multiclass_ovo_score( ab_mask = np.logical_or(a_mask, b_mask) if is_weighted: - prevalence[ix] = np.sum(ab_mask) / len(y_true) + prevalence[ix] = np.average(ab_mask) a_true = a_mask[ab_mask] b_true = b_mask[ab_mask] From fa11e2d5e99c4b1d8f691652ea9c8e2e7276926e Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 5 Feb 2019 17:25:42 -0500 Subject: [PATCH 58/93] DOC Adds multiclass macro and weighted curves --- examples/model_selection/plot_roc.py | 66 ++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 7c40c313edd6a..2e1e9b537c625 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -75,7 +75,7 @@ # Plotting the ROC curve for virginica # .................................... 
# One can draw a ROC curve by considering each element of the label indicator -# matrix as a binary prediction (micro-averaging). In the following, the ROC +# matrix as a binary prediction (micro-averaging). In the following, the ROC # curve for virginica is drawn. import matplotlib.pyplot as plt from sklearn.preprocessing import label_binarize @@ -171,19 +171,22 @@ # Compute the AUC score # ..................... # The ROC area can be approximated by taking the average either weighted -# uniformly or by the priori class distribution. -unweighted_roc_auc_ovo = roc_auc_score(y_test, y_score_norm, multiclass="ovo") +# uniformly (macro) or by prevalence. +macro_roc_auc_ovo = roc_auc_score( + y_test, y_score_norm, multiclass="ovo", average="macro") weighted_roc_auc_ovo = roc_auc_score( y_test, y_score_norm, multiclass="ovo", average="weighted") -print("One-vs-One ROC AUC scores: {0} (unweighted), {1} (weighted)".format( - unweighted_roc_auc_ovo, weighted_roc_auc_ovo)) +print("One-vs-One ROC AUC scores: {0} (uniform), {1} (weighted by prevalence)" + .format(macro_roc_auc_ovo, weighted_roc_auc_ovo)) ############################################################################### # Plot ROC curves for the multiclass problem using One-vs-One # ........................................................... -# The ROC curve for every pair of classes are drawn. -from itertools import combinations +# The ROC curve for every pair of classes are drawn together with the +# average weighted uniformly and weighted by prevalence. +from itertools import combinations, permutations +prevalence = dict() for a, b in combinations(range(n_classes), 2): ab_mask = np.logical_or(y_test == a, y_test == b) @@ -191,22 +194,59 @@ fpr[(a, b)], tpr[(a, b)], _ = roc_curve( y_test[ab_mask] == a, y_score[ab_mask, a]) roc_auc[(a, b)] = auc(fpr[(a, b)], tpr[(a, b)]) + prevalence[(a, b)] = np.average(ab_mask) # Compute ROC curve and ROC area with `b` as the positive class fpr[(b, a)], tpr[(b, a)], _ = roc_curve( y_test[ab_mask] == b, y_score[ab_mask, b]) roc_auc[(b, a)] = auc(fpr[(b, a)], tpr[(b, a)]) + prevalence[(b, a)] = np.average(ab_mask) + +class_permutations = list(permutations(range(n_classes), 2)) +all_multiclass_fpr = np.unique( + np.concatenate([fpr[(a, b)] for a, b in class_permutations])) + +multiclass_interp_tpr = dict() +for a, b in class_permutations: + multiclass_interp_tpr[(a, b)] = interp(all_multiclass_fpr, fpr[(a, b)], tpr[(a, b)]) + +all_multiclass_tpr = np.array( + [multiclass_interp_tpr[(a, b)] for a, b in class_permutations]) +all_prevalence = np.array([prevalence[(a, b)] for a, b in class_permutations]) + +roc_auc_uniform_average_tpr = np.average(all_multiclass_tpr, axis=0) +roc_auc_prevalence_average_tpr = np.average( + all_multiclass_tpr, axis=0, weights=all_prevalence) + fig, ax = plt.subplots() -for a, b in combinations(range(n_classes), 2): +# plot roc curve as a macro average +ax.plot( + all_multiclass_fpr, + roc_auc_uniform_average_tpr, + color='navy', + linestyle=':', + lw=4, + label='macro average (area = {0:0.2f})'.format( + macro_roc_auc_ovo), +) +# plot roc curve as a weighted average +ax.plot( + all_multiclass_fpr, + roc_auc_prevalence_average_tpr, + color='deeppink', + linestyle=':', + lw=4, + label='weighted average (area = {0:0.2f})'.format( + weighted_roc_auc_ovo), +) + +# plot roc curve for every of classes +for a, b in permutations(range(n_classes), 2): ax.plot( fpr[(a, b)], tpr[(a, b)], - lw=lw, label='ROC curve: class {0} vs. {1} ' + lw=lw, label='class {0} vs. 
{1} ' '(area = {2:0.2f})'.format(a, b, roc_auc[(a, b)])) - ax.plot( - fpr[(b, a)], tpr[(b, a)], - lw=lw, label='ROC curve: class {0} vs. {1} ' - '(area = {2:0.2f})'.format(b, a, roc_auc[(b, a)])) ax.plot([0, 1], [0, 1], 'k--', lw=lw) ax.set_xlim([0.0, 1.0]) ax.set_ylim([0.0, 1.05]) From 1ec9b3f0d8b33b8cfd3837984ad5a95673b117f1 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 5 Feb 2019 17:54:20 -0500 Subject: [PATCH 59/93] STY Flake8 --- examples/model_selection/plot_roc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 2e1e9b537c625..a7448a3ab58ce 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -208,7 +208,8 @@ multiclass_interp_tpr = dict() for a, b in class_permutations: - multiclass_interp_tpr[(a, b)] = interp(all_multiclass_fpr, fpr[(a, b)], tpr[(a, b)]) + multiclass_interp_tpr[(a, b)] = interp( + all_multiclass_fpr, fpr[(a, b)], tpr[(a, b)]) all_multiclass_tpr = np.array( [multiclass_interp_tpr[(a, b)] for a, b in class_permutations]) From e40218e0a91597dc084489e7c21bc1d880b09de4 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 6 Feb 2019 15:37:50 -0500 Subject: [PATCH 60/93] ENH Adds support for integer y_true --- sklearn/metrics/base.py | 5 ++- sklearn/metrics/ranking.py | 60 +++++++++++++++++++-------- sklearn/metrics/tests/test_ranking.py | 54 ++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 20 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 1ce1450eb74cc..02cfc9c6f05f7 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -169,7 +169,8 @@ def _average_multiclass_ovo_score( """ check_consistent_length(y_true, y_score) - n_classes = np.unique(y_true).shape[0] + y_true_unique = np.unique(y_true) + n_classes = y_true_unique.shape[0] n_pairs = n_classes * (n_classes - 1) // 2 pair_scores = np.empty(n_pairs) @@ -178,7 +179,7 @@ def _average_multiclass_ovo_score( # Compute scores treating a as positive class and b as negative class, # then b as positive class and a as negative class - for ix, (a, b) in enumerate(combinations(range(n_classes), 2)): + for ix, (a, b) in enumerate(combinations(y_true_unique, 2)): a_mask = y_true == a b_mask = y_true == b ab_mask = np.logical_or(a_mask, b_mask) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index cfc5a6939f4c3..d8c63df85b31e 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -32,7 +32,7 @@ from ..utils.extmath import stable_cumsum from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning -from ..preprocessing import LabelBinarizer, label_binarize +from ..preprocessing import label_binarize from .base import _average_binary_score, _average_multiclass_ovo_score @@ -414,29 +414,19 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): raise ValueError("Parameter 'sample_weight' is not supported" " for multiclass one-vs-one ROC AUC." 
" 'sample_weight' must be None in this case.") - if labels is not None: - y_true_multiclass = np.empty_like(y_true, dtype=np.int32) - for i, label in enumerate(labels): - y_true_multiclass[y_true == label] = i - else: - _, y_true_multiclass = np.unique(y_true, return_inverse=True) - - # Hand & Till (2001) implementation + y_true_encoded = _encode_y_true_multiclass_ovo( + y_true, y_score, labels) + # Hand & Till (2001) implementation (ovo) return _average_multiclass_ovo_score( _binary_roc_auc_score, - y_true_multiclass, + y_true_encoded, y_score, average=average) else: # ovr is same as multi-label - # Order y_true by labels - lb = LabelBinarizer() - if labels is not None: - lb.fit(labels) - lb.classes_ = labels - else: - lb.fit(y_true) - y_true_multilabel = lb.transform(y_true) + if labels is None: + labels = np.unique(y_true) + y_true_multilabel = label_binarize(y_true, labels) return _average_binary_score( _binary_roc_auc_score, y_true_multilabel, y_score, average, sample_weight=sample_weight) @@ -452,6 +442,40 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): sample_weight=sample_weight) +def _encode_y_true_multiclass_ovo(y_true, y_score, labels): + """Encodes y_true for multiclass scoring where y_score is a probability + matrix + + Parameters + ---------- + y_true : numpy array, shape = (n_samples, ) + True multiclass labels + + y_score : numpy array, shape = (n_samples, n_classes) + Target scores corresponding to probability estimates of a sample + belonging to a particular class + + labels : array-like, shape = (n_classes, ) or None + List of labels to index ``y_score`` used. If ``None``, + the lexicon order of ``y_true`` is used to index ``y_score``. + + Returns + ------- + y_true_encoded : numpy array, shape = (n_samples, ) + Encoded y_true + """ + if labels is not None: + y_true_encoded = np.empty_like(y_true, dtype=np.int32) + for i, label in enumerate(labels): + y_true_encoded[y_true == label] = i + return y_true_encoded + + if np.issubdtype(y_true.dtype, np.integer): + return y_true + + return np.unique(y_true, return_inverse=True)[1] + + def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): """Calculate true and false positives per binary classification threshold. diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index a24a2e370d4ea..2dc1c5d8c4316 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -29,6 +29,7 @@ from sklearn.metrics import label_ranking_loss from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_curve +from sklearn.metrics.ranking import _encode_y_true_multiclass_ovo from sklearn.exceptions import UndefinedMetricWarning @@ -521,6 +522,59 @@ def test_multiclass_ovo_roc_auc_toydata(y_true, labels): average="weighted"), ovo_weighted_score) +@pytest.mark.parametrize( + "labels", [None, [0, 1, 2]]) +@pytest.mark.parametrize("multiclass", ["ovo"]) +def test_multiclass_ovo_roc_auc_toydata_binary(labels, multiclass): + y_true = np.array([0, 2, 0, 2]) + # Tests the one-vs-one multiclass ROC AUC algorithm for binary y_true + # + # on a small example, representative of an expected use case. + y_scores = np.array( + [[0.2, 0.0, 0.8], [0.6, 0.0, 0.4], [0.55, 0.0, 0.45], [0.4, 0.0, 0.6]]) + + # Used to compute the expected output. 
+ # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1, 0], [0.2, 0.6, 0.55, 0.4]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0, 1], [0.8, 0.4, 0.45, 0.6]) + ovo_score = (score_01 + score_10) / 2 + + assert_almost_equal( + roc_auc_score(y_true, y_scores, labels, multiclass=multiclass), + ovo_score) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + assert_almost_equal( + roc_auc_score( + y_true, + y_scores, + labels=labels, + multiclass=multiclass, + average="weighted"), ovo_score) + + +@pytest.mark.parametrize("y_true, y_true_encoded, labels", [ + (np.array([0, 1, 0, 2, 1]), np.array([0, 1, 0, 2, 1]), None), + (np.array([0, 2, 0, 2, 2]), np.array([0, 2, 0, 2, 2]), None), + (np.array(["a", "b", "a", "b", "b"]), np.array([0, 1, 0, 1, 1]), None), + (np.array(["a", "b", "a", "b", "c"]), np.array([0, 1, 0, 1, 2]), None), + (np.array([0, 1, 0, 2, 1]), np.array([2, 0, 2, 1, 0]), [1, 2, 0]), + (np.array([0, 2, 0, 2, 2]), np.array([2, 0, 2, 0, 0]), [2, 1, 0]), + (np.array(["a", "b", "a", "b", "b"]), + np.array([0, 2, 0, 2, 2]), ["a", "c", "b"]), + (np.array(["a", "b", "a", "b", "c"]), + np.array([1, 2, 1, 2, 0]), ["c", "a", "b"]), +]) +def test_encode_y_true_multiclass_ovo(y_true, y_true_encoded, labels): + y_score = check_random_state(404).rand(5, 3) + y_score = y_score / y_score.sum(axis=1, keepdims=True) + assert_almost_equal( + _encode_y_true_multiclass_ovo(y_true, y_score, labels), + y_true_encoded) + + @pytest.mark.parametrize( "y_true, labels", [(np.array([0, 1, 2, 2]), None), From 5a4eaf502df97c9e18b3090ba6375889b81ad494 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 6 Feb 2019 20:08:15 -0500 Subject: [PATCH 61/93] Trigger CI From 8c00a1fd7439081cb5368e0e4870fbdeabcd993f Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Mon, 18 Feb 2019 13:24:51 -0500 Subject: [PATCH 62/93] RF Address comments --- examples/model_selection/plot_roc.py | 4 +- sklearn/metrics/ranking.py | 118 ++++++++++++++------------- sklearn/metrics/tests/test_common.py | 2 +- 3 files changed, 64 insertions(+), 60 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index a7448a3ab58ce..c1883a41f42ef 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -60,8 +60,8 @@ ############################################################################### # Compute the AUC scores # ...................... -# The ROC area can be approximated by taking the average either weighted -# uniformly or by the priori class distribution. +# The ROC area can be approximated by taking the average either unweighted +# or weighted by the support (the number of true instances for each label). from sklearn.metrics import roc_auc_score y_score_norm = y_score / y_score.sum(1, keepdims=True) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index d8c63df85b31e..c0aef342ff4ba 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -373,63 +373,10 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): if y_type == "multiclass" or (y_type == "binary" and y_score.ndim == 2 and y_score.shape[1] > 2): - # validation of the input y_score - if not np.allclose(1, y_score.sum(axis=1)): - raise ValueError( - "Target scores need to be probabilities for multiclass " - "roc_auc, i.e. 
they should sum up to 1.0 over classes.") - - # do not support partial ROC computation for multiclass - if max_fpr is not None and max_fpr != 1.: - raise ValueError("Partial AUC computation not available in " - "multiclass setting. Parameter 'max_fpr' must be" - " set to `None`. Received `max_fpr={0}` " - "instead.".format(max_fpr)) - - # validation for multiclass parameter specifications - average_options = ("macro", "weighted") - if average not in average_options: - raise ValueError("Parameter 'average' must be one of {0} for" - " multiclass problems.".format(average_options)) - multiclass_options = ("ovo", "ovr") - if multiclass not in multiclass_options: - raise ValueError("Parameter multiclass='{0}' is not supported" - " for multiclass ROC AUC. 'multiclass' must be" - " one of {1}.".format( - multiclass, multiclass_options)) - if labels is not None: - unique_labels = np.unique(labels) - if len(unique_labels) != len(labels): - raise ValueError("Parameter 'labels' must be unique") - if len(unique_labels) != y_score.shape[1]: - raise ValueError( - "Number of given labels, {0}, not equal to the number " - "of columns in 'y_score', {1}".format( - len(unique_labels), y_score.shape[1])) - if set(np.unique(y_true)) > set(unique_labels): - raise ValueError( - "'y_true' contains labels not in parameter 'labels'") - if multiclass == "ovo": - if sample_weight is not None: - raise ValueError("Parameter 'sample_weight' is not supported" - " for multiclass one-vs-one ROC AUC." - " 'sample_weight' must be None in this case.") - y_true_encoded = _encode_y_true_multiclass_ovo( - y_true, y_score, labels) - # Hand & Till (2001) implementation (ovo) - return _average_multiclass_ovo_score( - _binary_roc_auc_score, - y_true_encoded, - y_score, - average=average) - else: - # ovr is same as multi-label - if labels is None: - labels = np.unique(y_true) - y_true_multilabel = label_binarize(y_true, labels) - return _average_binary_score( - _binary_roc_auc_score, y_true_multilabel, y_score, average, - sample_weight=sample_weight) + return _multiclass_roc_auc_score(_binary_roc_auc_score, + y_true, y_score, labels, + multiclass, average, sample_weight, + max_fpr) elif y_type == "binary": labels = np.unique(y_true) y_true = label_binarize(y_true, labels)[:, 0] @@ -442,6 +389,63 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): sample_weight=sample_weight) +def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, + multiclass, average, sample_weight, max_fpr): + # validation of the input y_score + if not np.allclose(1, y_score.sum(axis=1)): + raise ValueError( + "Target scores need to be probabilities for multiclass " + "roc_auc, i.e. they should sum up to 1.0 over classes.") + + # do not support partial ROC computation for multiclass + if max_fpr is not None and max_fpr != 1.: + raise ValueError("Partial AUC computation not available in " + "multiclass setting. Parameter 'max_fpr' must be" + " set to `None`. Received `max_fpr={0}` " + "instead.".format(max_fpr)) + + # validation for multiclass parameter specifications + average_options = ("macro", "weighted") + if average not in average_options: + raise ValueError("Parameter 'average' must be one of {0} for" + " multiclass problems.".format(average_options)) + multiclass_options = ("ovo", "ovr") + if multiclass not in multiclass_options: + raise ValueError("Parameter multiclass='{0}' is not supported" + " for multiclass ROC AUC. 
'multiclass' must be" + " one of {1}.".format( + multiclass, multiclass_options)) + if labels is not None: + unique_labels = np.unique(labels) + if len(unique_labels) != len(labels): + raise ValueError("Parameter 'labels' must be unique") + if len(unique_labels) != y_score.shape[1]: + raise ValueError( + "Number of given labels, {0}, not equal to the number " + "of columns in 'y_score', {1}".format( + len(unique_labels), y_score.shape[1])) + if set(np.unique(y_true)) > set(unique_labels): + raise ValueError( + "'y_true' contains labels not in parameter 'labels'") + if multiclass == "ovo": + if sample_weight is not None: + raise ValueError("Parameter 'sample_weight' is not supported" + " for multiclass one-vs-one ROC AUC." + " 'sample_weight' must be None in this case.") + y_true_encoded = _encode_y_true_multiclass_ovo( + y_true, y_score, labels) + # Hand & Till (2001) implementation (ovo) + return _average_multiclass_ovo_score(binary_metric, y_true_encoded, + y_score, average=average) + else: + # ovr is same as multi-label + if labels is None: + labels = np.unique(y_true) + y_true_multilabel = label_binarize(y_true, labels) + return _average_binary_score(binary_metric, y_true_multilabel, y_score, + average, sample_weight=sample_weight) + + def _encode_y_true_multiclass_ovo(y_true, y_score, labels): """Encodes y_true for multiclass scoring where y_score is a probability matrix diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index ddf0d5a754629..8464d8f5cabf0 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1255,7 +1255,7 @@ def test_thresholded_multilabel_multioutput_permutations_invariance(name): @pytest.mark.parametrize( 'name', set(THRESHOLDED_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) -def test_multiclass_score_permutation_invariance(name): +def test_thresdhold_metric_is_permutation_invariance(name): n_samples, n_classes = 100, 3 random_state = check_random_state(0) From 2da0866ac6def17dcc221b197d77a8515fe69b2b Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Mon, 18 Feb 2019 15:07:17 -0500 Subject: [PATCH 63/93] RF Address comments --- examples/model_selection/plot_roc.py | 2 +- sklearn/metrics/tests/test_common.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index c1883a41f42ef..01c8bd334ab65 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -64,7 +64,7 @@ # or weighted by the support (the number of true instances for each label). 
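# Editorial sketch, not part of the example (hypothetical numbers), of the
# distinction drawn in the rewritten comment above:
import numpy as np
per_class_auc = np.array([0.95, 0.80, 0.70])  # hypothetical per-class OvR AUCs
support = np.array([50, 30, 20])              # hypothetical true instances per class
macro_avg = per_class_auc.mean()                           # average="macro"
weighted_avg = np.average(per_class_auc, weights=support)  # average="weighted"
# The normalization change just below also switches to a softmax, presumably
# because decision_function outputs can be negative; the softmax maps them to
# probabilities that sum to 1, as the multiclass roc_auc_score check requires.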
from sklearn.metrics import roc_auc_score -y_score_norm = y_score / y_score.sum(1, keepdims=True) +y_score_norm = np.exp(y_score)/np.exp(y_score).sum(axis=-1, keepdims=True) unweighted_roc_auc_ovr = roc_auc_score(y_test, y_score_norm, multiclass="ovr") weighted_roc_auc_ovr = roc_auc_score( y_test, y_score_norm, multiclass="ovr", average="weighted") diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index b3be952e92b67..924511ca4c0be 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1260,7 +1260,7 @@ def test_thresdhold_metric_is_permutation_invariance(name): random_state = check_random_state(0) y_score = random_state.rand(n_samples, n_classes) - y_score = y_score / y_score.sum(axis=1, keepdims=True) + y_score = np.exp(y_score)/np.exp(y_score).sum(axis=-1, keepdims=True) y_true = random_state.randint(0, n_classes, size=n_samples) metric = ALL_METRICS[name] From 66b66905145a546969148255a9af3219e7c4e722 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Mon, 18 Feb 2019 15:13:55 -0500 Subject: [PATCH 64/93] RF Uses _encode_python --- sklearn/metrics/ranking.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index c0aef342ff4ba..a5c063396707a 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -33,6 +33,7 @@ from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning from ..preprocessing import label_binarize +from ..preprocessing.label import _encode_python from .base import _average_binary_score, _average_multiclass_ovo_score @@ -427,6 +428,7 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, if set(np.unique(y_true)) > set(unique_labels): raise ValueError( "'y_true' contains labels not in parameter 'labels'") + if multiclass == "ovo": if sample_weight is not None: raise ValueError("Parameter 'sample_weight' is not supported" @@ -469,9 +471,9 @@ def _encode_y_true_multiclass_ovo(y_true, y_score, labels): Encoded y_true """ if labels is not None: - y_true_encoded = np.empty_like(y_true, dtype=np.int32) - for i, label in enumerate(labels): - y_true_encoded[y_true == label] = i + _, y_true_encoded = _encode_python(y_true, + uniques=np.array(labels), + encode=True) return y_true_encoded if np.issubdtype(y_true.dtype, np.integer): From 870941e14399410b3acca2d0e8ee610ef7117104 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Mon, 18 Feb 2019 15:17:13 -0500 Subject: [PATCH 65/93] RF Adds comments --- sklearn/metrics/ranking.py | 65 ++++++++++++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 9 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index a5c063396707a..97ec3ca42d24a 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -374,10 +374,15 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): if y_type == "multiclass" or (y_type == "binary" and y_score.ndim == 2 and y_score.shape[1] > 2): + # do not support partial ROC computation for multiclass + if max_fpr is not None and max_fpr != 1.: + raise ValueError("Partial AUC computation not available in " + "multiclass setting. Parameter 'max_fpr' must be" + " set to `None`. 
Received `max_fpr={0}` " + "instead.".format(max_fpr)) return _multiclass_roc_auc_score(_binary_roc_auc_score, y_true, y_score, labels, - multiclass, average, sample_weight, - max_fpr) + multiclass, average, sample_weight) elif y_type == "binary": labels = np.unique(y_true) y_true = label_binarize(y_true, labels)[:, 0] @@ -391,19 +396,61 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, - multiclass, average, sample_weight, max_fpr): + multiclass, average, sample_weight): + """Multiclass roc auc score + + Parameters + ---------- + binary_metric : callable + The binary metric function to use that accepts the following as input + y_true_target : array, shape = [n_samples_target] + Some sub-array of y_true for a pair of classes designated + positive and negative in the one-vs-one scheme. + y_score_target : array, shape = [n_samples_target] + Scores corresponding to the probability estimates + of a sample belonging to the designated positive class label + + y_true : array-like, shape = (n_samples, ) + True multiclass labels. + + y_score : array-like, shape = (n_samples, n_classes) + Target scores corresponding to probability estimates of a sample + belonging to a particular class + + labels : array, shape = [n_classes] or None, optional (default=None) + List of labels to index ``y_score`` used for multiclass. If ``None``, + the lexicon order of ``y_true`` is used to index ``y_score``. + + multiclass : string, 'ovr' or 'ovo', optional(default='ovr') + Determines the type of multiclass configuration to use. + ``'ovr'``: + Calculate metrics for the multiclass case using the one-vs-rest + approach. + ``'ovo'``: + Calculate metrics for the multiclass case using the one-vs-one + approach. + + average : 'macro' or 'weighted', optional (default='macro') + Determines the type of averaging performed on the pairwise binary + metric scores + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the + prevalence of the classes. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + """ # validation of the input y_score if not np.allclose(1, y_score.sum(axis=1)): raise ValueError( "Target scores need to be probabilities for multiclass " "roc_auc, i.e. they should sum up to 1.0 over classes.") - # do not support partial ROC computation for multiclass - if max_fpr is not None and max_fpr != 1.: - raise ValueError("Partial AUC computation not available in " - "multiclass setting. Parameter 'max_fpr' must be" - " set to `None`. Received `max_fpr={0}` " - "instead.".format(max_fpr)) # validation for multiclass parameter specifications average_options = ("macro", "weighted") From 9b8a8435ffbba457166f8753c64caa69cd9868d0 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Mon, 18 Feb 2019 15:33:16 -0500 Subject: [PATCH 66/93] STY flake8 --- sklearn/metrics/ranking.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 97ec3ca42d24a..09eeec3b367a4 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -451,7 +451,6 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, "Target scores need to be probabilities for multiclass " "roc_auc, i.e. 
they should sum up to 1.0 over classes.") - # validation for multiclass parameter specifications average_options = ("macro", "weighted") if average not in average_options: From 8c8f1de2cb197bf4083f04e0e3a7930e470f2eca Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 19 Feb 2019 09:52:03 -0500 Subject: [PATCH 67/93] RFC Address comments --- examples/model_selection/plot_roc.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 01c8bd334ab65..995fe58984ec5 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -60,8 +60,8 @@ ############################################################################### # Compute the AUC scores # ...................... -# The ROC area can be approximated by taking the average either unweighted -# or weighted by the support (the number of true instances for each label). +# The ROC area can be calculated by taking the average either unweighted +# or weighted by the number of true instances for each label. from sklearn.metrics import roc_auc_score y_score_norm = np.exp(y_score)/np.exp(y_score).sum(axis=-1, keepdims=True) @@ -74,7 +74,7 @@ ############################################################################### # Plotting the ROC curve for virginica # .................................... -# One can draw a ROC curve by considering each element of the label indicator +# A ROC curve is drawn by considering each element of the label indicator # matrix as a binary prediction (micro-averaging). In the following, the ROC # curve for virginica is drawn. import matplotlib.pyplot as plt @@ -115,9 +115,8 @@ ############################################################################### # Plot ROC curves for the multiclass problem using One-vs-Rest # ............................................................ -# Another evaluation measure for one-vs-rest multi-class classification is -# macro-averaging, which gives equal weight to the classification of each -# label. +# A ROC curve is drawn using macro-averaging, which gives equal weight to the +# classification of each label. from itertools import cycle from scipy import interp @@ -170,8 +169,8 @@ # # Compute the AUC score # ..................... -# The ROC area can be approximated by taking the average either weighted -# uniformly (macro) or by prevalence. +# The ROC area can be calculated by taking the average either unweighted +# or weighted by the number of true instances for each label. macro_roc_auc_ovo = roc_auc_score( y_test, y_score_norm, multiclass="ovo", average="macro") weighted_roc_auc_ovo = roc_auc_score( From c7d686d931fad636f2d1ade05224a025301fb78a Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 4 Apr 2019 12:39:37 -0400 Subject: [PATCH 68/93] CLN Address comments --- doc/modules/model_evaluation.rst | 7 ++++++- sklearn/metrics/ranking.py | 31 +++++++++++++++---------------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index ecf2b11f9809d..db3e7c3f36f72 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1347,7 +1347,7 @@ uniformly: \text{AUC}(k | j)) where :math:`c` is the number of classes and :math:`\text{AUC}(j | k)` is the -auc with class :math:`j` as the positive class and class :math:`k` as the +AUC with class :math:`j` as the positive class and class :math:`k` as the negative class. 
In general, :math:`\text{AUC}(j | k) \neq \text{AUC}(k | j))` in the multiclass case. This algorithm is used by setting the keyword argument ``multiclass`` @@ -1400,6 +1400,11 @@ to the given limit. `_ Machine learning, 45(2), pp.171-186. + .. [FC2009] Ferri, Cèsar & Hernandez-Orallo, Jose & Modroiu, R. (2009). + An Experimental Comparison of Performance Measures for Classification. + `_ + Pattern Recognition Letters. 30. 27-38. + .. _zero_one_loss: Zero one loss diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 543f2b9d98cae..1139b925252ca 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -241,9 +241,8 @@ def _binary_uninterpolated_average_precision( average, sample_weight=sample_weight) -def roc_auc_score(y_true, y_score, labels=None, - multiclass="ovr", average="macro", - sample_weight=None, max_fpr=None): +def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, + max_fpr=None, multiclass="ovr", labels=None): """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. @@ -267,19 +266,6 @@ def roc_auc_score(y_true, y_score, labels=None, label. The multiclass case expects shape = [n_samples, n_classes] where the scores correspond to probability estimates. - labels : array, shape = [n_classes] or None, optional (default=None) - List of labels to index ``y_score`` used for multiclass. If ``None``, - the lexicon order of ``y_true`` is used to index ``y_score``. - - multiclass : string, 'ovr' or 'ovo', optional(default='ovr') - Determines the type of multiclass configuration to use. - ``'ovr'``: - Calculate metrics for the multiclass case using the one-vs-rest - approach. - ``'ovo'``: - Calculate metrics for the multiclass case using the one-vs-one - approach. - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -309,6 +295,19 @@ def roc_auc_score(y_true, y_score, labels=None, should be either equal to ``None`` or ``1.0`` as AUC ROC partial computation currently is not supported for multiclass. + multiclass : string, 'ovr' or 'ovo', optional(default='ovr') + Determines the type of multiclass configuration to use. + ``'ovr'``: + Calculate metrics for the multiclass case using the one-vs-rest + approach. + ``'ovo'``: + Calculate metrics for the multiclass case using the one-vs-one + approach. + + labels : array, shape = [n_classes] or None, optional (default=None) + List of labels to index ``y_score`` used for multiclass. If ``None``, + the lexicon order of ``y_true`` is used to index ``y_score``. + Returns ------- auc : float From d96a7d99508bfb291067ab6e5781c58c0a2950f0 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 4 Apr 2019 12:40:00 -0400 Subject: [PATCH 69/93] CLN Style --- sklearn/metrics/base.py | 4 ++-- sklearn/metrics/ranking.py | 16 +++++----------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 02cfc9c6f05f7..288730354139c 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -126,8 +126,8 @@ def _average_binary_score(binary_metric, y_true, y_score, average, return score -def _average_multiclass_ovo_score( - binary_metric, y_true, y_score, average='macro'): +def _average_multiclass_ovo_score(binary_metric, y_true, y_score, + average='macro'): """Average one-versus-one scores for multiclass classification. 
Uses the binary metric for one-vs-one multiclass classification, diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 1139b925252ca..596172623ec56 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -479,8 +479,7 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, raise ValueError("Parameter 'sample_weight' is not supported" " for multiclass one-vs-one ROC AUC." " 'sample_weight' must be None in this case.") - y_true_encoded = _encode_y_true_multiclass_ovo( - y_true, y_score, labels) + y_true_encoded = _encode_y_true_multiclass_ovo(y_true, labels) # Hand & Till (2001) implementation (ovo) return _average_multiclass_ovo_score(binary_metric, y_true_encoded, y_score, average=average) @@ -493,22 +492,17 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, average, sample_weight=sample_weight) -def _encode_y_true_multiclass_ovo(y_true, y_score, labels): - """Encodes y_true for multiclass scoring where y_score is a probability - matrix +def _encode_y_true_multiclass_ovo(y_true, labels): + """Encodes y_true for multiclass ovo scoring Parameters ---------- y_true : numpy array, shape = (n_samples, ) True multiclass labels - y_score : numpy array, shape = (n_samples, n_classes) - Target scores corresponding to probability estimates of a sample - belonging to a particular class - labels : array-like, shape = (n_classes, ) or None - List of labels to index ``y_score`` used. If ``None``, - the lexicon order of ``y_true`` is used to index ``y_score``. + List of labels to index. If ``None``, the lexicon order of ``y_true`` + is used. Returns ------- From 40cc0a1551a760fe948ec90f25ef5060f14f0c28 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 4 Apr 2019 15:50:14 -0400 Subject: [PATCH 70/93] DOC Adds new example for mutliclass roc --- doc/modules/model_evaluation.rst | 3 + .../model_selection/plot_roc_multiclass.py | 150 ++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 examples/model_selection/plot_roc_multiclass.py diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index db3e7c3f36f72..d6c9eac97eaa2 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1393,6 +1393,9 @@ to the given limit. for an example of using ROC to model species distribution. + * See :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py` + for an example of using multiclass one-vs-one ROC. + .. topic:: References: .. [HT2001] Hand, D.J. and Till, R.J., 2001. `A simple generalisation diff --git a/examples/model_selection/plot_roc_multiclass.py b/examples/model_selection/plot_roc_multiclass.py new file mode 100644 index 0000000000000..fe42b0a1afb4e --- /dev/null +++ b/examples/model_selection/plot_roc_multiclass.py @@ -0,0 +1,150 @@ +""" +================================================== +Multiclass Receiver Operating Characteristic (ROC) +================================================== + +The multiclass One-vs-Rest scheme is functionally the same as the multilabel +case with one label per sample. See +:ref:`sphx_glr_auto_examples_model_selection_plot_roc.py` for an example +of multilabel roc. + +The mutliclass One-vs-One scheme compares every unique pairwise combination +of classes. In this example, we train a SVM classifier on the iris dataset and +calcuate the AUC using a macro average and a average weighted by the number of +true instances for each label combination. + +.. topic:: References: + + .. [HT2001] Hand, D.J. and Till, R.J., 2001. 
`A simple generalisation + of the area under the ROC curve for multiple class classification + problems. `_ + Machine learning, 45(2), pp.171-186. +""" +print(__doc__) + +############################################################################### +# Load iris dataset and train a SVC +# ................................. +from sklearn import svm, datasets +from sklearn.model_selection import train_test_split +from sklearn.multiclass import OneVsRestClassifier +import numpy as np + +# Import some data to play with +iris = datasets.load_iris() +X = iris.data +y = iris.target + +# Add noisy features to make the problem harder +random_state = np.random.RandomState(0) +n_samples, n_features = X.shape +X = np.c_[X, random_state.randn(n_samples, 200 * n_features)] + +# shuffle and split training and test sets +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, + random_state=0) + +# Learn to predict each class against the other +classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, + random_state=random_state)) +y_prob = classifier.fit(X_train, y_train).predict_proba(X_test) + +############################################################################### +# Compute the AUC score +# ..................... +# The ROC area can be calculated by taking the a macro average or an +# average weighted by the number of true instances for each label combination. +from sklearn.metrics import roc_auc_score + +macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multiclass="ovo", + average="macro") +weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multiclass="ovo", + average="weighted") +print("One-vs-One ROC AUC scores: {0} (uniform), {1} (weighted by prevalence)" + .format(macro_roc_auc_ovo, weighted_roc_auc_ovo)) + +############################################################################### +# Plot ROC curves for the multiclass problem using One-vs-One +# ........................................................... +# The ROC curve for every pair of classes are drawn together with the +# average weighted uniformly and weighted by prevalence. 
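# In this example, the "prevalence" of a pair of classes is the fraction of
# samples belonging to either class of the pair (np.average(ab_mask) below),
# so the prevalence-weighted average gives more weight to pairs drawn from
# the larger classes.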
+import matplotlib.pyplot as plt +from itertools import combinations, permutations +from sklearn.metrics import roc_curve, auc +from scipy import interp + +n_classes = len(np.unique(y)) + +fpr = dict() +tpr = dict() +roc_auc = dict() +prevalence = dict() +for a, b in combinations(range(n_classes), 2): + ab_mask = np.logical_or(y_test == a, y_test == b) + + # Compute ROC curve and ROC area with `a` as the positive class + fpr[(a, b)], tpr[(a, b)], _ = roc_curve( + y_test[ab_mask] == a, y_prob[ab_mask, a]) + roc_auc[(a, b)] = auc(fpr[(a, b)], tpr[(a, b)]) + prevalence[(a, b)] = np.average(ab_mask) + + # Compute ROC curve and ROC area with `b` as the positive class + fpr[(b, a)], tpr[(b, a)], _ = roc_curve( + y_test[ab_mask] == b, y_prob[ab_mask, b]) + roc_auc[(b, a)] = auc(fpr[(b, a)], tpr[(b, a)]) + prevalence[(b, a)] = np.average(ab_mask) + +class_permutations = list(permutations(range(n_classes), 2)) +all_multiclass_fpr = np.unique( + np.concatenate([fpr[(a, b)] for a, b in class_permutations])) + +multiclass_interp_tpr = dict() +for a, b in class_permutations: + multiclass_interp_tpr[(a, b)] = interp( + all_multiclass_fpr, fpr[(a, b)], tpr[(a, b)]) + +all_multiclass_tpr = np.array( + [multiclass_interp_tpr[(a, b)] for a, b in class_permutations]) +all_prevalence = np.array([prevalence[(a, b)] for a, b in class_permutations]) + +roc_auc_uniform_average_tpr = np.average(all_multiclass_tpr, axis=0) +roc_auc_prevalence_average_tpr = np.average( + all_multiclass_tpr, axis=0, weights=all_prevalence) + + +fig, ax = plt.subplots() +# plot roc curve as a macro average +ax.plot( + all_multiclass_fpr, + roc_auc_uniform_average_tpr, + color='navy', + linestyle=':', + lw=4, + label='macro average (area = {0:0.2f})'.format( + macro_roc_auc_ovo), +) +# plot roc curve as a weighted average +ax.plot( + all_multiclass_fpr, + roc_auc_prevalence_average_tpr, + color='deeppink', + linestyle=':', + lw=4, + label='weighted average (area = {0:0.2f})'.format( + weighted_roc_auc_ovo), +) + +# plot roc curve for every of classes +for a, b in permutations(range(n_classes), 2): + ax.plot( + fpr[(a, b)], tpr[(a, b)], + lw=2, label='class {0} vs. {1} ' + '(area = {2:0.2f})'.format(a, b, roc_auc[(a, b)])) +ax.plot([0, 1], [0, 1], 'k--', lw=2) +ax.set_xlim([0.0, 1.0]) +ax.set_ylim([0.0, 1.05]) +ax.set_xlabel('False Positive Rate') +ax.set_ylabel('True Positive Rate') +ax.set_title('An extension of ROC to multi-class using One-vs-One') +ax.legend(loc="lower right") +fig.show() From 551c32a4b359a24bad3ea40da780b8ae5aa772dd Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 4 Apr 2019 16:50:17 -0400 Subject: [PATCH 71/93] DOC Updates example --- examples/model_selection/plot_roc.py | 239 +++++------------- .../model_selection/plot_roc_multiclass.py | 58 ++--- 2 files changed, 80 insertions(+), 217 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 995fe58984ec5..475d7b4aba7a6 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -15,34 +15,48 @@ The "steepness" of ROC curves is also important, since it is ideal to maximize the true positive rate while minimizing the false positive rate. +Multiclass settings +------------------- + ROC curves are typically used in binary classification to study the output of -a classifier. The ROC curve and ROC area can be extended to multi-class or -multi-label classification by using the One-vs-Rest or One-vs-One scheme. +a classifier. 
In order to extend ROC curve and ROC area to multi-class +or multi-label classification, it is necessary to binarize the output. One ROC +curve can be drawn per label, but one can also draw a ROC curve by considering +each element of the label indicator matrix as a binary prediction +(micro-averaging). + +Another evaluation measure for multi-class classification is +macro-averaging, which gives equal weight to the classification of each +label. .. note:: See also :func:`sklearn.metrics.roc_auc_score`, :ref:`sphx_glr_auto_examples_model_selection_plot_roc_crossval.py`. + """ print(__doc__) -############################################################################### -# One-vs-Rest -# ----------- -# The One-vs-Rest scheme compares the each class against all of the other -# classes ("the rest"). -# -# Load iris dataset and train a SVC -# ................................. + +import numpy as np +import matplotlib.pyplot as plt +from itertools import cycle + from sklearn import svm, datasets +from sklearn.metrics import roc_curve, auc from sklearn.model_selection import train_test_split +from sklearn.preprocessing import label_binarize from sklearn.multiclass import OneVsRestClassifier -import numpy as np +from scipy import interp # Import some data to play with iris = datasets.load_iris() X = iris.data y = iris.target +# Binarize the output +y = label_binarize(y, classes=[0, 1, 2]) +n_classes = y.shape[1] + # Add noisy features to make the problem harder random_state = np.random.RandomState(0) n_samples, n_features = X.shape @@ -57,73 +71,44 @@ random_state=random_state)) y_score = classifier.fit(X_train, y_train).decision_function(X_test) -############################################################################### -# Compute the AUC scores -# ...................... -# The ROC area can be calculated by taking the average either unweighted -# or weighted by the number of true instances for each label. -from sklearn.metrics import roc_auc_score - -y_score_norm = np.exp(y_score)/np.exp(y_score).sum(axis=-1, keepdims=True) -unweighted_roc_auc_ovr = roc_auc_score(y_test, y_score_norm, multiclass="ovr") -weighted_roc_auc_ovr = roc_auc_score( - y_test, y_score_norm, multiclass="ovr", average="weighted") -print("One-vs-Rest ROC AUC scores: {0} (unweighted), {1} (weighted)".format( - unweighted_roc_auc_ovr, weighted_roc_auc_ovr)) - -############################################################################### -# Plotting the ROC curve for virginica -# .................................... -# A ROC curve is drawn by considering each element of the label indicator -# matrix as a binary prediction (micro-averaging). In the following, the ROC -# curve for virginica is drawn. 
-import matplotlib.pyplot as plt -from sklearn.preprocessing import label_binarize -from sklearn.metrics import roc_curve, auc - -# Binarize y_test to compute the ROC curve -classes = np.unique(y) -n_classes = len(classes) -y_test_binarized = label_binarize(y_test, classes=classes) - # Compute ROC curve and ROC area for each class fpr = dict() tpr = dict() roc_auc = dict() for i in range(n_classes): - fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_score[:, i]) + fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) roc_auc[i] = auc(fpr[i], tpr[i]) -fpr["micro"], tpr["micro"], _ = roc_curve( - y_test_binarized.ravel(), y_score.ravel()) +# Compute micro-average ROC curve and ROC area +fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) -fig, ax = plt.subplots() + +############################################################################## +# Plot of a ROC curve for a specific class +plt.figure() lw = 2 -ax.plot(fpr[2], tpr[2], color='darkorange', - lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2]) -ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') -ax.set_xlim([0.0, 1.0]) -ax.set_ylim([0.0, 1.05]) -ax.set_xlabel('False Positive Rate') -ax.set_ylabel('True Positive Rate') -ax.set_title('Receiver operating characteristic example for {}'.format( - iris.target_names[2])) -ax.legend(loc="lower right") -fig.show() +plt.plot(fpr[2], tpr[2], color='darkorange', + lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2]) +plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') +plt.xlim([0.0, 1.0]) +plt.ylim([0.0, 1.05]) +plt.xlabel('False Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('Receiver operating characteristic example') +plt.legend(loc="lower right") +plt.show() -############################################################################### -# Plot ROC curves for the multiclass problem using One-vs-Rest -# ............................................................ -# A ROC curve is drawn using macro-averaging, which gives equal weight to the -# classification of each label. 
-from itertools import cycle -from scipy import interp + +############################################################################## +# Plot ROC curves for the multiclass problem + +# Compute macro-average ROC curve and ROC area # First aggregate all false positive rates all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)])) -# Then interpolate all ROC curves at these points +# Then interpolate all ROC curves at this points mean_tpr = np.zeros_like(all_fpr) for i in range(n_classes): mean_tpr += interp(all_fpr, fpr[i], tpr[i]) @@ -136,16 +121,16 @@ roc_auc["macro"] = auc(fpr["macro"], tpr["macro"]) # Plot all ROC curves -fig, ax = plt.subplots() -ax.plot(fpr["micro"], tpr["micro"], - label='micro-average ROC curve (area = {0:0.2f})' - ''.format(roc_auc["micro"]), - color='deeppink', linestyle=':', linewidth=4) +plt.figure() +plt.plot(fpr["micro"], tpr["micro"], + label='micro-average ROC curve (area = {0:0.2f})' + ''.format(roc_auc["micro"]), + color='deeppink', linestyle=':', linewidth=4) -ax.plot(fpr["macro"], tpr["macro"], - label='macro-average ROC curve (area = {0:0.2f})' - ''.format(roc_auc["macro"]), - color='navy', linestyle=':', linewidth=4) +plt.plot(fpr["macro"], tpr["macro"], + label='macro-average ROC curve (area = {0:0.2f})' + ''.format(roc_auc["macro"]), + color='navy', linestyle=':', linewidth=4) colors = cycle(['aqua', 'darkorange', 'cornflowerblue']) for i, color in zip(range(n_classes), colors): @@ -153,105 +138,11 @@ label='ROC curve of class {0} (area = {1:0.2f})' ''.format(i, roc_auc[i])) -ax.plot([0, 1], [0, 1], 'k--', lw=lw) -ax.set_xlim([0.0, 1.0]) -ax.set_ylim([0.0, 1.05]) -ax.set_xlabel('False Positive Rate') -ax.set_ylabel('True Positive Rate') -ax.set_title('An extension of ROC to multi-class using One-vs-Rest') -ax.legend(loc="lower right") -fig.show() - -############################################################################### -# One-vs-One -# --------------------- -# The One-vs-One scheme compares every unique pairwise combination of classes. -# -# Compute the AUC score -# ..................... -# The ROC area can be calculated by taking the average either unweighted -# or weighted by the number of true instances for each label. -macro_roc_auc_ovo = roc_auc_score( - y_test, y_score_norm, multiclass="ovo", average="macro") -weighted_roc_auc_ovo = roc_auc_score( - y_test, y_score_norm, multiclass="ovo", average="weighted") -print("One-vs-One ROC AUC scores: {0} (uniform), {1} (weighted by prevalence)" - .format(macro_roc_auc_ovo, weighted_roc_auc_ovo)) - -############################################################################### -# Plot ROC curves for the multiclass problem using One-vs-One -# ........................................................... -# The ROC curve for every pair of classes are drawn together with the -# average weighted uniformly and weighted by prevalence. 
-from itertools import combinations, permutations - -prevalence = dict() -for a, b in combinations(range(n_classes), 2): - ab_mask = np.logical_or(y_test == a, y_test == b) - - # Compute ROC curve and ROC area with `a` as the positive class - fpr[(a, b)], tpr[(a, b)], _ = roc_curve( - y_test[ab_mask] == a, y_score[ab_mask, a]) - roc_auc[(a, b)] = auc(fpr[(a, b)], tpr[(a, b)]) - prevalence[(a, b)] = np.average(ab_mask) - - # Compute ROC curve and ROC area with `b` as the positive class - fpr[(b, a)], tpr[(b, a)], _ = roc_curve( - y_test[ab_mask] == b, y_score[ab_mask, b]) - roc_auc[(b, a)] = auc(fpr[(b, a)], tpr[(b, a)]) - prevalence[(b, a)] = np.average(ab_mask) - -class_permutations = list(permutations(range(n_classes), 2)) -all_multiclass_fpr = np.unique( - np.concatenate([fpr[(a, b)] for a, b in class_permutations])) - -multiclass_interp_tpr = dict() -for a, b in class_permutations: - multiclass_interp_tpr[(a, b)] = interp( - all_multiclass_fpr, fpr[(a, b)], tpr[(a, b)]) - -all_multiclass_tpr = np.array( - [multiclass_interp_tpr[(a, b)] for a, b in class_permutations]) -all_prevalence = np.array([prevalence[(a, b)] for a, b in class_permutations]) - -roc_auc_uniform_average_tpr = np.average(all_multiclass_tpr, axis=0) -roc_auc_prevalence_average_tpr = np.average( - all_multiclass_tpr, axis=0, weights=all_prevalence) - - -fig, ax = plt.subplots() -# plot roc curve as a macro average -ax.plot( - all_multiclass_fpr, - roc_auc_uniform_average_tpr, - color='navy', - linestyle=':', - lw=4, - label='macro average (area = {0:0.2f})'.format( - macro_roc_auc_ovo), -) -# plot roc curve as a weighted average -ax.plot( - all_multiclass_fpr, - roc_auc_prevalence_average_tpr, - color='deeppink', - linestyle=':', - lw=4, - label='weighted average (area = {0:0.2f})'.format( - weighted_roc_auc_ovo), -) - -# plot roc curve for every of classes -for a, b in permutations(range(n_classes), 2): - ax.plot( - fpr[(a, b)], tpr[(a, b)], - lw=lw, label='class {0} vs. {1} ' - '(area = {2:0.2f})'.format(a, b, roc_auc[(a, b)])) -ax.plot([0, 1], [0, 1], 'k--', lw=lw) -ax.set_xlim([0.0, 1.0]) -ax.set_ylim([0.0, 1.05]) -ax.set_xlabel('False Positive Rate') -ax.set_ylabel('True Positive Rate') -ax.set_title('An extension of ROC to multi-class using One-vs-One') -ax.legend(loc="lower right") -fig.show() +plt.plot([0, 1], [0, 1], 'k--', lw=lw) +plt.xlim([0.0, 1.0]) +plt.ylim([0.0, 1.05]) +plt.xlabel('False Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('Some extension of Receiver operating characteristic to multi-class') +plt.legend(loc="lower right") +plt.show() diff --git a/examples/model_selection/plot_roc_multiclass.py b/examples/model_selection/plot_roc_multiclass.py index fe42b0a1afb4e..7e7b05606d376 100644 --- a/examples/model_selection/plot_roc_multiclass.py +++ b/examples/model_selection/plot_roc_multiclass.py @@ -60,18 +60,16 @@ average="macro") weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multiclass="ovo", average="weighted") -print("One-vs-One ROC AUC scores: {0} (uniform), {1} (weighted by prevalence)" +print("One-vs-One ROC AUC scores:\n{:.6f} (uniform),\n{:.6f} " + "(weighted by prevalence)" .format(macro_roc_auc_ovo, weighted_roc_auc_ovo)) ############################################################################### -# Plot ROC curves for the multiclass problem using One-vs-One -# ........................................................... -# The ROC curve for every pair of classes are drawn together with the -# average weighted uniformly and weighted by prevalence. 
+# Manually calcuate the one-vs-one multiclass auc score +# ..................................................... import matplotlib.pyplot as plt from itertools import combinations, permutations from sklearn.metrics import roc_curve, auc -from scipy import interp n_classes = len(np.unique(y)) @@ -94,46 +92,20 @@ roc_auc[(b, a)] = auc(fpr[(b, a)], tpr[(b, a)]) prevalence[(b, a)] = np.average(ab_mask) -class_permutations = list(permutations(range(n_classes), 2)) -all_multiclass_fpr = np.unique( - np.concatenate([fpr[(a, b)] for a, b in class_permutations])) - -multiclass_interp_tpr = dict() -for a, b in class_permutations: - multiclass_interp_tpr[(a, b)] = interp( - all_multiclass_fpr, fpr[(a, b)], tpr[(a, b)]) - -all_multiclass_tpr = np.array( - [multiclass_interp_tpr[(a, b)] for a, b in class_permutations]) -all_prevalence = np.array([prevalence[(a, b)] for a, b in class_permutations]) - -roc_auc_uniform_average_tpr = np.average(all_multiclass_tpr, axis=0) -roc_auc_prevalence_average_tpr = np.average( - all_multiclass_tpr, axis=0, weights=all_prevalence) +roc_auc_values = np.fromiter(roc_auc.values(), dtype=np.float32) +prevalence_values = np.fromiter(prevalence.values(), dtype=np.float32) +macro_roc_auc_ovo_manual = np.average(roc_auc_values) +weighted_roc_auc_ovo_manual = np.average(roc_auc_values, + weights=prevalence_values) +print(("Manual One-vs-One ROC AUC scores: \n{:.6f} (uniform),\n{:.6f} " + "(weighted by prevalence)").format(macro_roc_auc_ovo_manual, + weighted_roc_auc_ovo_manual)) +############################################################################### +# Plot ROC curves for the multiclass problem using One-vs-One +# ........................................................... fig, ax = plt.subplots() -# plot roc curve as a macro average -ax.plot( - all_multiclass_fpr, - roc_auc_uniform_average_tpr, - color='navy', - linestyle=':', - lw=4, - label='macro average (area = {0:0.2f})'.format( - macro_roc_auc_ovo), -) -# plot roc curve as a weighted average -ax.plot( - all_multiclass_fpr, - roc_auc_prevalence_average_tpr, - color='deeppink', - linestyle=':', - lw=4, - label='weighted average (area = {0:0.2f})'.format( - weighted_roc_auc_ovo), -) - # plot roc curve for every of classes for a, b in permutations(range(n_classes), 2): ax.plot( From 3536851adf279302d7ab843a0f36726520c3cbdf Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 4 Apr 2019 17:02:26 -0400 Subject: [PATCH 72/93] CLN Uses softmax --- sklearn/metrics/tests/test_ranking.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 2dc1c5d8c4316..1355dc29ea810 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -7,6 +7,7 @@ from sklearn import datasets from sklearn import svm +from sklearn.utils.extmath import softmax from sklearn.datasets import make_multilabel_classification from sklearn.random_projection import sparse_random_matrix from sklearn.utils.validation import check_array, check_consistent_length @@ -568,10 +569,8 @@ def test_multiclass_ovo_roc_auc_toydata_binary(labels, multiclass): np.array([1, 2, 1, 2, 0]), ["c", "a", "b"]), ]) def test_encode_y_true_multiclass_ovo(y_true, y_true_encoded, labels): - y_score = check_random_state(404).rand(5, 3) - y_score = y_score / y_score.sum(axis=1, keepdims=True) assert_almost_equal( - _encode_y_true_multiclass_ovo(y_true, y_score, labels), + _encode_y_true_multiclass_ovo(y_true, labels), y_true_encoded) @@ 
-663,10 +662,10 @@ def test_roc_auc_score_multiclass_error(msg, kwargs): # is not defined. rng = check_random_state(404) y_score = rng.rand(20, 3) - y_score = y_score / y_score.sum(axis=1, keepdims=True) + y_prob = softmax(y_score) y_true = rng.randint(0, 3, size=20) with pytest.raises(ValueError, match=msg): - roc_auc_score(y_true, y_score, **kwargs) + roc_auc_score(y_true, y_prob, **kwargs) def test_auc_score_non_binary_class(): From e3c9e79561ed5cdb7aa4fb2fd0a65e29ef6e6256 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 4 Apr 2019 19:34:20 -0400 Subject: [PATCH 73/93] BUG Fix --- sklearn/metrics/tests/test_ranking.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 1355dc29ea810..d4d62ad46ed00 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -543,17 +543,13 @@ def test_multiclass_ovo_roc_auc_toydata_binary(labels, multiclass): ovo_score = (score_01 + score_10) / 2 assert_almost_equal( - roc_auc_score(y_true, y_scores, labels, multiclass=multiclass), + roc_auc_score(y_true, y_scores, labels=labels, multiclass=multiclass), ovo_score) # Weighted, one-vs-one multiclass ROC AUC algorithm assert_almost_equal( - roc_auc_score( - y_true, - y_scores, - labels=labels, - multiclass=multiclass, - average="weighted"), ovo_score) + roc_auc_score(y_true, y_scores, labels=labels, multiclass=multiclass, + average="weighted"), ovo_score) @pytest.mark.parametrize("y_true, y_true_encoded, labels", [ From 86286721b285c626098b2d8423a27de10683f7bb Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 17 Apr 2019 21:42:42 -0400 Subject: [PATCH 74/93] ENH Forces order for labels --- examples/model_selection/plot_roc.py | 13 ++--- .../model_selection/plot_roc_multiclass.py | 9 ++-- sklearn/metrics/ranking.py | 53 +++++-------------- sklearn/metrics/tests/test_ranking.py | 38 ++++--------- 4 files changed, 35 insertions(+), 78 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 475d7b4aba7a6..0d7cb96aed771 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -15,24 +15,25 @@ The "steepness" of ROC curves is also important, since it is ideal to maximize the true positive rate while minimizing the false positive rate. -Multiclass settings +Multi-label settings ------------------- ROC curves are typically used in binary classification to study the output of -a classifier. In order to extend ROC curve and ROC area to multi-class -or multi-label classification, it is necessary to binarize the output. One ROC +a classifier. In order to extend ROC curve and ROC area to multi-label +classification, it is necessary to binarize the output. One ROC curve can be drawn per label, but one can also draw a ROC curve by considering each element of the label indicator matrix as a binary prediction (micro-averaging). -Another evaluation measure for multi-class classification is +Another evaluation measure for multi-label classification is macro-averaging, which gives equal weight to the classification of each label. .. note:: See also :func:`sklearn.metrics.roc_auc_score`, - :ref:`sphx_glr_auto_examples_model_selection_plot_roc_crossval.py`. + :ref:`sphx_glr_auto_examples_model_selection_plot_roc_crossval.py`, + :ref:`sphx_glr_auto_examples_model_selection_plot_roc_multiclass.py`. 
""" print(__doc__) @@ -101,7 +102,7 @@ ############################################################################## -# Plot ROC curves for the multiclass problem +# Plot ROC curves for the multilabel problem # Compute macro-average ROC curve and ROC area diff --git a/examples/model_selection/plot_roc_multiclass.py b/examples/model_selection/plot_roc_multiclass.py index 7e7b05606d376..d4fc9be1fa14f 100644 --- a/examples/model_selection/plot_roc_multiclass.py +++ b/examples/model_selection/plot_roc_multiclass.py @@ -9,16 +9,17 @@ of multilabel roc. The mutliclass One-vs-One scheme compares every unique pairwise combination -of classes. In this example, we train a SVM classifier on the iris dataset and -calcuate the AUC using a macro average and a average weighted by the number of -true instances for each label combination. +of classes. [1]_ In this example, we train a SVM classifier on the iris dataset +and calcuate the AUC using a macro average and a average weighted by the number +of true instances for each label combination. .. topic:: References: - .. [HT2001] Hand, D.J. and Till, R.J., 2001. `A simple generalisation + .. [1] Hand, D.J. and Till, R.J., 2001. `A simple generalisation of the area under the ROC curve for multiple class classification problems. `_ Machine learning, 45(2), pp.171-186. + """ print(__doc__) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 596172623ec56..2ed49502929a3 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -33,7 +33,7 @@ from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning from ..preprocessing import label_binarize -from ..preprocessing.label import _encode_python +from ..preprocessing.label import _encode from .base import _average_binary_score, _average_multiclass_ovo_score @@ -461,66 +461,41 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, " for multiclass ROC AUC. 'multiclass' must be" " one of {1}.".format( multiclass, multiclass_options)) + if labels is not None: - unique_labels = np.unique(labels) - if len(unique_labels) != len(labels): + labels = column_or_1d(labels) + classes = _encode(labels) + if len(classes) != len(labels): raise ValueError("Parameter 'labels' must be unique") - if len(unique_labels) != y_score.shape[1]: + if not np.array_equal(classes, labels): + raise ValueError("Parameter 'labels' must be ordered") + if len(classes) != y_score.shape[1]: raise ValueError( "Number of given labels, {0}, not equal to the number " "of columns in 'y_score', {1}".format( - len(unique_labels), y_score.shape[1])) - if set(np.unique(y_true)) > set(unique_labels): + len(classes), y_score.shape[1])) + if len(np.setdiff1d(y_true, classes)): raise ValueError( "'y_true' contains labels not in parameter 'labels'") + else: + classes = _encode(y_true) if multiclass == "ovo": if sample_weight is not None: raise ValueError("Parameter 'sample_weight' is not supported" " for multiclass one-vs-one ROC AUC." 
" 'sample_weight' must be None in this case.") - y_true_encoded = _encode_y_true_multiclass_ovo(y_true, labels) + _, y_true_encoded = _encode(y_true, uniques=classes, encode=True) # Hand & Till (2001) implementation (ovo) return _average_multiclass_ovo_score(binary_metric, y_true_encoded, y_score, average=average) else: # ovr is same as multi-label - if labels is None: - labels = np.unique(y_true) - y_true_multilabel = label_binarize(y_true, labels) + y_true_multilabel = label_binarize(y_true, classes) return _average_binary_score(binary_metric, y_true_multilabel, y_score, average, sample_weight=sample_weight) -def _encode_y_true_multiclass_ovo(y_true, labels): - """Encodes y_true for multiclass ovo scoring - - Parameters - ---------- - y_true : numpy array, shape = (n_samples, ) - True multiclass labels - - labels : array-like, shape = (n_classes, ) or None - List of labels to index. If ``None``, the lexicon order of ``y_true`` - is used. - - Returns - ------- - y_true_encoded : numpy array, shape = (n_samples, ) - Encoded y_true - """ - if labels is not None: - _, y_true_encoded = _encode_python(y_true, - uniques=np.array(labels), - encode=True) - return y_true_encoded - - if np.issubdtype(y_true.dtype, np.integer): - return y_true - - return np.unique(y_true, return_inverse=True)[1] - - def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): """Calculate true and false positives per binary classification threshold. diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index d4d62ad46ed00..5a4bc863ca259 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -30,7 +30,6 @@ from sklearn.metrics import label_ranking_loss from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_curve -from sklearn.metrics.ranking import _encode_y_true_multiclass_ovo from sklearn.exceptions import UndefinedMetricWarning @@ -475,8 +474,7 @@ def test_deprecated_auc_reorder(): @pytest.mark.parametrize( "y_true, labels", [(np.array([0, 1, 0, 2]), None), - (["a", "b", "a", "c"], None), - (["c", "b", "c", "a"], ["c", "b", "a"])] + (["a", "b", "a", "c"], None)] ) def test_multiclass_ovo_roc_auc_toydata(y_true, labels): # Tests the one-vs-one multiclass ROC AUC algorithm @@ -523,10 +521,8 @@ def test_multiclass_ovo_roc_auc_toydata(y_true, labels): average="weighted"), ovo_weighted_score) -@pytest.mark.parametrize( - "labels", [None, [0, 1, 2]]) -@pytest.mark.parametrize("multiclass", ["ovo"]) -def test_multiclass_ovo_roc_auc_toydata_binary(labels, multiclass): +@pytest.mark.parametrize("labels", [None, [0, 1, 2]]) +def test_multiclass_ovo_roc_auc_toydata_binary(labels): y_true = np.array([0, 2, 0, 2]) # Tests the one-vs-one multiclass ROC AUC algorithm for binary y_true # @@ -543,39 +539,21 @@ def test_multiclass_ovo_roc_auc_toydata_binary(labels, multiclass): ovo_score = (score_01 + score_10) / 2 assert_almost_equal( - roc_auc_score(y_true, y_scores, labels=labels, multiclass=multiclass), + roc_auc_score(y_true, y_scores, labels=labels, multiclass='ovo'), ovo_score) # Weighted, one-vs-one multiclass ROC AUC algorithm assert_almost_equal( - roc_auc_score(y_true, y_scores, labels=labels, multiclass=multiclass, + roc_auc_score(y_true, y_scores, labels=labels, multiclass='ovo', average="weighted"), ovo_score) -@pytest.mark.parametrize("y_true, y_true_encoded, labels", [ - (np.array([0, 1, 0, 2, 1]), np.array([0, 1, 0, 2, 1]), None), - (np.array([0, 2, 0, 2, 2]), np.array([0, 2, 0, 2, 2]), None), - 
(np.array(["a", "b", "a", "b", "b"]), np.array([0, 1, 0, 1, 1]), None), - (np.array(["a", "b", "a", "b", "c"]), np.array([0, 1, 0, 1, 2]), None), - (np.array([0, 1, 0, 2, 1]), np.array([2, 0, 2, 1, 0]), [1, 2, 0]), - (np.array([0, 2, 0, 2, 2]), np.array([2, 0, 2, 0, 0]), [2, 1, 0]), - (np.array(["a", "b", "a", "b", "b"]), - np.array([0, 2, 0, 2, 2]), ["a", "c", "b"]), - (np.array(["a", "b", "a", "b", "c"]), - np.array([1, 2, 1, 2, 0]), ["c", "a", "b"]), -]) -def test_encode_y_true_multiclass_ovo(y_true, y_true_encoded, labels): - assert_almost_equal( - _encode_y_true_multiclass_ovo(y_true, labels), - y_true_encoded) - - @pytest.mark.parametrize( "y_true, labels", [(np.array([0, 1, 2, 2]), None), (["a", "b", "c", "c"], None), - (["c", "b", "a", "a"], ["c", "b", "a"]), - (["c", "a", "b", "b"], ["c", "a", "b"])]) + ([0, 1, 2, 2], [0, 1, 2]), + (["a", "b", "c", "c"], ["a", "b", "c"])]) def test_multiclass_ovr_roc_auc_toydata(y_true, labels): # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm # on a small example, representative of an expected use case. @@ -609,6 +587,8 @@ def test_multiclass_ovr_roc_auc_toydata(y_true, labels): [("Parameter 'labels' must be unique", np.array([0, 1, 2, 2]), [0, 2, 0]), ("Parameter 'labels' must be unique", np.array(["a", "b", "c", "c"]), ["a", "a", "b"]), + ("Parameter 'labels' must be ordered", np.array(["a", "b", "c", "c"]), + ["a", "c", "b"]), ("Number of given labels, 2, not equal to the number of columns in " "'y_score', 3", np.array([0, 1, 2, 2]), [0, 1]), From 24f7c9827943fe5321d1886b8e21705613d46027 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 17 Apr 2019 21:52:20 -0400 Subject: [PATCH 75/93] CLN Uses the word processing --- sklearn/metrics/ranking.py | 5 +++++ sklearn/metrics/tests/test_ranking.py | 14 +++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 2ed49502929a3..b267b863b0496 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -479,6 +479,10 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, "'y_true' contains labels not in parameter 'labels'") else: classes = _encode(y_true) + if len(classes) != y_score.shape[1]: + raise ValueError( + "Number of classes in y_true not equal to the number of " + "columns in 'y_score'") if multiclass == "ovo": if sample_weight is not None: @@ -486,6 +490,7 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, " for multiclass one-vs-one ROC AUC." 
" 'sample_weight' must be None in this case.") _, y_true_encoded = _encode(y_true, uniques=classes, encode=True) + print(classes, y_true_encoded) # Hand & Till (2001) implementation (ovo) return _average_multiclass_ovo_score(binary_metric, y_true_encoded, y_score, average=average) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 5a4bc863ca259..2d2dc6cc83154 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -1,4 +1,3 @@ - import pytest import numpy as np import warnings @@ -473,7 +472,9 @@ def test_deprecated_auc_reorder(): @pytest.mark.parametrize( "y_true, labels", - [(np.array([0, 1, 0, 2]), None), + [(np.array([0, 1, 0, 2]), [0, 1, 2]), + (np.array([0, 1, 0, 2]), None), + (["a", "b", "a", "c"], ["a", "b", "c"]), (["a", "b", "a", "c"], None)] ) def test_multiclass_ovo_roc_auc_toydata(y_true, labels): @@ -521,9 +522,10 @@ def test_multiclass_ovo_roc_auc_toydata(y_true, labels): average="weighted"), ovo_weighted_score) -@pytest.mark.parametrize("labels", [None, [0, 1, 2]]) -def test_multiclass_ovo_roc_auc_toydata_binary(labels): - y_true = np.array([0, 2, 0, 2]) +@pytest.mark.parametrize("y_true, labels", + [(np.array([0, 2, 0, 2]), [0, 1, 2]), + (np.array(['a', 'd', 'a', 'd']), ['a', 'b', 'd'])]) +def test_multiclass_ovo_roc_auc_toydata_binary(y_true, labels): # Tests the one-vs-one multiclass ROC AUC algorithm for binary y_true # # on a small example, representative of an expected use case. @@ -587,6 +589,8 @@ def test_multiclass_ovr_roc_auc_toydata(y_true, labels): [("Parameter 'labels' must be unique", np.array([0, 1, 2, 2]), [0, 2, 0]), ("Parameter 'labels' must be unique", np.array(["a", "b", "c", "c"]), ["a", "a", "b"]), + ("Number of classes in y_true not equal to the number of columns " + "in 'y_score'", np.array([0, 2, 0, 2]), None), ("Parameter 'labels' must be ordered", np.array(["a", "b", "c", "c"]), ["a", "c", "b"]), ("Number of given labels, 2, not equal to the number of columns in " From 3304b66fee715288ebf38ec94d0e7f8a2d5e45b9 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 31 May 2019 12:21:51 -0400 Subject: [PATCH 76/93] CLN Address comments --- examples/model_selection/plot_roc.py | 5 +-- .../model_selection/plot_roc_multiclass.py | 45 +++++++++++-------- sklearn/metrics/ranking.py | 1 - sklearn/metrics/tests/test_common.py | 2 +- 4 files changed, 28 insertions(+), 25 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 0d7cb96aed771..1dd13a9e7c70a 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -15,9 +15,6 @@ The "steepness" of ROC curves is also important, since it is ideal to maximize the true positive rate while minimizing the false positive rate. -Multi-label settings -------------------- - ROC curves are typically used in binary classification to study the output of a classifier. In order to extend ROC curve and ROC area to multi-label classification, it is necessary to binarize the output. One ROC @@ -103,7 +100,7 @@ ############################################################################## # Plot ROC curves for the multilabel problem - +# .......................................... 
# Compute macro-average ROC curve and ROC area # First aggregate all false positive rates diff --git a/examples/model_selection/plot_roc_multiclass.py b/examples/model_selection/plot_roc_multiclass.py index d4fc9be1fa14f..ed447db879c9c 100644 --- a/examples/model_selection/plot_roc_multiclass.py +++ b/examples/model_selection/plot_roc_multiclass.py @@ -6,12 +6,12 @@ The multiclass One-vs-Rest scheme is functionally the same as the multilabel case with one label per sample. See :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py` for an example -of multilabel roc. +of multilabel ROC. The mutliclass One-vs-One scheme compares every unique pairwise combination of classes. [1]_ In this example, we train a SVM classifier on the iris dataset -and calcuate the AUC using a macro average and a average weighted by the number -of true instances for each label combination. +and calcuate the AUC using the OvR and OvO schemes. We report a macro +average, and a prevalence-weighted average. .. topic:: References: @@ -53,20 +53,27 @@ ############################################################################### # Compute the AUC score # ..................... -# The ROC area can be calculated by taking the a macro average or an -# average weighted by the number of true instances for each label combination. +# The ROC area can be calculated by taking the a macro average and a +# prevalence-weighted average. from sklearn.metrics import roc_auc_score macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multiclass="ovo", average="macro") weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multiclass="ovo", average="weighted") -print("One-vs-One ROC AUC scores:\n{:.6f} (uniform),\n{:.6f} " +macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multiclass="ovr", + average="macro") +weighted_roc_auc_ovr = roc_auc_score(y_test, y_prob, multiclass="ovr", + average="weighted") +print("One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} " "(weighted by prevalence)" .format(macro_roc_auc_ovo, weighted_roc_auc_ovo)) +print("One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} " + "(weighted by prevalence)" + .format(macro_roc_auc_ovr, weighted_roc_auc_ovr)) ############################################################################### -# Manually calcuate the one-vs-one multiclass auc score +# Manually calcuate the one-vs-one multiclass AUC score # ..................................................... 
import matplotlib.pyplot as plt from itertools import combinations, permutations @@ -74,24 +81,24 @@ n_classes = len(np.unique(y)) -fpr = dict() -tpr = dict() -roc_auc = dict() -prevalence = dict() +fpr = {} +tpr = {} +roc_auc = {} +prevalence = {} for a, b in combinations(range(n_classes), 2): ab_mask = np.logical_or(y_test == a, y_test == b) # Compute ROC curve and ROC area with `a` as the positive class - fpr[(a, b)], tpr[(a, b)], _ = roc_curve( + fpr[a, b], tpr[a, b], _ = roc_curve( y_test[ab_mask] == a, y_prob[ab_mask, a]) - roc_auc[(a, b)] = auc(fpr[(a, b)], tpr[(a, b)]) - prevalence[(a, b)] = np.average(ab_mask) + roc_auc[a, b] = auc(fpr[a, b], tpr[a, b]) + prevalence[a, b] = np.average(ab_mask) # Compute ROC curve and ROC area with `b` as the positive class - fpr[(b, a)], tpr[(b, a)], _ = roc_curve( + fpr[b, a], tpr[b, a], _ = roc_curve( y_test[ab_mask] == b, y_prob[ab_mask, b]) - roc_auc[(b, a)] = auc(fpr[(b, a)], tpr[(b, a)]) - prevalence[(b, a)] = np.average(ab_mask) + roc_auc[b, a] = auc(fpr[b, a], tpr[b, a]) + prevalence[b, a] = np.average(ab_mask) roc_auc_values = np.fromiter(roc_auc.values(), dtype=np.float32) prevalence_values = np.fromiter(prevalence.values(), dtype=np.float32) @@ -99,7 +106,7 @@ macro_roc_auc_ovo_manual = np.average(roc_auc_values) weighted_roc_auc_ovo_manual = np.average(roc_auc_values, weights=prevalence_values) -print(("Manual One-vs-One ROC AUC scores: \n{:.6f} (uniform),\n{:.6f} " +print(("Manual One-vs-One ROC AUC scores: \n{:.6f} (macro),\n{:.6f} " "(weighted by prevalence)").format(macro_roc_auc_ovo_manual, weighted_roc_auc_ovo_manual)) @@ -110,7 +117,7 @@ # plot roc curve for every of classes for a, b in permutations(range(n_classes), 2): ax.plot( - fpr[(a, b)], tpr[(a, b)], + fpr[a, b], tpr[a, b], lw=2, label='class {0} vs. {1} ' '(area = {2:0.2f})'.format(a, b, roc_auc[(a, b)])) ax.plot([0, 1], [0, 1], 'k--', lw=2) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index b267b863b0496..97251f3894a84 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -490,7 +490,6 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, " for multiclass one-vs-one ROC AUC." 
" 'sample_weight' must be None in this case.") _, y_true_encoded = _encode(y_true, uniques=classes, encode=True) - print(classes, y_true_encoded) # Hand & Till (2001) implementation (ovo) return _average_multiclass_ovo_score(binary_metric, y_true_encoded, y_score, average=average) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 572fd9b66b0be..e1e4779bf07f9 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1285,7 +1285,7 @@ def test_thresholded_multilabel_multioutput_permutations_invariance(name): @pytest.mark.parametrize( 'name', set(THRESHOLDED_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) -def test_thresdhold_metric_is_permutation_invariance(name): +def test_thresdhold_metric_permutation_invariance(name): n_samples, n_classes = 100, 3 random_state = check_random_state(0) From 566f313888354888c0027758ae1ccfa22386447d Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 31 May 2019 12:23:27 -0400 Subject: [PATCH 77/93] REV Removes test --- sklearn/metrics/tests/test_ranking.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 84c7545fe3d04..14c2a5f3629d8 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -442,15 +442,6 @@ def test_auc_errors(): assert_raise_message(ValueError, error_message, auc, x, y) -def test_deprecated_auc_reorder(): - depr_message = ("The 'reorder' parameter has been deprecated in version " - "0.20 and will be removed in 0.22. It is recommended not " - "to set 'reorder' and ensure that x is monotonic " - "increasing or monotonic decreasing.") - assert_warns_message(DeprecationWarning, depr_message, auc, - [1, 2], [2, 3], reorder=True) - - @pytest.mark.parametrize( "y_true, labels", [(np.array([0, 1, 0, 2]), [0, 1, 2]), From 9acd61bcc539e7d2bbc4324c123ca17a8c6e5e80 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 31 May 2019 12:33:36 -0400 Subject: [PATCH 78/93] DOC Adds reference --- doc/modules/model_evaluation.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 163913cb62c73..468f8f6609089 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1361,7 +1361,8 @@ prevalence: where :math:`c` is the number of classes. This algorithm is used by setting the keyword argument ``multiclass`` to ``'ovo'`` and ``average`` to -``'weighted'``. +``'weighted'``. The ``'weighted'`` option returns a prevalence-weighted average +as described in [FC2009]_. **One-vs-rest Algorithm**: Computes the AUC of each class against the rest. The algorithm is functionally the same as the multilabel case. To enable this From 8f7c4ef5798a206ddd45ea6e565867a499ad5119 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 31 May 2019 12:45:32 -0400 Subject: [PATCH 79/93] BLD Trigger CI From 3fcf96fa02b4cc4652a57861f48a4f09050a17b0 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Fri, 31 May 2019 13:59:39 -0400 Subject: [PATCH 80/93] DOC Removes whats_new --- doc/whats_new/v0.21.rst | 4 ---- doc/whats_new/v0.22.rst | 6 ++++++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 2d42c13110c69..e6e0e6cf620dc 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -639,10 +639,6 @@ Support for Python 3.4 and below has been officially dropped. neighbors calculation.) 
:pr:`12568` by :user:`Wei Xue `, :user:`Emmanuel Arias ` and `Joel Nothman`_. -- |Feature| Added multiclass support to :func:`metrics.roc_auc_score`. - :issue:`12789` by :user:`Kathy Chen `, - :user:`Mohamed Maskani `, and :user:`Thomas Fan `. - - |Efficiency| Faster :func:`metrics.pairwise_distances` with `n_jobs` > 1 by using a thread-based backend, instead of process-based backends. :pr:`8216` by :user:`Pierre Glaser ` and diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index d82c305843e3c..8aa0744f4e3b7 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -55,6 +55,12 @@ Changelog of the maximization procedure in :term:`fit`. :pr:`13618` by :user:`Yoshihiro Uchida `. +:mod:`sklearn.metrics` +...................... + +- |Feature| Added multiclass support to :func:`metrics.roc_auc_score`. + :issue:`12789` by :user:`Kathy Chen `, + :user:`Mohamed Maskani `, and :user:`Thomas Fan `. :mod:`sklearn.svm` .................. From 1095d7f9e3123dec43218f3bd4e055c683f89c95 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sun, 16 Jun 2019 18:57:06 -0400 Subject: [PATCH 81/93] DOC Adds more references --- doc/modules/model_evaluation.rst | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 5f43ff90d9c4f..08f381cee79fa 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1356,7 +1356,9 @@ as described in [FC2009]_. **One-vs-rest Algorithm**: Computes the AUC of each class against the rest. The algorithm is functionally the same as the multilabel case. To enable this -algorithm set the keyword argument ``multiclass`` to ``'ovr'``. +algorithm set the keyword argument ``multiclass`` to ``'ovr'``. Similar to +OvO, OvR supports two types of averaging: ``'macro'`` [F2006]_ and +``'weighted'`` [F2001]_. In applications where a high false positive rate is not tolerable the parameter ``max_fpr`` of :func:`roc_auc_score` can be used to summarize the ROC curve up @@ -1387,16 +1389,26 @@ to the given limit. .. topic:: References: - .. [HT2001] Hand, D.J. and Till, R.J., 2001. `A simple generalisation + .. [HT2001] Hand, D.J. and Till, R.J., (2001). `A simple generalisation of the area under the ROC curve for multiple class classification problems. `_ Machine learning, 45(2), pp.171-186. .. [FC2009] Ferri, Cèsar & Hernandez-Orallo, Jose & Modroiu, R. (2009). - An Experimental Comparison of Performance Measures for Classification. + `An Experimental Comparison of Performance Measures for Classification. `_ Pattern Recognition Letters. 30. 27-38. + .. [F2006] Fawcett, T., 2006. `An introduction to ROC analysis. + `_ + Pattern Recognition Letters, 27(8), pp. 861-874. + + .. [F2001] Fawcett, T., 2001. `Using rule sets to maximize + ROC performance `_ + In Data Mining, 2001. + Proceedings IEEE International Conference, pp. 131-138. + + .. 
_zero_one_loss: Zero one loss From 95e25e980babb471d4c24ded4f6d3bf8bc7b475c Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sun, 16 Jun 2019 19:19:55 -0400 Subject: [PATCH 82/93] CLN Address comments --- doc/modules/model_evaluation.rst | 3 ++- sklearn/metrics/tests/test_common.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 08f381cee79fa..bc39b6562a968 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1323,7 +1323,8 @@ the one-vs-rest algorithm computes the average of the ROC AUC scores for each class against all other classes. In both cases, the predicted labels are provided in an array with values from 0 to ``n_classes``, and the scores correspond to the probability estimates that a sample belongs to a particular -class. +class. The OvO and OvR algorithms supports weighting uniformly +(``average='macro'``) and weighting by the prevalence (``average='weighted'``). **One-vs-one Algorithm**: Computes the average AUC of all possible pairwise combinations of classes. [HT2001]_ defines a multiclass AUC metric weighted diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 1998d5a38460f..cdf975af37a5a 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1188,7 +1188,9 @@ def test_multiclass_sample_weight_invariance(name): y_score = random_state.random_sample(size=(n_samples, 5)) metric = ALL_METRICS[name] if name in THRESHOLDED_METRICS: - y_score_norm = y_score / y_score.sum(1, keepdims=True) + # softmax + temp = np.exp(-y_score) + y_score_norm = temp / temp.sum(axis=-1).reshape(-1, 1) check_sample_weight_invariance(name, metric, y_true, y_score_norm) else: check_sample_weight_invariance(name, metric, y_true, y_pred) From 76036a7969362709f8c4a26b33bdefc209c0c03a Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sun, 16 Jun 2019 19:23:08 -0400 Subject: [PATCH 83/93] CLN Change order --- sklearn/metrics/scorer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index acab5f1269325..31c83b0264ee9 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -519,8 +519,8 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, neg_mean_squared_error=neg_mean_squared_error_scorer, neg_mean_squared_log_error=neg_mean_squared_log_error_scorer, accuracy=accuracy_scorer, roc_auc=roc_auc_scorer, - roc_auc_ovo=roc_auc_ovo_scorer, roc_auc_ovr=roc_auc_ovr_scorer, + roc_auc_ovo=roc_auc_ovo_scorer, balanced_accuracy=balanced_accuracy_scorer, average_precision=average_precision_scorer, neg_log_loss=neg_log_loss_scorer, From 146491d409abfaa03cf4a7ab5190bcbf1344c1e9 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sun, 16 Jun 2019 19:32:59 -0400 Subject: [PATCH 84/93] CLN Removes multiclass example --- doc/modules/model_evaluation.rst | 3 - examples/model_selection/plot_roc.py | 30 +++- .../model_selection/plot_roc_multiclass.py | 130 ------------------ 3 files changed, 28 insertions(+), 135 deletions(-) delete mode 100644 examples/model_selection/plot_roc_multiclass.py diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index bc39b6562a968..789ffa038f25d 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -1385,9 +1385,6 @@ to the given limit. for an example of using ROC to model species distribution. 
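The ``'roc_auc_ovr'`` and ``'roc_auc_ovo'`` scorer strings registered above
can be exercised through the usual ``scoring`` parameter (a minimal sketch
assuming those scorer entries; the classifier and dataset are arbitrary
choices)::

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_score

    X, y = load_iris(return_X_y=True)
    clf = LogisticRegression(max_iter=1000)

    # Both scorers need class-probability estimates from the estimator.
    print(cross_val_score(clf, X, y, cv=3, scoring="roc_auc_ovr").mean())
    print(cross_val_score(clf, X, y, cv=3, scoring="roc_auc_ovo").mean())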
-   * See :ref:`sphx_glr_auto_examples_model_selection_plot_roc.py`
-     for an example of using multiclass one-vs-one ROC.
-
 .. topic:: References:
 
     .. [HT2001] Hand, D.J. and Till, R.J., (2001). `A simple generalisation
diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py
index 1dd13a9e7c70a..1e26a491568c4 100644
--- a/examples/model_selection/plot_roc.py
+++ b/examples/model_selection/plot_roc.py
@@ -29,8 +29,7 @@
 .. note::
 
     See also :func:`sklearn.metrics.roc_auc_score`,
-             :ref:`sphx_glr_auto_examples_model_selection_plot_roc_crossval.py`,
-             :ref:`sphx_glr_auto_examples_model_selection_plot_roc_multiclass.py`.
+             :ref:`sphx_glr_auto_examples_model_selection_plot_roc_crossval.py`
 
 """
 print(__doc__)
@@ -45,6 +44,7 @@
 from sklearn.preprocessing import label_binarize
 from sklearn.multiclass import OneVsRestClassifier
 from scipy import interp
+from sklearn.metrics import roc_auc_score
 
 # Import some data to play with
 iris = datasets.load_iris()
@@ -144,3 +144,29 @@
 plt.title('Some extension of Receiver operating characteristic to multi-class')
 plt.legend(loc="lower right")
 plt.show()
+
+
+##############################################################################
+# Area under ROC for the multiclass problem
+# .........................................
+# The :func:`sklearn.metrics.roc_auc_score` function can be used for
+# multi-class classification. The multiclass One-vs-One scheme compares every
+# unique pairwise combination of classes. In this section, we calculate the AUC
+# using the OvR and OvO schemes. We report a macro average, and a
+# prevalence-weighted average.
+y_prob = classifier.predict_proba(X_test)
+
+macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multiclass="ovo",
+                                  average="macro")
+weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multiclass="ovo",
+                                     average="weighted")
+macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multiclass="ovr",
+                                  average="macro")
+weighted_roc_auc_ovr = roc_auc_score(y_test, y_prob, multiclass="ovr",
+                                     average="weighted")
+print("One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
+      "(weighted by prevalence)"
+      .format(macro_roc_auc_ovo, weighted_roc_auc_ovo))
+print("One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} "
+      "(weighted by prevalence)"
+      .format(macro_roc_auc_ovr, weighted_roc_auc_ovr))
diff --git a/examples/model_selection/plot_roc_multiclass.py b/examples/model_selection/plot_roc_multiclass.py
deleted file mode 100644
index ed447db879c9c..0000000000000
--- a/examples/model_selection/plot_roc_multiclass.py
+++ /dev/null
@@ -1,130 +0,0 @@
-"""
-==================================================
-Multiclass Receiver Operating Characteristic (ROC)
-==================================================
-
-The multiclass One-vs-Rest scheme is functionally the same as the multilabel
-case with one label per sample. See
-:ref:`sphx_glr_auto_examples_model_selection_plot_roc.py` for an example
-of multilabel ROC.
-
-The mutliclass One-vs-One scheme compares every unique pairwise combination
-of classes. [1]_ In this example, we train a SVM classifier on the iris dataset
-and calcuate the AUC using the OvR and OvO schemes. We report a macro
-average, and a prevalence-weighted average.
-
-.. topic:: References:
-
-   .. [1] Hand, D.J. and Till, R.J., 2001. `A simple generalisation
-      of the area under the ROC curve for multiple class classification
-      problems. `_
-      Machine learning, 45(2), pp.171-186.
- -""" -print(__doc__) - -############################################################################### -# Load iris dataset and train a SVC -# ................................. -from sklearn import svm, datasets -from sklearn.model_selection import train_test_split -from sklearn.multiclass import OneVsRestClassifier -import numpy as np - -# Import some data to play with -iris = datasets.load_iris() -X = iris.data -y = iris.target - -# Add noisy features to make the problem harder -random_state = np.random.RandomState(0) -n_samples, n_features = X.shape -X = np.c_[X, random_state.randn(n_samples, 200 * n_features)] - -# shuffle and split training and test sets -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, - random_state=0) - -# Learn to predict each class against the other -classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, - random_state=random_state)) -y_prob = classifier.fit(X_train, y_train).predict_proba(X_test) - -############################################################################### -# Compute the AUC score -# ..................... -# The ROC area can be calculated by taking the a macro average and a -# prevalence-weighted average. -from sklearn.metrics import roc_auc_score - -macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multiclass="ovo", - average="macro") -weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multiclass="ovo", - average="weighted") -macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multiclass="ovr", - average="macro") -weighted_roc_auc_ovr = roc_auc_score(y_test, y_prob, multiclass="ovr", - average="weighted") -print("One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} " - "(weighted by prevalence)" - .format(macro_roc_auc_ovo, weighted_roc_auc_ovo)) -print("One-vs-Rest ROC AUC scores:\n{:.6f} (macro),\n{:.6f} " - "(weighted by prevalence)" - .format(macro_roc_auc_ovr, weighted_roc_auc_ovr)) - -############################################################################### -# Manually calcuate the one-vs-one multiclass AUC score -# ..................................................... 
-import matplotlib.pyplot as plt -from itertools import combinations, permutations -from sklearn.metrics import roc_curve, auc - -n_classes = len(np.unique(y)) - -fpr = {} -tpr = {} -roc_auc = {} -prevalence = {} -for a, b in combinations(range(n_classes), 2): - ab_mask = np.logical_or(y_test == a, y_test == b) - - # Compute ROC curve and ROC area with `a` as the positive class - fpr[a, b], tpr[a, b], _ = roc_curve( - y_test[ab_mask] == a, y_prob[ab_mask, a]) - roc_auc[a, b] = auc(fpr[a, b], tpr[a, b]) - prevalence[a, b] = np.average(ab_mask) - - # Compute ROC curve and ROC area with `b` as the positive class - fpr[b, a], tpr[b, a], _ = roc_curve( - y_test[ab_mask] == b, y_prob[ab_mask, b]) - roc_auc[b, a] = auc(fpr[b, a], tpr[b, a]) - prevalence[b, a] = np.average(ab_mask) - -roc_auc_values = np.fromiter(roc_auc.values(), dtype=np.float32) -prevalence_values = np.fromiter(prevalence.values(), dtype=np.float32) - -macro_roc_auc_ovo_manual = np.average(roc_auc_values) -weighted_roc_auc_ovo_manual = np.average(roc_auc_values, - weights=prevalence_values) -print(("Manual One-vs-One ROC AUC scores: \n{:.6f} (macro),\n{:.6f} " - "(weighted by prevalence)").format(macro_roc_auc_ovo_manual, - weighted_roc_auc_ovo_manual)) - -############################################################################### -# Plot ROC curves for the multiclass problem using One-vs-One -# ........................................................... -fig, ax = plt.subplots() -# plot roc curve for every of classes -for a, b in permutations(range(n_classes), 2): - ax.plot( - fpr[a, b], tpr[a, b], - lw=2, label='class {0} vs. {1} ' - '(area = {2:0.2f})'.format(a, b, roc_auc[(a, b)])) -ax.plot([0, 1], [0, 1], 'k--', lw=2) -ax.set_xlim([0.0, 1.0]) -ax.set_ylim([0.0, 1.05]) -ax.set_xlabel('False Positive Rate') -ax.set_ylabel('True Positive Rate') -ax.set_title('An extension of ROC to multi-class using One-vs-One') -ax.legend(loc="lower right") -fig.show() From f01b435dcc444d6cc493c4f9451fc86ccb2fc2e7 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Sun, 16 Jun 2019 23:23:36 -0400 Subject: [PATCH 85/93] TST Pytest-dist ordering --- sklearn/metrics/tests/test_common.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index cdf975af37a5a..06010aa4124b8 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -1291,13 +1291,14 @@ def test_thresholded_multilabel_multioutput_permutations_invariance(name): @pytest.mark.parametrize( 'name', - set(THRESHOLDED_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) -def test_thresdhold_metric_permutation_invariance(name): + sorted(set(THRESHOLDED_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS)) +def test_thresholded_metric_permutation_invariance(name): n_samples, n_classes = 100, 3 random_state = check_random_state(0) y_score = random_state.rand(n_samples, n_classes) - y_score = np.exp(y_score)/np.exp(y_score).sum(axis=-1, keepdims=True) + temp = np.exp(-y_score) + y_score = temp / temp.sum(axis=-1).reshape(-1, 1) y_true = random_state.randint(0, n_classes, size=n_samples) metric = ALL_METRICS[name] From 89de04f198af835852b030565503940feb19644a Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Mon, 24 Jun 2019 14:17:48 -0400 Subject: [PATCH 86/93] DOC Spacing --- sklearn/metrics/ranking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index d0ee69529eda6..8b44622f4e55c 100644 --- 
a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -269,7 +269,7 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, should be either equal to ``None`` or ``1.0`` as AUC ROC partial computation currently is not supported for multiclass. - multiclass : string, 'ovr' or 'ovo', optional(default='ovr') + multiclass : string, 'ovr' or 'ovo', optional (default='ovr') Determines the type of multiclass configuration to use. ``'ovr'``: Calculate metrics for the multiclass case using the one-vs-rest From 11e87bb1dd92edab19256b165b9f8414c41151e1 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 2 Jul 2019 14:16:45 -0400 Subject: [PATCH 87/93] ENH Raises when multi_class is not specified --- examples/model_selection/plot_roc.py | 8 +++--- sklearn/metrics/ranking.py | 36 ++++++++++++++----------- sklearn/metrics/scorer.py | 4 +-- sklearn/metrics/tests/test_common.py | 4 +-- sklearn/metrics/tests/test_ranking.py | 39 ++++++++++++++------------- 5 files changed, 48 insertions(+), 43 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 1e26a491568c4..653c448d5cda4 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -156,13 +156,13 @@ # prevalence-weighted average. y_prob = classifier.predict_proba(X_test) -macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multiclass="ovo", +macro_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo", average="macro") -weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multiclass="ovo", +weighted_roc_auc_ovo = roc_auc_score(y_test, y_prob, multi_class="ovo", average="weighted") -macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multiclass="ovr", +macro_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr", average="macro") -weighted_roc_auc_ovr = roc_auc_score(y_test, y_prob, multiclass="ovr", +weighted_roc_auc_ovr = roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted") print("One-vs-One ROC AUC scores:\n{:.6f} (macro),\n{:.6f} " "(weighted by prevalence)" diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 8b44622f4e55c..402935b2e4f0f 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -216,7 +216,7 @@ def _binary_uninterpolated_average_precision( def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, - max_fpr=None, multiclass="ovr", labels=None): + max_fpr=None, multi_class="raise", labels=None): """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. @@ -269,8 +269,9 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, should be either equal to ``None`` or ``1.0`` as AUC ROC partial computation currently is not supported for multiclass. - multiclass : string, 'ovr' or 'ovo', optional (default='ovr') + multi_class : string, 'ovr' or 'ovo', optional (default='raise') Determines the type of multiclass configuration to use. + ``multi_class`` must be provided when ``y_true`` is multiclass. ``'ovr'``: Calculate metrics for the multiclass case using the one-vs-rest approach. @@ -350,12 +351,15 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): # do not support partial ROC computation for multiclass if max_fpr is not None and max_fpr != 1.: raise ValueError("Partial AUC computation not available in " - "multiclass setting. Parameter 'max_fpr' must be" - " set to `None`. 
Received `max_fpr={0}` " - "instead.".format(max_fpr)) + "multiclass setting, 'max_fpr' must be" + " set to `None`, received `max_fpr={0}` " + "instead".format(max_fpr)) + if multi_class == 'raise': + raise ValueError("Parameter multi_class must be one of " + "('ovo' or 'ovr')") return _multiclass_roc_auc_score(_binary_roc_auc_score, y_true, y_score, labels, - multiclass, average, sample_weight) + multi_class, average, sample_weight) elif y_type == "binary": labels = np.unique(y_true) y_true = label_binarize(y_true, labels)[:, 0] @@ -369,7 +373,7 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, - multiclass, average, sample_weight): + multi_class, average, sample_weight): """Multiclass roc auc score Parameters @@ -394,7 +398,7 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, List of labels to index ``y_score`` used for multiclass. If ``None``, the lexicon order of ``y_true`` is used to index ``y_score``. - multiclass : string, 'ovr' or 'ovo', optional(default='ovr') + multi_class : string, 'ovr' or 'ovo' Determines the type of multiclass configuration to use. ``'ovr'``: Calculate metrics for the multiclass case using the one-vs-rest @@ -422,19 +426,19 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, if not np.allclose(1, y_score.sum(axis=1)): raise ValueError( "Target scores need to be probabilities for multiclass " - "roc_auc, i.e. they should sum up to 1.0 over classes.") + "roc_auc, i.e. they should sum up to 1.0 over classes") # validation for multiclass parameter specifications average_options = ("macro", "weighted") if average not in average_options: raise ValueError("Parameter 'average' must be one of {0} for" - " multiclass problems.".format(average_options)) + " multiclass problems".format(average_options)) multiclass_options = ("ovo", "ovr") - if multiclass not in multiclass_options: - raise ValueError("Parameter multiclass='{0}' is not supported" - " for multiclass ROC AUC. 'multiclass' must be" - " one of {1}.".format( - multiclass, multiclass_options)) + if multi_class not in multiclass_options: + raise ValueError("Parameter multi_class='{0}' is not supported" + " for multiclass ROC AUC, multi_class must be" + " one of {1}".format( + multi_class, multiclass_options)) if labels is not None: labels = column_or_1d(labels) @@ -458,7 +462,7 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, "Number of classes in y_true not equal to the number of " "columns in 'y_score'") - if multiclass == "ovo": + if multi_class == "ovo": if sample_weight is not None: raise ValueError("Parameter 'sample_weight' is not supported" " for multiclass one-vs-one ROC AUC." 
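Not part of the patch series: a minimal sketch of the validation behaviour the ranking.py hunk above introduces, assuming this branch is installed; the toy labels and probabilities below are invented purely for illustration.

import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.array([0, 1, 2, 2])
y_prob = np.array([[0.7, 0.2, 0.1],
                   [0.2, 0.6, 0.2],
                   [0.1, 0.3, 0.6],
                   [0.2, 0.2, 0.6]])  # each row sums to 1, as the multiclass path requires

roc_auc_score(y_true, y_prob, multi_class="ovo", average="macro")     # valid
roc_auc_score(y_true, y_prob, multi_class="ovr", average="weighted")  # valid

# Each of the following raises a ValueError with one of the messages added above:
# roc_auc_score(y_true, y_prob, multi_class="ovp")               # unknown strategy
# roc_auc_score(y_true, y_prob, multi_class="ovo", max_fpr=0.5)  # partial AUC unsupported
# roc_auc_score(y_true, 2 * y_prob, multi_class="ovr")           # scores must sum to 1

The scorer.py hunk that follows updates the registered 'roc_auc_ovr' and 'roc_auc_ovo' scorers to pass the renamed keyword.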
diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index 31c83b0264ee9..c300d9e291bed 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -488,9 +488,9 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, average_precision_scorer = make_scorer(average_precision_score, needs_threshold=True) roc_auc_ovo_scorer = make_scorer(roc_auc_score, needs_threshold=True, - multiclass='ovo') + multi_class='ovo') roc_auc_ovr_scorer = make_scorer(roc_auc_score, needs_threshold=True, - multiclass='ovr') + multi_class='ovr') # Score function for probabilistic classification neg_log_loss_scorer = make_scorer(log_loss, greater_is_better=False, diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 12e76986f404b..03394c3d771b0 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -211,9 +211,9 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "weighted_roc_auc": partial(roc_auc_score, average="weighted"), "samples_roc_auc": partial(roc_auc_score, average="samples"), "micro_roc_auc": partial(roc_auc_score, average="micro"), - "ovo_roc_auc": partial(roc_auc_score, average="macro", multiclass='ovo'), + "ovo_roc_auc": partial(roc_auc_score, average="macro", multi_class='ovo'), "weighted_ovo_roc_auc": partial(roc_auc_score, average="weighted", - multiclass='ovo'), + multi_class='ovo'), "partial_roc_auc": partial(roc_auc_score, max_fpr=0.5), "average_precision_score": diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 2be0836c134d3..c26bec45f9d6f 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -475,7 +475,7 @@ def test_multiclass_ovo_roc_auc_toydata(y_true, labels): ovo_unweighted_score = ( average_score_01 + average_score_02 + average_score_12) / 3 assert_almost_equal( - roc_auc_score(y_true, y_scores, labels=labels, multiclass="ovo"), + roc_auc_score(y_true, y_scores, labels=labels, multi_class="ovo"), ovo_unweighted_score) # Weighted, one-vs-one multiclass ROC AUC algorithm @@ -488,7 +488,7 @@ def test_multiclass_ovo_roc_auc_toydata(y_true, labels): y_true, y_scores, labels=labels, - multiclass="ovo", + multi_class="ovo", average="weighted"), ovo_weighted_score) @@ -511,12 +511,12 @@ def test_multiclass_ovo_roc_auc_toydata_binary(y_true, labels): ovo_score = (score_01 + score_10) / 2 assert_almost_equal( - roc_auc_score(y_true, y_scores, labels=labels, multiclass='ovo'), + roc_auc_score(y_true, y_scores, labels=labels, multi_class='ovo'), ovo_score) # Weighted, one-vs-one multiclass ROC AUC algorithm assert_almost_equal( - roc_auc_score(y_true, y_scores, labels=labels, multiclass='ovo', + roc_auc_score(y_true, y_scores, labels=labels, multi_class='ovo', average="weighted"), ovo_score) @@ -539,7 +539,7 @@ def test_multiclass_ovr_roc_auc_toydata(y_true, labels): result_unweighted = (out_0 + out_1 + out_2) / 3. 
assert_almost_equal( - roc_auc_score(y_true, y_scores, multiclass="ovr", labels=labels), + roc_auc_score(y_true, y_scores, multi_class="ovr", labels=labels), result_unweighted) # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm @@ -549,7 +549,7 @@ def test_multiclass_ovr_roc_auc_toydata(y_true, labels): roc_auc_score( y_true, y_scores, - multiclass="ovr", + multi_class="ovr", labels=labels, average="weighted"), result_weighted) @@ -581,30 +581,31 @@ def test_multiclass_ovr_roc_auc_toydata(y_true, labels): np.array(["a", "b", "c", "d"]), ["a", "b", "c"]), ("'y_true' contains labels not in parameter 'labels'", np.array([0, 1, 2, 3]), [0, 1, 2])]) -@pytest.mark.parametrize("multiclass", ["ovo", "ovr"]) +@pytest.mark.parametrize("multi_class", ["ovo", "ovr"]) def test_roc_auc_score_multiclass_labels_error( - msg, y_true, labels, multiclass): + msg, y_true, labels, multi_class): y_scores = np.array( [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) with pytest.raises(ValueError, match=msg): - roc_auc_score(y_true, y_scores, labels=labels, multiclass=multiclass) + roc_auc_score(y_true, y_scores, labels=labels, multi_class=multi_class) @pytest.mark.parametrize("msg, kwargs", [ ((r"Parameter 'average' must be one of \('macro', 'weighted'\) for " - r"multiclass problems\."), {"average": "samples"}), + r"multiclass problems"), {"average": "samples", "multi_class": "ovo"}), ((r"Parameter 'average' must be one of \('macro', 'weighted'\) for " - r"multiclass problems\."), {"average": "micro"}), + r"multiclass problems"), {"average": "micro", "multi_class": "ovr"}), ((r"Parameter 'sample_weight' is not supported for multiclass one-vs-one " - r"ROC AUC. 'sample_weight' must be None in this case\."), - {"multiclass": "ovo", "sample_weight": []}), - ((r"Partial AUC computation not available in multiclass setting\. " - r"Parameter 'max_fpr' must be set to `None`. Received `max_fpr=0.5` " - r"instead\."), {"multiclass": "ovo", "max_fpr": 0.5}), - ((r"Parameter multiclass='ovp' is not supported for multiclass ROC AUC\. " - r"'multiclass' must be one of \('ovo', 'ovr'\)\."), - {"multiclass": "ovp"}) + r"ROC AUC. 
'sample_weight' must be None in this case"), + {"multi_class": "ovo", "sample_weight": []}), + ((r"Partial AUC computation not available in multiclass setting, " + r"'max_fpr' must be set to `None`, received `max_fpr=0.5` " + r"instead"), {"multi_class": "ovo", "max_fpr": 0.5}), + ((r"Parameter multi_class='ovp' is not supported for multiclass ROC AUC, " + r"multi_class must be one of \('ovo', 'ovr'\)"), + {"multi_class": "ovp"}), + (r"multi_class must be one of \('ovo' or 'ovr'\)", {}) ]) def test_roc_auc_score_multiclass_error(msg, kwargs): # Test that roc_auc_score function returns an error when trying From df7efe085d25364176e8cbf9f98e68b729caa91f Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 2 Jul 2019 19:16:44 -0400 Subject: [PATCH 88/93] REV Defaults to ovr --- sklearn/metrics/ranking.py | 8 ++------ sklearn/metrics/tests/test_ranking.py | 3 +-- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 402935b2e4f0f..69b5471885b2f 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -216,7 +216,7 @@ def _binary_uninterpolated_average_precision( def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, - max_fpr=None, multi_class="raise", labels=None): + max_fpr=None, multi_class="ovr", labels=None): """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. @@ -269,9 +269,8 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, should be either equal to ``None`` or ``1.0`` as AUC ROC partial computation currently is not supported for multiclass. - multi_class : string, 'ovr' or 'ovo', optional (default='raise') + multi_class : string, 'ovr' or 'ovo', optional (default='ovr') Determines the type of multiclass configuration to use. - ``multi_class`` must be provided when ``y_true`` is multiclass. ``'ovr'``: Calculate metrics for the multiclass case using the one-vs-rest approach. 
@@ -354,9 +353,6 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): "multiclass setting, 'max_fpr' must be" " set to `None`, received `max_fpr={0}` " "instead".format(max_fpr)) - if multi_class == 'raise': - raise ValueError("Parameter multi_class must be one of " - "('ovo' or 'ovr')") return _multiclass_roc_auc_score(_binary_roc_auc_score, y_true, y_score, labels, multi_class, average, sample_weight) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index c26bec45f9d6f..d856f263a102f 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -604,8 +604,7 @@ def test_roc_auc_score_multiclass_labels_error( r"instead"), {"multi_class": "ovo", "max_fpr": 0.5}), ((r"Parameter multi_class='ovp' is not supported for multiclass ROC AUC, " r"multi_class must be one of \('ovo', 'ovr'\)"), - {"multi_class": "ovp"}), - (r"multi_class must be one of \('ovo' or 'ovr'\)", {}) + {"multi_class": "ovp"}) ]) def test_roc_auc_score_multiclass_error(msg, kwargs): # Test that roc_auc_score function returns an error when trying From 0646612e48b63a78582f45b8f0168334e62c86ad Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 2 Jul 2019 19:36:48 -0400 Subject: [PATCH 89/93] STY Minor --- sklearn/metrics/ranking.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 69b5471885b2f..5d7cfe027092a 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -427,13 +427,13 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, # validation for multiclass parameter specifications average_options = ("macro", "weighted") if average not in average_options: - raise ValueError("Parameter 'average' must be one of {0} for" - " multiclass problems".format(average_options)) + raise ValueError("Parameter 'average' must be one of {0} for " + "multiclass problems".format(average_options)) multiclass_options = ("ovo", "ovr") if multi_class not in multiclass_options: - raise ValueError("Parameter multi_class='{0}' is not supported" - " for multiclass ROC AUC, multi_class must be" - " one of {1}".format( + raise ValueError("Parameter multi_class='{0}' is not supported " + "for multiclass ROC AUC, multi_class must be " + "one of {1}".format( multi_class, multiclass_options)) if labels is not None: From bfc73c95374e008dd87c62b8121ff1b9736be0a4 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Tue, 2 Jul 2019 19:54:59 -0400 Subject: [PATCH 90/93] TST roc_auc_score defaults to not support multiclass --- sklearn/metrics/ranking.py | 20 ++++++++++++-------- sklearn/metrics/tests/test_common.py | 5 +++++ sklearn/metrics/tests/test_ranking.py | 15 ++++++++------- 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 5d7cfe027092a..de21f19110405 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -216,7 +216,7 @@ def _binary_uninterpolated_average_precision( def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, - max_fpr=None, multi_class="ovr", labels=None): + max_fpr=None, multi_class="raise", labels=None): """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores. @@ -269,8 +269,9 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, should be either equal to ``None`` or ``1.0`` as AUC ROC partial computation currently is not supported for multiclass. 
- multi_class : string, 'ovr' or 'ovo', optional (default='ovr') + multi_class : string, 'ovr' or 'ovo', optional(default='raise') Determines the type of multiclass configuration to use. + ``multi_class`` must be provided when ``y_true`` is multiclass. ``'ovr'``: Calculate metrics for the multiclass case using the one-vs-rest approach. @@ -353,6 +354,8 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): "multiclass setting, 'max_fpr' must be" " set to `None`, received `max_fpr={0}` " "instead".format(max_fpr)) + if multi_class == 'raise': + raise ValueError("multi_class must be in ('ovo', 'ovr')") return _multiclass_roc_auc_score(_binary_roc_auc_score, y_true, y_score, labels, multi_class, average, sample_weight) @@ -427,13 +430,14 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, # validation for multiclass parameter specifications average_options = ("macro", "weighted") if average not in average_options: - raise ValueError("Parameter 'average' must be one of {0} for " + raise ValueError("average must be one of {0} for " "multiclass problems".format(average_options)) + multiclass_options = ("ovo", "ovr") if multi_class not in multiclass_options: - raise ValueError("Parameter multi_class='{0}' is not supported " + raise ValueError("multi_class='{0}' is not supported " "for multiclass ROC AUC, multi_class must be " - "one of {1}".format( + "in {1}".format( multi_class, multiclass_options)) if labels is not None: @@ -460,9 +464,9 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, if multi_class == "ovo": if sample_weight is not None: - raise ValueError("Parameter 'sample_weight' is not supported" - " for multiclass one-vs-one ROC AUC." - " 'sample_weight' must be None in this case.") + raise ValueError("sample_weight is not supported " + "for multiclass one-vs-one ROC AUC, " + "'sample_weight' must be None in this case.") _, y_true_encoded = _encode(y_true, uniques=classes, encode=True) # Hand & Till (2001) implementation (ovo) return _average_multiclass_ovo_score(binary_metric, y_true_encoded, diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 03394c3d771b0..6442b11834671 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -211,6 +211,9 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "weighted_roc_auc": partial(roc_auc_score, average="weighted"), "samples_roc_auc": partial(roc_auc_score, average="samples"), "micro_roc_auc": partial(roc_auc_score, average="micro"), + "ovr_roc_auc": partial(roc_auc_score, average="macro", multi_class='ovr'), + "weighted_ovr_roc_auc": partial(roc_auc_score, average="weighted", + multi_class='ovr'), "ovo_roc_auc": partial(roc_auc_score, average="macro", multi_class='ovo'), "weighted_ovo_roc_auc": partial(roc_auc_score, average="weighted", multi_class='ovo'), @@ -264,6 +267,8 @@ def precision_recall_curve_padded_thresholds(*args, **kwargs): "micro_roc_auc", "samples_roc_auc", "partial_roc_auc", + "roc_auc_score", + "weighted_roc_auc", "average_precision_score", "weighted_average_precision_score", diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index d856f263a102f..c202aef1added 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -592,19 +592,20 @@ def test_roc_auc_score_multiclass_labels_error( @pytest.mark.parametrize("msg, kwargs", [ - ((r"Parameter 'average' must be one of \('macro', 'weighted'\) for " + 
((r"average must be one of \('macro', 'weighted'\) for " r"multiclass problems"), {"average": "samples", "multi_class": "ovo"}), - ((r"Parameter 'average' must be one of \('macro', 'weighted'\) for " + ((r"average must be one of \('macro', 'weighted'\) for " r"multiclass problems"), {"average": "micro", "multi_class": "ovr"}), - ((r"Parameter 'sample_weight' is not supported for multiclass one-vs-one " - r"ROC AUC. 'sample_weight' must be None in this case"), + ((r"sample_weight is not supported for multiclass one-vs-one " + r"ROC AUC, 'sample_weight' must be None in this case"), {"multi_class": "ovo", "sample_weight": []}), ((r"Partial AUC computation not available in multiclass setting, " r"'max_fpr' must be set to `None`, received `max_fpr=0.5` " r"instead"), {"multi_class": "ovo", "max_fpr": 0.5}), - ((r"Parameter multi_class='ovp' is not supported for multiclass ROC AUC, " - r"multi_class must be one of \('ovo', 'ovr'\)"), - {"multi_class": "ovp"}) + ((r"multi_class='ovp' is not supported for multiclass ROC AUC, " + r"multi_class must be in \('ovo', 'ovr'\)"), + {"multi_class": "ovp"}), + (r"multi_class must be in \('ovo', 'ovr'\)", {}) ]) def test_roc_auc_score_multiclass_error(msg, kwargs): # Test that roc_auc_score function returns an error when trying From 65fea8e6e2ed7bb36d2ab3c6ac472639a030bba2 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Thu, 4 Jul 2019 11:18:13 -0400 Subject: [PATCH 91/93] ENH Adds weighted scorers --- sklearn/metrics/scorer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py index c300d9e291bed..80a0427647a3a 100644 --- a/sklearn/metrics/scorer.py +++ b/sklearn/metrics/scorer.py @@ -489,8 +489,14 @@ def make_scorer(score_func, greater_is_better=True, needs_proba=False, needs_threshold=True) roc_auc_ovo_scorer = make_scorer(roc_auc_score, needs_threshold=True, multi_class='ovo') +roc_auc_ovo_weighted_scorer = make_scorer(roc_auc_score, needs_threshold=True, + multi_class='ovo', + average='weighted') roc_auc_ovr_scorer = make_scorer(roc_auc_score, needs_threshold=True, multi_class='ovr') +roc_auc_ovr_weighted_scorer = make_scorer(roc_auc_score, needs_threshold=True, + multi_class='ovr', + average='weighted') # Score function for probabilistic classification neg_log_loss_scorer = make_scorer(log_loss, greater_is_better=False, From c5101e2b2fcd78c92d80e4a347449d5671c5f941 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 17 Jul 2019 15:12:33 -0400 Subject: [PATCH 92/93] CLN Address comments --- sklearn/metrics/ranking.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index de21f19110405..699c32a5c1097 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -365,7 +365,7 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): return _average_binary_score( _binary_roc_auc_score, y_true, y_score, average, sample_weight=sample_weight) - else: + else: # multilabel-indicator return _average_binary_score( _binary_roc_auc_score, y_true, y_score, average, sample_weight=sample_weight) @@ -395,7 +395,7 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, labels : array, shape = [n_classes] or None, optional (default=None) List of labels to index ``y_score`` used for multiclass. If ``None``, - the lexicon order of ``y_true`` is used to index ``y_score``. + the lexical order of ``y_true`` is used to index ``y_score``. 
multi_class : string, 'ovr' or 'ovo' Determines the type of multiclass configuration to use. From 1399ddd95c8bc0bd43684c79bd6ce7a67085e173 Mon Sep 17 00:00:00 2001 From: Thomas Fan Date: Wed, 17 Jul 2019 15:38:07 -0400 Subject: [PATCH 93/93] CLN Uses partial --- sklearn/metrics/ranking.py | 90 ++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 47 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 699c32a5c1097..8e1775e80e635 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -215,6 +215,34 @@ def _binary_uninterpolated_average_precision( average, sample_weight=sample_weight) +def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None): + """Binary roc auc score""" + if len(np.unique(y_true)) != 2: + raise ValueError("Only one class present in y_true. ROC AUC score " + "is not defined in that case.") + + fpr, tpr, _ = roc_curve(y_true, y_score, + sample_weight=sample_weight) + if max_fpr is None or max_fpr == 1: + return auc(fpr, tpr) + if max_fpr <= 0 or max_fpr > 1: + raise ValueError("Expected max_fpr in range (0, 1], got: %r" % max_fpr) + + # Add a single point at max_fpr by linear interpolation + stop = np.searchsorted(fpr, max_fpr, 'right') + x_interp = [fpr[stop - 1], fpr[stop]] + y_interp = [tpr[stop - 1], tpr[stop]] + tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp)) + fpr = np.append(fpr[:stop], max_fpr) + partial_auc = auc(fpr, tpr) + + # McClish correction: standardize result to be 0.5 if non-discriminant + # and 1 if maximal + min_area = 0.5 * max_fpr**2 + max_area = max_fpr + return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) + + def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, max_fpr=None, multi_class="raise", labels=None): """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) @@ -314,32 +342,6 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, 0.75 """ - def _binary_roc_auc_score(y_true, y_score, sample_weight=None): - if len(np.unique(y_true)) != 2: - raise ValueError("Only one class present in y_true. 
ROC AUC score " - "is not defined in that case.") - - fpr, tpr, _ = roc_curve(y_true, y_score, - sample_weight=sample_weight) - if max_fpr is None or max_fpr == 1: - return auc(fpr, tpr) - if max_fpr <= 0 or max_fpr > 1: - raise ValueError("Expected max_fpr in range (0, 1], got: %r" - % max_fpr) - - # Add a single point at max_fpr by linear interpolation - stop = np.searchsorted(fpr, max_fpr, 'right') - x_interp = [fpr[stop - 1], fpr[stop]] - y_interp = [tpr[stop - 1], tpr[stop]] - tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp)) - fpr = np.append(fpr[:stop], max_fpr) - partial_auc = auc(fpr, tpr) - - # McClish correction: standardize result to be 0.5 if non-discriminant - # and 1 if maximal - min_area = 0.5 * max_fpr**2 - max_area = max_fpr - return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) y_type = type_of_target(y_true) y_true = check_array(y_true, ensure_2d=False, dtype=None) @@ -356,36 +358,28 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): "instead".format(max_fpr)) if multi_class == 'raise': raise ValueError("multi_class must be in ('ovo', 'ovr')") - return _multiclass_roc_auc_score(_binary_roc_auc_score, - y_true, y_score, labels, + return _multiclass_roc_auc_score(y_true, y_score, labels, multi_class, average, sample_weight) elif y_type == "binary": labels = np.unique(y_true) y_true = label_binarize(y_true, labels)[:, 0] - return _average_binary_score( - _binary_roc_auc_score, y_true, y_score, average, - sample_weight=sample_weight) + return _average_binary_score(partial(_binary_roc_auc_score, + max_fpr=max_fpr), + y_true, y_score, average, + sample_weight=sample_weight) else: # multilabel-indicator - return _average_binary_score( - _binary_roc_auc_score, y_true, y_score, average, - sample_weight=sample_weight) + return _average_binary_score(partial(_binary_roc_auc_score, + max_fpr=max_fpr), + y_true, y_score, average, + sample_weight=sample_weight) -def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, +def _multiclass_roc_auc_score(y_true, y_score, labels, multi_class, average, sample_weight): """Multiclass roc auc score Parameters ---------- - binary_metric : callable - The binary metric function to use that accepts the following as input - y_true_target : array, shape = [n_samples_target] - Some sub-array of y_true for a pair of classes designated - positive and negative in the one-vs-one scheme. - y_score_target : array, shape = [n_samples_target] - Scores corresponding to the probability estimates - of a sample belonging to the designated positive class label - y_true : array-like, shape = (n_samples, ) True multiclass labels. @@ -469,13 +463,15 @@ def _multiclass_roc_auc_score(binary_metric, y_true, y_score, labels, "'sample_weight' must be None in this case.") _, y_true_encoded = _encode(y_true, uniques=classes, encode=True) # Hand & Till (2001) implementation (ovo) - return _average_multiclass_ovo_score(binary_metric, y_true_encoded, + return _average_multiclass_ovo_score(_binary_roc_auc_score, + y_true_encoded, y_score, average=average) else: # ovr is same as multi-label y_true_multilabel = label_binarize(y_true, classes) - return _average_binary_score(binary_metric, y_true_multilabel, y_score, - average, sample_weight=sample_weight) + return _average_binary_score(_binary_roc_auc_score, y_true_multilabel, + y_score, average, + sample_weight=sample_weight) def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None):
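With the refactor above in place, the public entry point is unchanged: roc_auc_score dispatches to _binary_roc_auc_score via partial for binary and multilabel targets, and to _multiclass_roc_auc_score for multiclass ones. A short usage sketch of the final API (not part of the patches; the dataset and estimator below are only illustrative and assume this branch is installed):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Any classifier exposing predict_proba works; the rows of y_prob sum to one,
# which the multiclass code path requires.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_prob = clf.predict_proba(X_test)

print(roc_auc_score(y_test, y_prob, multi_class="ovo", average="macro"))
print(roc_auc_score(y_test, y_prob, multi_class="ovo", average="weighted"))
print(roc_auc_score(y_test, y_prob, multi_class="ovr", average="macro"))
print(roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted"))

# With the default multi_class="raise", omitting the argument on a multiclass
# target raises a ValueError rather than silently picking a strategy.

This mirrors the snippet added to examples/model_selection/plot_roc.py earlier in the series, written here against the final keyword name.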