From 00a06268a5574c38914f7c6cec9883e0ba28df64 Mon Sep 17 00:00:00 2001 From: Kathy Date: Tue, 11 Oct 2016 22:21:52 -0400 Subject: [PATCH 01/31] better implementation of the multiclass logic (in terms of design). debugging to-do --- sklearn/metrics/base.py | 47 +++++++++++++++++++++++++++ sklearn/metrics/ranking.py | 27 +++++++++++---- sklearn/metrics/tests/test_ranking.py | 13 ++++++++ 3 files changed, 81 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 0ad96c1afd059..73ae7bde14365 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -14,6 +14,7 @@ from __future__ import division +import itertools import numpy as np from ..utils import check_array, check_consistent_length @@ -131,3 +132,49 @@ def _average_binary_score(binary_metric, y_true, y_score, average, return np.average(score, weights=average_weight) else: return score + +def _average_multiclass_score(binary_metric, y_true, y_score, + average, multiclass): + """TODO: DOCUMENTATION + """ + average_options = (None, "macro", "weighted") + if average not in average_options: + raise ValueError("average has to be one of {0}" + "".format(average_options)) + multiclass_options = ("ovo", "ovr") + if multiclass not in multiclass_options: + raise ValueError("{0} is not supported for multiclass ROC AUC" + "".format(multiclass)) + + check_consistent_length(y_true, y_score) + y_true = check_array(y_true) + y_score = check_array(y_score) + + not_average_axis = 1 + average_weight = None + if average == "weighted": + average_weight = np.sum(y_true, axis=0) + if average_weight.sum() == 0: + return 0 + + if y_true.ndim == 1: + y_true = y_true.reshape((-1, 1)) + + if y_score.ndim == 1: + y_score = y_score.reshape((-1, 1)) + + if multiclass == "ovo": + n_labels = len(np.unique(y_true)) + pairwise = [p for p in itertools.combinations(xrange(n_labels), 2)] + auc_scores_sum = 0 + for pair in pairwise: + ix = np.in1d(y_true.ravel(), [pair[0], pair[1]]).reshape(y_true.shape) + y_true_filtered = y_true[np.where(ix)] + y_score_filtered = y_score[np.where(ix)[1],:][:,[pair[0], pair[1]]] + y_true_filtered_01 = [1 if x == pair[0] else 0 for x in y_true_filtered] + y_true_filtered_10 = [1 if x == pair[1] else 0 for x in y_true_filtered] + auc_scores_sum += (binary_metric(y_true_filtered_01, y_score_filtered[:,0]) + + binary_metric(y_true_filtered_10, y_score_filtered[:,1]))/2.0 + return auc_scores_sum * (2.0 / (n_labels * (n_labels - 1.0))) + else: + raise ValueError("TODO") diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index d1f58772de595..4ce10eb51b10f 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -34,7 +34,7 @@ from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning -from .base import _average_binary_score +from .base import _average_binary_score, _average_multiclass_score def auc(x, y, reorder=False): @@ -184,7 +184,7 @@ def _binary_average_precision(y_true, y_score, sample_weight=None): average, sample_weight=sample_weight) -def roc_auc_score(y_true, y_score, average="macro", sample_weight=None): +def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", sample_weight=None): """Compute Area Under the Curve (AUC) from prediction scores Note: this implementation is restricted to the binary classification task @@ -246,6 +246,7 @@ def roc_auc_score(y_true, y_score, average="macro", sample_weight=None): 0.75 """ + def _binary_roc_auc_score(y_true, y_score, sample_weight=None): if 
len(np.unique(y_true)) != 2: raise ValueError("Only one class present in y_true. ROC AUC score " @@ -255,10 +256,24 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): sample_weight=sample_weight) return auc(fpr, tpr, reorder=True) - return _average_binary_score( - _binary_roc_auc_score, y_true, y_score, average, - sample_weight=sample_weight) - + if type_of_target(y_true) != "multiclass": + return _average_binary_score( + _binary_roc_auc_score, y_true, y_score, average, + sample_weight=sample_weight) + else: + ''' + average_options = (None, "macro", "weighted") + if average not in average_options: + raise ValueError("average has to be one of {0}" + "".format(average_options)) + multiclass_options = ("ovo", "ovr") + if multiclass not in multiclass_options: + raise ValueError("{0} is not supported for multiclass ROC AUC" + "".format(multiclass)) + ''' + return _average_multiclass_score( + _binary_roc_auc_score, y_true, y_score, + average, multiclass) def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): """Calculate true and false positives per binary classification threshold. diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 0ba1d858ab7de..49c69eda1dfea 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -327,6 +327,19 @@ def test_roc_curve_toydata(): assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), .5) assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), .5) +def test_multi_roc_auc_toydata(): + y_true = np.array([0, 1, 2]) + y_scores = np.array([[0.714, 0.072, 0.214], [0.837, 0.143, 0.020], [0.714, 0.072, 0.214]]) + assert_almost_equal(roc_auc_score(y_true, y_scores, multiclass="ovo"), 0.666666666663) + + y_true = np.array([0, 0, 1, 1]) + y_scores_binary = np.array([0.1, 0.4, 0.35, 0.8]) + y_scores_multi = [] + for y_score in y_scores_binary: + y_scores_multi.append([1 - y_score, y_score]) + y_scores_multi = np.array(y_scores_multi) + assert_almost_equal(roc_auc_score(y_true, y_scores_multi, multiclass="ovo"), + roc_auc_score(y_true, y_scores_binary)) def test_roc_curve_drop_intermediate(): # Test that drop_intermediate drops the correct thresholds From 8a84578c581493edc4f9dcd218ce5b388d810c80 Mon Sep 17 00:00:00 2001 From: Kathy Date: Thu, 13 Oct 2016 07:22:30 -0400 Subject: [PATCH 02/31] ovr and associated testing --- sklearn/metrics/base.py | 32 ++++++++++++++++++--------- sklearn/metrics/ranking.py | 1 + sklearn/metrics/tests/test_ranking.py | 25 ++++++++++++++------- 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 73ae7bde14365..5eaf5d79f1c48 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -152,6 +152,7 @@ def _average_multiclass_score(binary_metric, y_true, y_score, not_average_axis = 1 average_weight = None + # TODO: may not apply to multiclass in the same way. if average == "weighted": average_weight = np.sum(y_true, axis=0) if average_weight.sum() == 0: @@ -162,19 +163,30 @@ def _average_multiclass_score(binary_metric, y_true, y_score, if y_score.ndim == 1: y_score = y_score.reshape((-1, 1)) - + # TODO: assumes integer labels? 
+ label_unique, label_counts = np.unique(y_true, return_counts=True) + n_labels = len(label_unique) if multiclass == "ovo": - n_labels = len(np.unique(y_true)) + # Hand and Till 2001 pairwise = [p for p in itertools.combinations(xrange(n_labels), 2)] auc_scores_sum = 0 for pair in pairwise: - ix = np.in1d(y_true.ravel(), [pair[0], pair[1]]).reshape(y_true.shape) - y_true_filtered = y_true[np.where(ix)] - y_score_filtered = y_score[np.where(ix)[1],:][:,[pair[0], pair[1]]] - y_true_filtered_01 = [1 if x == pair[0] else 0 for x in y_true_filtered] - y_true_filtered_10 = [1 if x == pair[1] else 0 for x in y_true_filtered] - auc_scores_sum += (binary_metric(y_true_filtered_01, y_score_filtered[:,0]) + - binary_metric(y_true_filtered_10, y_score_filtered[:,1]))/2.0 + ix = np.in1d(y_true.ravel(), [pair[0], pair[1]]) + y_true_filtered = y_true[0, np.where(ix)] + y_score_filtered = y_score[np.where(ix)] + y_true_filtered_10 = np.in1d(y_true_filtered.ravel(), pair[0]).astype(int) + y_true_filtered_01 = np.in1d(y_true_filtered.ravel(), pair[1]).astype(int) + auc_scores_sum += (binary_metric(y_true_filtered_10, y_score_filtered[:,pair[0]]) + + binary_metric(y_true_filtered_01, y_score_filtered[:,pair[1]]))/2.0 return auc_scores_sum * (2.0 / (n_labels * (n_labels - 1.0))) else: - raise ValueError("TODO") + # Provost and Domingos 2001 + label_counts_map = dict(zip(label_unique, label_counts)) + auc_scores_sum = 0 + for label in label_unique: + y_true_label = np.in1d(y_true.ravel(), label).astype(int) + #y_true_label = y_true[0, np.where(ix)] + y_score_label = y_score[:,label] + auc_scores_sum += binary_metric(y_true_label, y_score_label) * (label_counts_map[label]/float(sum(label_counts_map.values()))) + return auc_scores_sum + diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 4ce10eb51b10f..632eef683d721 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -288,6 +288,7 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): pos_label : int or str, default=None The label of the positive class +A sample_weight : array-like of shape = [n_samples], optional Sample weights. 
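Aside (not part of the patch): the pairwise filtering that the one-vs-one branch above relies on can be sketched in isolation. The snippet below uses made-up toy data and the existing binary ``roc_auc_score``; for a single class pair it keeps only the samples of those two classes with ``np.in1d``, scores each direction, and averages the two directional AUCs as in Hand & Till (2001):

    import numpy as np
    from sklearn.metrics import roc_auc_score  # existing binary metric

    # Toy multiclass problem: integer labels 0..2, one column of probability
    # estimates per class.
    y_true = np.array([0, 1, 0, 2])
    y_score = np.array([[0.10, 0.80, 0.10],
                        [0.30, 0.40, 0.30],
                        [0.35, 0.50, 0.15],
                        [0.00, 0.20, 0.80]])

    pos, neg = 0, 1                             # one class pair
    mask = np.in1d(y_true, [pos, neg])          # keep only samples of these two classes
    y_pair, score_pair = y_true[mask], y_score[mask]

    # Binary AUC with `pos` as the positive class, then with `neg` as the
    # positive class, averaged to give this pair's contribution.
    auc_pos = roc_auc_score((y_pair == pos).astype(int), score_pair[:, pos])
    auc_neg = roc_auc_score((y_pair == neg).astype(int), score_pair[:, neg])
    pair_auc = (auc_pos + auc_neg) / 2.0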
diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 49c69eda1dfea..1b326ec1f4395 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -332,14 +332,23 @@ def test_multi_roc_auc_toydata(): y_scores = np.array([[0.714, 0.072, 0.214], [0.837, 0.143, 0.020], [0.714, 0.072, 0.214]]) assert_almost_equal(roc_auc_score(y_true, y_scores, multiclass="ovo"), 0.666666666663) - y_true = np.array([0, 0, 1, 1]) - y_scores_binary = np.array([0.1, 0.4, 0.35, 0.8]) - y_scores_multi = [] - for y_score in y_scores_binary: - y_scores_multi.append([1 - y_score, y_score]) - y_scores_multi = np.array(y_scores_multi) - assert_almost_equal(roc_auc_score(y_true, y_scores_multi, multiclass="ovo"), - roc_auc_score(y_true, y_scores_binary)) + y_true = np.array([0, 1, 0, 2]) + y_scores = np.array([[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) + assert_almost_equal(roc_auc_score(y_true, y_scores, multiclass="ovo"), 0.75) + #y_scores_multi = [] + #for y_score in y_scores_binary: + # y_scores_multi.append([1 - y_score, y_score]) + #y_scores_multi = np.array(y_scores_multi) + #assert_almost_equal(roc_auc_score(y_true, y_scores_multi, multiclass="ovo"), + # roc_auc_score(y_true, y_scores_binary)) + + y_true = np.array([0, 1, 2, 2]) + y_scores = np.array([[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]]) + out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:,0]) + out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:,1]) + out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:,2]) + result = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 + assert_almost_equal(roc_auc_score(y_true, y_scores, multiclass="ovr"), result) def test_roc_curve_drop_intermediate(): # Test that drop_intermediate drops the correct thresholds From 485fd59343c985b1ba356d1d634132f76f9ac479 Mon Sep 17 00:00:00 2001 From: Kathy Date: Thu, 13 Oct 2016 13:59:26 -0400 Subject: [PATCH 03/31] some testing implemented for the value errors, but not yet comprehensive --- sklearn/metrics/base.py | 63 +++++++++++++++++------ sklearn/metrics/tests/test_ranking.py | 74 ++++++++++++++++----------- 2 files changed, 91 insertions(+), 46 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 5eaf5d79f1c48..0a0a33227c6de 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -135,7 +135,39 @@ def _average_binary_score(binary_metric, y_true, y_score, average, def _average_multiclass_score(binary_metric, y_true, y_score, average, multiclass): - """TODO: DOCUMENTATION + + """Uses the binary metric for multiclass classification + + Parameters + ---------- + y_true : array, shape = [n_samples] or [n_samples, n_classes] + True multiclass labels + + y_score : array, shape = [n_samples] or [n_samples, n_classes] + Target scores corresponding to probability estimates of a sample + belonging to a particular class + + average : string, [None, 'macro' (default), 'weighted'] + TODO: difference between 'macro' and None? Should there be both? + If ``None``, the scores for each class are returned. Otherwise, + this determines the type of averaging performed on the data: + + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, taking into account the a priori + distribution of the classes. + + binary_metric : callable, returns shape [n_classes] + The binary metric function to use. 
+ + Returns + ------- + score : float or array of shape [n_classes] + If not ``None``, average the score, else return the score for each + classes. + """ average_options = (None, "macro", "weighted") if average not in average_options: @@ -151,23 +183,18 @@ def _average_multiclass_score(binary_metric, y_true, y_score, y_score = check_array(y_score) not_average_axis = 1 - average_weight = None - # TODO: may not apply to multiclass in the same way. - if average == "weighted": - average_weight = np.sum(y_true, axis=0) - if average_weight.sum() == 0: - return 0 if y_true.ndim == 1: y_true = y_true.reshape((-1, 1)) if y_score.ndim == 1: y_score = y_score.reshape((-1, 1)) - # TODO: assumes integer labels? + label_unique, label_counts = np.unique(y_true, return_counts=True) + label_counts_map = dict(zip(label_unique, label_counts)) n_labels = len(label_unique) if multiclass == "ovo": - # Hand and Till 2001 + # Hand and Till 2001 (unweighted) pairwise = [p for p in itertools.combinations(xrange(n_labels), 2)] auc_scores_sum = 0 for pair in pairwise: @@ -176,17 +203,23 @@ def _average_multiclass_score(binary_metric, y_true, y_score, y_score_filtered = y_score[np.where(ix)] y_true_filtered_10 = np.in1d(y_true_filtered.ravel(), pair[0]).astype(int) y_true_filtered_01 = np.in1d(y_true_filtered.ravel(), pair[1]).astype(int) - auc_scores_sum += (binary_metric(y_true_filtered_10, y_score_filtered[:,pair[0]]) + - binary_metric(y_true_filtered_01, y_score_filtered[:,pair[1]]))/2.0 + binary_avg_output = \ + (binary_metric(y_true_filtered_10, y_score_filtered[:,pair[0]]) + + binary_metric(y_true_filtered_01, y_score_filtered[:,pair[1]]))/2.0 + auc_scores_sum += binary_avg_output + if average == "weighted": + raise ValueError("one-vs-one multiclass AUC is only implemented " + "for the unweighted Hand and Till (2001) algorithm") return auc_scores_sum * (2.0 / (n_labels * (n_labels - 1.0))) else: - # Provost and Domingos 2001 - label_counts_map = dict(zip(label_unique, label_counts)) + # Provost and Domingos 2001 (weighted) auc_scores_sum = 0 for label in label_unique: y_true_label = np.in1d(y_true.ravel(), label).astype(int) - #y_true_label = y_true[0, np.where(ix)] y_score_label = y_score[:,label] - auc_scores_sum += binary_metric(y_true_label, y_score_label) * (label_counts_map[label]/float(sum(label_counts_map.values()))) + binary_output = binary_metric(y_true_label, y_score_label) + if average == "weighted": + binary_output *= (label_counts_map[label]/float(sum(label_counts_map.values()))) + auc_scores_sum += binary_output return auc_scores_sum diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 1b326ec1f4395..9b4ec620b31e6 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -327,28 +327,6 @@ def test_roc_curve_toydata(): assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), .5) assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), .5) -def test_multi_roc_auc_toydata(): - y_true = np.array([0, 1, 2]) - y_scores = np.array([[0.714, 0.072, 0.214], [0.837, 0.143, 0.020], [0.714, 0.072, 0.214]]) - assert_almost_equal(roc_auc_score(y_true, y_scores, multiclass="ovo"), 0.666666666663) - - y_true = np.array([0, 1, 0, 2]) - y_scores = np.array([[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) - assert_almost_equal(roc_auc_score(y_true, y_scores, multiclass="ovo"), 0.75) - #y_scores_multi = [] - #for y_score in y_scores_binary: - # y_scores_multi.append([1 - y_score, 
y_score]) - #y_scores_multi = np.array(y_scores_multi) - #assert_almost_equal(roc_auc_score(y_true, y_scores_multi, multiclass="ovo"), - # roc_auc_score(y_true, y_scores_binary)) - - y_true = np.array([0, 1, 2, 2]) - y_scores = np.array([[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]]) - out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:,0]) - out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:,1]) - out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:,2]) - result = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 - assert_almost_equal(roc_auc_score(y_true, y_scores, multiclass="ovr"), result) def test_roc_curve_drop_intermediate(): # Test that drop_intermediate drops the correct thresholds @@ -413,6 +391,49 @@ def test_auc_errors(): assert_raises(ValueError, auc, [1.0, 0.0, 0.5], [0.0, 0.0, 0.0]) +def test_multi_auc_toydata(): + y_true = np.array([0, 1, 2]) + y_scores = np.array( + [[0.714, 0.072, 0.214], [0.837, 0.143, 0.020], [0.714, 0.072, 0.214]]) + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovo"), 0.666666666663) + + y_true = np.array([0, 1, 0, 2]) + y_scores = np.array( + [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovo"), 0.75) + + y_true = np.array([0, 1, 2, 2]) + y_scores = np.array( + [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]]) + out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:,0]) + out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:,1]) + out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:,2]) + result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovr", average="weighted"), + result_weighted) + + result_unweighted = out_0 + out_1 + out_2 + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovr"), + result_unweighted) + +def test_auc_score_multi_error(): + # Test that roc_auc_score function returns an error when trying + # to compute multiclass AUC for parameters where an output + # is not defined. + rng = check_random_state(404) + y_pred = rng.rand(10) + y_true = rng.randint(0, 3, size=10) + assert_raise_message(ValueError, + "average has to be one of (None, 'macro', 'weighted')", + roc_auc_score, y_true, y_pred, average="sample") + assert_raise_message(ValueError, + "average has to be one of (None, 'macro', 'weighted')", + roc_auc_score, y_true, y_pred, average="micro") + def test_auc_score_non_binary_class(): # Test that roc_auc_score function returns an error when trying # to compute AUC for non-binary class values. 
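Side note on the weighted one-vs-rest check above (an illustration, not part of the patch): the factors 0.25, 0.25 and 0.5 are simply the class prevalences of ``y_true = [0, 1, 2, 2]``, which can be recovered with:

    import numpy as np

    y_true = np.array([0, 1, 2, 2])
    _, counts = np.unique(y_true, return_counts=True)
    prevalence = counts / float(counts.sum())   # array([0.25, 0.25, 0.5])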
@@ -428,10 +449,6 @@ def test_auc_score_non_binary_class(): y_true = -np.ones(10, dtype="int") assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) - # y_true contains three different class values - y_true = rng.randint(0, 3, size=10) - assert_raise_message(ValueError, "multiclass format is not supported", - roc_auc_score, y_true, y_pred) clean_warning_registry() with warnings.catch_warnings(record=True): @@ -448,11 +465,6 @@ def test_auc_score_non_binary_class(): assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) - # y_true contains three different class values - y_true = rng.randint(0, 3, size=10) - assert_raise_message(ValueError, "multiclass format is not supported", - roc_auc_score, y_true, y_pred) - def test_precision_recall_curve(): y_true, _, probas_pred = make_prediction(binary=True) From 2ac42c2efdb841ab0def36061267413139074658 Mon Sep 17 00:00:00 2001 From: Kathy Date: Tue, 18 Oct 2016 21:20:30 -0400 Subject: [PATCH 04/31] implemented ovr with the multilabelbinarizer --- sklearn/metrics/base.py | 61 ++++++++++++--------------- sklearn/metrics/ranking.py | 11 ++++- sklearn/metrics/tests/test_ranking.py | 2 +- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 0a0a33227c6de..5b03659054f47 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -14,7 +14,6 @@ from __future__ import division -import itertools import numpy as np from ..utils import check_array, check_consistent_length @@ -133,6 +132,7 @@ def _average_binary_score(binary_metric, y_true, y_score, average, else: return score + def _average_multiclass_score(binary_metric, y_true, y_score, average, multiclass): @@ -147,29 +147,27 @@ def _average_multiclass_score(binary_metric, y_true, y_score, Target scores corresponding to probability estimates of a sample belonging to a particular class - average : string, [None, 'macro' (default), 'weighted'] - TODO: difference between 'macro' and None? Should there be both? - If ``None``, the scores for each class are returned. Otherwise, - this determines the type of averaging performed on the data: - + average : string, ['macro' (default), 'weighted'] ``'macro'``: Calculate metrics for each label, and find their unweighted - mean. This does not take label imbalance into account. + mean. This does not take label imbalance into account. (Classes + are assumed to be uniformly distributed.) ``'weighted'``: Calculate metrics for each label, taking into account the a priori distribution of the classes. binary_metric : callable, returns shape [n_classes] The binary metric function to use. + TODO: what is the input requirement? Returns ------- - score : float or array of shape [n_classes] - If not ``None``, average the score, else return the score for each - classes. + score : float + Average the score. + TODO: improve documentation on this line. 
""" - average_options = (None, "macro", "weighted") + average_options = ("macro", "weighted") if average not in average_options: raise ValueError("average has to be one of {0}" "".format(average_options)) @@ -182,35 +180,32 @@ def _average_multiclass_score(binary_metric, y_true, y_score, y_true = check_array(y_true) y_score = check_array(y_score) - not_average_axis = 1 - if y_true.ndim == 1: y_true = y_true.reshape((-1, 1)) - if y_score.ndim == 1: - y_score = y_score.reshape((-1, 1)) - label_unique, label_counts = np.unique(y_true, return_counts=True) - label_counts_map = dict(zip(label_unique, label_counts)) n_labels = len(label_unique) - if multiclass == "ovo": - # Hand and Till 2001 (unweighted) - pairwise = [p for p in itertools.combinations(xrange(n_labels), 2)] - auc_scores_sum = 0 - for pair in pairwise: - ix = np.in1d(y_true.ravel(), [pair[0], pair[1]]) + # Hand and Till 2001 (unweighted) + auc_scores_sum = 0 + for pos in range(n_labels): + for neg in range(n_labels): + if pos == neg: + continue + ix = np.in1d(y_true.ravel(), [pos, neg]) y_true_filtered = y_true[0, np.where(ix)] y_score_filtered = y_score[np.where(ix)] - y_true_filtered_10 = np.in1d(y_true_filtered.ravel(), pair[0]).astype(int) - y_true_filtered_01 = np.in1d(y_true_filtered.ravel(), pair[1]).astype(int) - binary_avg_output = \ - (binary_metric(y_true_filtered_10, y_score_filtered[:,pair[0]]) + - binary_metric(y_true_filtered_01, y_score_filtered[:,pair[1]]))/2.0 - auc_scores_sum += binary_avg_output + y_true_10 = y_true_filtered == pos + y_true_01 = y_true_filtered == neg + score_10 = binary_metric(y_true_10[0], y_score_filtered[:, pos]) + score_01 = binary_metric(y_true_01[0], y_score_filtered[:, neg]) + binary_avg_auc = (score_10 + score_01)/2.0 if average == "weighted": - raise ValueError("one-vs-one multiclass AUC is only implemented " - "for the unweighted Hand and Till (2001) algorithm") - return auc_scores_sum * (2.0 / (n_labels * (n_labels - 1.0))) + probability_pos = len(y_true[0] == pos)/float(len(y_true)) + auc_scores_sum += binary_avg_auc * probability_pos + else: + auc_scores_sum += binary_avg_auc + return auc_scores_sum * (1.0 / (n_labels * (n_labels - 1.0))) + ''' else: # Provost and Domingos 2001 (weighted) auc_scores_sum = 0 @@ -222,4 +217,4 @@ def _average_multiclass_score(binary_metric, y_true, y_score, binary_output *= (label_counts_map[label]/float(sum(label_counts_map.values()))) auc_scores_sum += binary_output return auc_scores_sum - + ''' diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 632eef683d721..00a2bb394fe58 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -23,6 +23,7 @@ import numpy as np from scipy.sparse import csr_matrix +from ..preprocessing import MultiLabelBinarizer from ..utils import assert_all_finite from ..utils import check_consistent_length from ..utils import column_or_1d, check_array @@ -260,7 +261,7 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): return _average_binary_score( _binary_roc_auc_score, y_true, y_score, average, sample_weight=sample_weight) - else: + elif multiclass == "ovo": ''' average_options = (None, "macro", "weighted") if average not in average_options: @@ -274,6 +275,13 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): return _average_multiclass_score( _binary_roc_auc_score, y_true, y_score, average, multiclass) + else: + print y_true + y_true = y_true.reshape((-1, 1)) + y_true_multilabels = MultiLabelBinarizer().fit_transform(y_true) + return 
_average_binary_score(_binary_roc_auc_score, + y_true_multilabels, y_score, average, sample_weight=sample_weight) + def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): """Calculate true and false positives per binary classification threshold. @@ -288,7 +296,6 @@ def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): pos_label : int or str, default=None The label of the positive class -A sample_weight : array-like of shape = [n_samples], optional Sample weights. diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 9b4ec620b31e6..ee988a7992e8b 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -415,7 +415,7 @@ def test_multi_auc_toydata(): roc_auc_score(y_true, y_scores, multiclass="ovr", average="weighted"), result_weighted) - result_unweighted = out_0 + out_1 + out_2 + result_unweighted = (out_0 + out_1 + out_2)/3.0 assert_almost_equal( roc_auc_score(y_true, y_scores, multiclass="ovr"), result_unweighted) From 4e6141fe6b89f1a30cdec19c0e9fd3e34535f273 Mon Sep 17 00:00:00 2001 From: Kathy Date: Tue, 18 Oct 2016 21:25:40 -0400 Subject: [PATCH 05/31] removed the ovr implementation that was in the base.py function --- sklearn/metrics/base.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 5b03659054f47..978178db6b52e 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -205,16 +205,3 @@ def _average_multiclass_score(binary_metric, y_true, y_score, else: auc_scores_sum += binary_avg_auc return auc_scores_sum * (1.0 / (n_labels * (n_labels - 1.0))) - ''' - else: - # Provost and Domingos 2001 (weighted) - auc_scores_sum = 0 - for label in label_unique: - y_true_label = np.in1d(y_true.ravel(), label).astype(int) - y_score_label = y_score[:,label] - binary_output = binary_metric(y_true_label, y_score_label) - if average == "weighted": - binary_output *= (label_counts_map[label]/float(sum(label_counts_map.values()))) - auc_scores_sum += binary_output - return auc_scores_sum - ''' From 7bd899edff7ad91891546aa1506b598eece90d08 Mon Sep 17 00:00:00 2001 From: Kathy Date: Tue, 18 Oct 2016 21:43:35 -0400 Subject: [PATCH 06/31] lots more code cleanup --- sklearn/metrics/base.py | 27 ++++----------------------- sklearn/metrics/ranking.py | 37 ++++++++++++++++++++++--------------- 2 files changed, 26 insertions(+), 38 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 978178db6b52e..588380345515e 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -133,10 +133,9 @@ def _average_binary_score(binary_metric, y_true, y_score, average, return score -def _average_multiclass_score(binary_metric, y_true, y_score, - average, multiclass): - - """Uses the binary metric for multiclass classification +def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): + """Uses the binary metric for one-vs-one multiclass classification, + where the score is computed according to the Hand & Till (2001) algorithm. Parameters ---------- @@ -165,27 +164,9 @@ def _average_multiclass_score(binary_metric, y_true, y_score, score : float Average the score. TODO: improve documentation on this line. 
- """ - average_options = ("macro", "weighted") - if average not in average_options: - raise ValueError("average has to be one of {0}" - "".format(average_options)) - multiclass_options = ("ovo", "ovr") - if multiclass not in multiclass_options: - raise ValueError("{0} is not supported for multiclass ROC AUC" - "".format(multiclass)) - - check_consistent_length(y_true, y_score) - y_true = check_array(y_true) - y_score = check_array(y_score) - - if y_true.ndim == 1: - y_true = y_true.reshape((-1, 1)) - label_unique, label_counts = np.unique(y_true, return_counts=True) n_labels = len(label_unique) - # Hand and Till 2001 (unweighted) auc_scores_sum = 0 for pos in range(n_labels): for neg in range(n_labels): @@ -204,4 +185,4 @@ def _average_multiclass_score(binary_metric, y_true, y_score, auc_scores_sum += binary_avg_auc * probability_pos else: auc_scores_sum += binary_avg_auc - return auc_scores_sum * (1.0 / (n_labels * (n_labels - 1.0))) + return auc_scores_sum / (n_labels * (n_labels - 1.0)) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 00a2bb394fe58..0e5784d34606c 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -35,7 +35,7 @@ from ..utils.sparsefuncs import count_nonzero from ..exceptions import UndefinedMetricWarning -from .base import _average_binary_score, _average_multiclass_score +from .base import _average_binary_score, _average_multiclass_ovo_score def auc(x, y, reorder=False): @@ -185,7 +185,8 @@ def _binary_average_precision(y_true, y_score, sample_weight=None): average, sample_weight=sample_weight) -def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", sample_weight=None): +def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", + sample_weight=None): """Compute Area Under the Curve (AUC) from prediction scores Note: this implementation is restricted to the binary classification task @@ -261,9 +262,9 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): return _average_binary_score( _binary_roc_auc_score, y_true, y_score, average, sample_weight=sample_weight) - elif multiclass == "ovo": - ''' - average_options = (None, "macro", "weighted") + else: + # validation for multiclass parameter specifications + average_options = ("macro", "weighted") if average not in average_options: raise ValueError("average has to be one of {0}" "".format(average_options)) @@ -271,16 +272,22 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): if multiclass not in multiclass_options: raise ValueError("{0} is not supported for multiclass ROC AUC" "".format(multiclass)) - ''' - return _average_multiclass_score( - _binary_roc_auc_score, y_true, y_score, - average, multiclass) - else: - print y_true - y_true = y_true.reshape((-1, 1)) - y_true_multilabels = MultiLabelBinarizer().fit_transform(y_true) - return _average_binary_score(_binary_roc_auc_score, - y_true_multilabels, y_score, average, sample_weight=sample_weight) + + check_consistent_length(y_true, y_score) + y_true = check_array(y_true) + y_score = check_array(y_score) + + if y_true.ndim == 1: + y_true = y_true.reshape((-1, 1)) + + if multiclass == "ovo": + return _average_multiclass_ovo_score( + _binary_roc_auc_score, y_true, y_score, average) + else: + y_true_multilabel = MultiLabelBinarizer().fit_transform(y_true) + return _average_binary_score(_binary_roc_auc_score, + y_true_multilabel, y_score, average, + sample_weight=sample_weight) def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): From 
f4fb56f1e97afa437add3a13391e68987aaea08b Mon Sep 17 00:00:00 2001 From: Kathy Date: Tue, 18 Oct 2016 23:01:45 -0400 Subject: [PATCH 07/31] pending, need more test cases --- sklearn/metrics/base.py | 35 ++++++++++++++++----------- sklearn/metrics/ranking.py | 19 +++++++++------ sklearn/metrics/tests/test_ranking.py | 19 +++++++++++---- 3 files changed, 47 insertions(+), 26 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 588380345515e..fd3564b5076bf 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -139,31 +139,35 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): Parameters ---------- - y_true : array, shape = [n_samples] or [n_samples, n_classes] - True multiclass labels + y_true : array, shape = [n_samples] + True multiclass labels. + Currently only handles labels with values 0 to n_classes - 1. - y_score : array, shape = [n_samples] or [n_samples, n_classes] + y_score : array, shape = [n_samples, n_classes] Target scores corresponding to probability estimates of a sample belonging to a particular class average : string, ['macro' (default), 'weighted'] ``'macro'``: Calculate metrics for each label, and find their unweighted - mean. This does not take label imbalance into account. (Classes - are assumed to be uniformly distributed.) + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. ``'weighted'``: Calculate metrics for each label, taking into account the a priori distribution of the classes. - binary_metric : callable, returns shape [n_classes] - The binary metric function to use. - TODO: what is the input requirement? + binary_metric : callable, the binary metric function to use. + Accepts the following as input + y_true' : array, shape = [n_samples'] + Some sub-array of y_true + y_score' : array, shape = [n_samples'] + Target scores corresponding to the probability estimates + of a sample belonging to the designated positive class label Returns ------- score : float - Average the score. - TODO: improve documentation on this line. 
+ Average the sum of the pairwise binary metric scores """ label_unique, label_counts = np.unique(y_true, return_counts=True) n_labels = len(label_unique) @@ -173,15 +177,18 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): if pos == neg: continue ix = np.in1d(y_true.ravel(), [pos, neg]) - y_true_filtered = y_true[0, np.where(ix)] + y_true_filtered = y_true[np.where(ix.reshape(y_true.shape))] y_score_filtered = y_score[np.where(ix)] + y_true_10 = y_true_filtered == pos y_true_01 = y_true_filtered == neg - score_10 = binary_metric(y_true_10[0], y_score_filtered[:, pos]) - score_01 = binary_metric(y_true_01[0], y_score_filtered[:, neg]) + score_10 = binary_metric( + y_true_10, y_score_filtered[:, pos]) + score_01 = binary_metric( + y_true_01, y_score_filtered[:, neg]) binary_avg_auc = (score_10 + score_01)/2.0 if average == "weighted": - probability_pos = len(y_true[0] == pos)/float(len(y_true)) + probability_pos = np.sum(y_true == pos)/float(y_true.size) auc_scores_sum += binary_avg_auc * probability_pos else: auc_scores_sum += binary_avg_auc diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 0e5784d34606c..1cca54f6ba331 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -189,9 +189,6 @@ def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", sample_weight=None): """Compute Area Under the Curve (AUC) from prediction scores - Note: this implementation is restricted to the binary classification task - or multilabel classification task in label indicator format. - Read more in the :ref:`User Guide `. Parameters @@ -204,6 +201,17 @@ def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", class, confidence values, or non-thresholded measure of decisions (as returned by "decision_function" on some classifiers). + multiclass : string, ['ovr' (default), 'ovo'] + Note: multiclass ROC AUC currently only handles the 'macro' and + 'weighted' averages. + + ``'ovr'``: + Calculate metrics for the multiclass case using the one-vs-rest + approach. + ``'ovo'``: + Calculate metrics for the multiclass case using the one-vs-one + approach. + average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] If ``None``, the scores for each class are returned. 
Otherwise, this determines the type of averaging performed on the data: @@ -274,8 +282,6 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): "".format(multiclass)) check_consistent_length(y_true, y_score) - y_true = check_array(y_true) - y_score = check_array(y_score) if y_true.ndim == 1: y_true = y_true.reshape((-1, 1)) @@ -286,8 +292,7 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): else: y_true_multilabel = MultiLabelBinarizer().fit_transform(y_true) return _average_binary_score(_binary_roc_auc_score, - y_true_multilabel, y_score, average, - sample_weight=sample_weight) + y_true_multilabel, y_score, average) def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index ee988a7992e8b..df82d388e5191 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -404,12 +404,19 @@ def test_multi_auc_toydata(): assert_almost_equal( roc_auc_score(y_true, y_scores, multiclass="ovo"), 0.75) + y_true = np.array([0, 1, 0, 2]) + y_scores = np.array( + [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovo", average="weighted"), + 0.23958333333) + y_true = np.array([0, 1, 2, 2]) y_scores = np.array( [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]]) - out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:,0]) - out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:,1]) - out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:,2]) + out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0]) + out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1]) + out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2]) result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 assert_almost_equal( roc_auc_score(y_true, y_scores, multiclass="ovr", average="weighted"), @@ -420,6 +427,7 @@ def test_multi_auc_toydata(): roc_auc_score(y_true, y_scores, multiclass="ovr"), result_unweighted) + def test_auc_score_multi_error(): # Test that roc_auc_score function returns an error when trying # to compute multiclass AUC for parameters where an output @@ -428,12 +436,13 @@ def test_auc_score_multi_error(): y_pred = rng.rand(10) y_true = rng.randint(0, 3, size=10) assert_raise_message(ValueError, - "average has to be one of (None, 'macro', 'weighted')", + "average has to be one of ('macro', 'weighted')", roc_auc_score, y_true, y_pred, average="sample") assert_raise_message(ValueError, - "average has to be one of (None, 'macro', 'weighted')", + "average has to be one of ('macro', 'weighted')", roc_auc_score, y_true, y_pred, average="micro") + def test_auc_score_non_binary_class(): # Test that roc_auc_score function returns an error when trying # to compute AUC for non-binary class values. 
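For orientation between patches: with the changes up to this point applied, the new ``multiclass`` parameter is used roughly as in the sketch below. The data is illustrative only; the exact expected scores for inputs like these are worked out in the tests above, and the patched ``roc_auc_score`` from this branch is assumed to be the one imported.

    import numpy as np
    from sklearn.metrics import roc_auc_score  # patched version from this branch

    y_true = np.array([0, 1, 0, 2])             # integer labels 0 .. n_classes-1
    y_scores = np.array([[0.10, 0.80, 0.10],    # one column of probability
                         [0.30, 0.40, 0.30],    # estimates per class
                         [0.35, 0.50, 0.15],
                         [0.00, 0.20, 0.80]])

    # One-vs-one (Hand & Till 2001), unweighted and prevalence-weighted.
    roc_auc_score(y_true, y_scores, multiclass="ovo")
    roc_auc_score(y_true, y_scores, multiclass="ovo", average="weighted")

    # One-vs-rest (Provost & Domingos 2001), unweighted and prevalence-weighted.
    roc_auc_score(y_true, y_scores, multiclass="ovr")
    roc_auc_score(y_true, y_scores, multiclass="ovr", average="weighted")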
From dd5c06a91cfc654b293fd6e8dcb7f16b883a6f8e Mon Sep 17 00:00:00 2001 From: Kathy Chen Date: Tue, 25 Oct 2016 22:18:34 -0400 Subject: [PATCH 08/31] making changes in response to PR: remove unused variable and added input parameter specifications --- sklearn/metrics/base.py | 3 +-- sklearn/metrics/ranking.py | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index fd3564b5076bf..27b3946b91373 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -169,8 +169,7 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): score : float Average the sum of the pairwise binary metric scores """ - label_unique, label_counts = np.unique(y_true, return_counts=True) - n_labels = len(label_unique) + n_labels = len(np.unique(y_true)) auc_scores_sum = 0 for pos in range(n_labels): for neg in range(n_labels): diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 1cca54f6ba331..222dc8965cd77 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -195,11 +195,15 @@ def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", ---------- y_true : array, shape = [n_samples] or [n_samples, n_classes] True binary labels in binary label indicators. + The multiclass case expects shape = [n_samples] and labels + with values from 0 to (n_classes-1), inclusive. y_score : array, shape = [n_samples] or [n_samples, n_classes] Target scores, can either be probability estimates of the positive class, confidence values, or non-thresholded measure of decisions (as returned by "decision_function" on some classifiers). + The multiclass case expects shape = [n_samples, n_classes] + where the scores correspond to probability estimates. 
multiclass : string, ['ovr' (default), 'ovo'] Note: multiclass ROC AUC currently only handles the 'macro' and @@ -282,6 +286,8 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): "".format(multiclass)) check_consistent_length(y_true, y_score) + check_array(y_true, ensure_2d=False) + check_array(y_score) if y_true.ndim == 1: y_true = y_true.reshape((-1, 1)) From 91b1428e0b370f768122edb4143ef17c77cfd94a Mon Sep 17 00:00:00 2001 From: Kathy Chen Date: Tue, 25 Oct 2016 22:49:42 -0400 Subject: [PATCH 09/31] making a change to one of the rst files for documenting the multiclass roc auc score --- sklearn/metrics/ranking.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 222dc8965cd77..26e4c851ce12c 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -289,13 +289,11 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): check_array(y_true, ensure_2d=False) check_array(y_score) - if y_true.ndim == 1: - y_true = y_true.reshape((-1, 1)) - if multiclass == "ovo": return _average_multiclass_ovo_score( _binary_roc_auc_score, y_true, y_score, average) else: + y_true = y_true.reshape((-1, 1)) y_true_multilabel = MultiLabelBinarizer().fit_transform(y_true) return _average_binary_score(_binary_roc_auc_score, y_true_multilabel, y_score, average) From 3d4d065a028895dc4fae0244ffeadac37a1efc93 Mon Sep 17 00:00:00 2001 From: Kathy Chen Date: Tue, 25 Oct 2016 22:51:07 -0400 Subject: [PATCH 10/31] making a change to one of the rst files for documenting the multiclass roc auc score --- doc/modules/model_evaluation.rst | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index be0259879a2dc..a03530cf80733 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -254,13 +254,21 @@ Some also work in the multilabel case: recall_score zero_one_loss -And some work with binary and multilabel (but not multiclass) problems: + +Some work with binary and multilabel (but not multiclass) problems: .. autosummary:: :template: function.rst average_precision_score - roc_auc_score + + +And some work with binary, multilabel, and multiclass problems: + +.. autosummary:: + :template: function.rst + + roc_auc_score In the following sub-sections, we will describe each of those functions, @@ -976,9 +984,12 @@ In multi-label classification, the :func:`roc_auc_score` function is extended by averaging over the labels as :ref:`above `. Compared to metrics such as the subset accuracy, the Hamming loss, or the -F1 score, ROC doesn't require optimizing a threshold for each label. The -:func:`roc_auc_score` function can also be used in multi-class classification, -if the predicted outputs have been binarized. +F1 score, ROC doesn't require optimizing a threshold for each label. + +The :func:`roc_auc_score` function can also be used in multi-class +classification, where the predicted class labels are provided in +an array with values from 0 to `n_classes`, and the scores are the +probability estimates that a sample belongs to a particular class. .. 
image:: ../auto_examples/model_selection/images/sphx_glr_plot_roc_002.png From e037993b590495f6e273f4fcea355259986bfff8 Mon Sep 17 00:00:00 2001 From: Kathy Chen Date: Wed, 26 Oct 2016 08:39:48 -0400 Subject: [PATCH 11/31] added a valueerror test case after checking code coverage for new functionality --- sklearn/metrics/ranking.py | 2 +- sklearn/metrics/tests/test_ranking.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 26e4c851ce12c..4a77889107bd9 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -282,7 +282,7 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): "".format(average_options)) multiclass_options = ("ovo", "ovr") if multiclass not in multiclass_options: - raise ValueError("{0} is not supported for multiclass ROC AUC" + raise ValueError("'{0}' is not supported for multiclass ROC AUC" "".format(multiclass)) check_consistent_length(y_true, y_score) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index df82d388e5191..93f0b4fa59f83 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -441,6 +441,9 @@ def test_auc_score_multi_error(): assert_raise_message(ValueError, "average has to be one of ('macro', 'weighted')", roc_auc_score, y_true, y_pred, average="micro") + assert_raise_message(ValueError, + "'invalid' is not supported for multiclass ROC AUC", + roc_auc_score, y_true, y_pred, multiclass="invalid") def test_auc_score_non_binary_class(): From acb977e37265cd04e704cec2d7983e44c94f09d9 Mon Sep 17 00:00:00 2001 From: kchen17 Date: Sat, 19 Nov 2016 20:57:10 -0500 Subject: [PATCH 12/31] sample_weight can only be None, documentation update --- sklearn/metrics/ranking.py | 13 +++++++++---- sklearn/metrics/tests/test_ranking.py | 19 +++++++++++++------ 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 4a77889107bd9..fd9d4546b55dc 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -278,13 +278,18 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): # validation for multiclass parameter specifications average_options = ("macro", "weighted") if average not in average_options: - raise ValueError("average has to be one of {0}" + raise ValueError("Parameter 'average' must be one of {0}." "".format(average_options)) multiclass_options = ("ovo", "ovr") if multiclass not in multiclass_options: - raise ValueError("'{0}' is not supported for multiclass ROC AUC" - "".format(multiclass)) - + raise ValueError("Parameter multiclass='{0}' is not supported" + " for multiclass ROC AUC. 'multiclass' must be" + " one of {1}.".format( + multiclass, multiclass_options)) + if sample_weight is not None: + raise ValueError("Parameter 'sample_weight' is not supported" + " for multiclass ROC AUC. 
'sample_weight' must" + " be None.") check_consistent_length(y_true, y_score) check_array(y_true, ensure_2d=False) check_array(y_score) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 93f0b4fa59f83..dd3c38b844c08 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -435,15 +435,22 @@ def test_auc_score_multi_error(): rng = check_random_state(404) y_pred = rng.rand(10) y_true = rng.randint(0, 3, size=10) - assert_raise_message(ValueError, - "average has to be one of ('macro', 'weighted')", + average_error_msg = ("Parameter 'average' must be one of " + + "('macro', 'weighted').") + assert_raise_message(ValueError, average_error_msg, roc_auc_score, y_true, y_pred, average="sample") - assert_raise_message(ValueError, - "average has to be one of ('macro', 'weighted')", + assert_raise_message(ValueError, average_error_msg, roc_auc_score, y_true, y_pred, average="micro") - assert_raise_message(ValueError, - "'invalid' is not supported for multiclass ROC AUC", + multiclass_error_msg = ("Parameter multiclass='invalid' is not " + + "supported for multiclass ROC AUC. 'multiclass' " + + "must be one of ('ovo', 'ovr').") + assert_raise_message(ValueError, multiclass_error_msg, roc_auc_score, y_true, y_pred, multiclass="invalid") + sample_weight_error_msg = ("Parameter 'sample_weight' is not supported " + + "for multiclass ROC AUC. 'sample_weight' " + + "must be None.") + assert_raise_message(ValueError, sample_weight_error_msg, + roc_auc_score, y_true, y_pred, sample_weight=[]) def test_auc_score_non_binary_class(): From 8dd96651ab4746687fc1aaa0de79a6c7ef6bdc25 Mon Sep 17 00:00:00 2001 From: kchen17 Date: Mon, 21 Nov 2016 16:28:02 -0500 Subject: [PATCH 13/31] model_evaluation documentation update --- doc/modules/model_evaluation.rst | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index a03530cf80733..d3cbd381b9220 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -252,6 +252,7 @@ Some also work in the multilabel case: precision_recall_fscore_support precision_score recall_score + roc_auc_score zero_one_loss @@ -263,14 +264,6 @@ Some work with binary and multilabel (but not multiclass) problems: average_precision_score -And some work with binary, multilabel, and multiclass problems: - -.. autosummary:: - :template: function.rst - - roc_auc_score - - In the following sub-sections, we will describe each of those functions, preceded by some notes on common API and metric definition. @@ -987,8 +980,12 @@ Compared to metrics such as the subset accuracy, the Hamming loss, or the F1 score, ROC doesn't require optimizing a threshold for each label. The :func:`roc_auc_score` function can also be used in multi-class -classification, where the predicted class labels are provided in -an array with values from 0 to `n_classes`, and the scores are the +classification. Two averaging strategies are currently supported: the +Hand & Till (2001) one-vs-one algorithm computes the average of the pairwise +ROC AUC scores, and the Provost & Domingos (2001) one-vs-rest algorithm +computes the average of the ROC AUC scores for each class against +all other classes. In both cases, the predicted class labels are provided in +an array with values from 0 to `n_classes`, and the scores correspond to the probability estimates that a sample belongs to a particular class. 
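To make the one-vs-rest average described above concrete: it can be reproduced with the existing binary scorer plus label binarization, mirroring what the tests check. This is only an illustration of the arithmetic, not the exact code path of the patch (which goes through ``MultiLabelBinarizer`` and ``_average_binary_score``).

    import numpy as np
    from sklearn.preprocessing import label_binarize
    from sklearn.metrics import roc_auc_score

    y_true = np.array([0, 1, 2, 2])
    y_scores = np.array([[1.0, 0.0, 0.0],
                         [0.1, 0.5, 0.4],
                         [0.1, 0.1, 0.8],
                         [0.3, 0.3, 0.4]])

    # Column k of the binarized labels is the indicator "sample belongs to class k".
    y_bin = label_binarize(y_true, classes=[0, 1, 2])

    # One binary AUC per class-vs-rest problem.
    per_class = [roc_auc_score(y_bin[:, k], y_scores[:, k]) for k in range(3)]

    macro_ovr = np.mean(per_class)                         # uniform class distribution
    prevalence = np.bincount(y_true) / float(len(y_true))
    weighted_ovr = np.sum(prevalence * np.asarray(per_class))  # a priori class distribution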
From 7f652aa1416d7b5d037d0e0a8453bb515e09893b Mon Sep 17 00:00:00 2001 From: kchen17 Date: Tue, 29 Nov 2016 19:27:13 -0500 Subject: [PATCH 14/31] docstring update in _average_multiclass_ovo_score --- sklearn/metrics/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 27b3946b91373..db546c235b222 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -141,13 +141,13 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): ---------- y_true : array, shape = [n_samples] True multiclass labels. - Currently only handles labels with values 0 to n_classes - 1. + Assumes labels have been recoded to 0 to n_classes. y_score : array, shape = [n_samples, n_classes] Target scores corresponding to probability estimates of a sample belonging to a particular class - average : string, ['macro' (default), 'weighted'] + average : 'macro' or 'weighted', default='macro' ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. Classes @@ -167,7 +167,7 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): Returns ------- score : float - Average the sum of the pairwise binary metric scores + Average the sum of pairwise binary metric scores """ n_labels = len(np.unique(y_true)) auc_scores_sum = 0 From 4016c0cf93cb03fbe875eeeceb7bb7d1ccf41929 Mon Sep 17 00:00:00 2001 From: kchen17 Date: Tue, 29 Nov 2016 21:14:08 -0500 Subject: [PATCH 15/31] update documentation for multiclass base function and test --- sklearn/metrics/base.py | 24 +++++++++++---------- sklearn/metrics/tests/test_ranking.py | 30 ++++++++++++++------------- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index db546c235b222..35f26752b3da7 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -176,18 +176,20 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): if pos == neg: continue ix = np.in1d(y_true.ravel(), [pos, neg]) - y_true_filtered = y_true[np.where(ix.reshape(y_true.shape))] - y_score_filtered = y_score[np.where(ix)] - - y_true_10 = y_true_filtered == pos - y_true_01 = y_true_filtered == neg - score_10 = binary_metric( - y_true_10, y_score_filtered[:, pos]) - score_01 = binary_metric( - y_true_01, y_score_filtered[:, neg]) - binary_avg_auc = (score_10 + score_01)/2.0 + y_true_filtered = y_true[ix.reshape(y_true.shape)] + y_score_filtered = y_score[ix] + + # compute score with `pos` as the positive class + class_a = y_true_filtered == pos + # compute score with `neg` as the positive class + class_b = y_true_filtered == neg + score_class_a = binary_metric( + class_a, y_score_filtered[:, pos]) + score_class_b = binary_metric( + class_b, y_score_filtered[:, neg]) + binary_avg_auc = (score_class_a + score_class_b) / 2.0 if average == "weighted": - probability_pos = np.sum(y_true == pos)/float(y_true.size) + probability_pos = np.sum(y_true == pos) / float(y_true.size) auc_scores_sum += binary_avg_auc * probability_pos else: auc_scores_sum += binary_avg_auc diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index dd3c38b844c08..0dae60c9b5f27 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -392,41 +392,43 @@ def test_auc_errors(): def test_multi_auc_toydata(): - y_true = np.array([0, 1, 2]) - y_scores = np.array( - [[0.714, 0.072, 0.214], 
[0.837, 0.143, 0.020], [0.714, 0.072, 0.214]]) - assert_almost_equal( - roc_auc_score(y_true, y_scores, multiclass="ovo"), 0.666666666663) - + # Tests the unweighted, one-vs-one multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. y_true = np.array([0, 1, 0, 2]) y_scores = np.array( [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) assert_almost_equal( roc_auc_score(y_true, y_scores, multiclass="ovo"), 0.75) - y_true = np.array([0, 1, 0, 2]) - y_scores = np.array( - [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) + # Tests the weighted, one-vs-one multiclass ROC AUC algorithm + # on the same input assert_almost_equal( roc_auc_score(y_true, y_scores, multiclass="ovo", average="weighted"), 0.23958333333) + # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. y_true = np.array([0, 1, 2, 2]) y_scores = np.array( [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]]) + # Compute the expected result by individually computing the 'one-vs-rest' + # ROC AUC scores for classes 0, 1, and 2. out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0]) out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1]) out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2]) - result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 - assert_almost_equal( - roc_auc_score(y_true, y_scores, multiclass="ovr", average="weighted"), - result_weighted) - result_unweighted = (out_0 + out_1 + out_2)/3.0 + assert_almost_equal( roc_auc_score(y_true, y_scores, multiclass="ovr"), result_unweighted) + # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm + # on the same input + result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovr", average="weighted"), + result_weighted) + def test_auc_score_multi_error(): # Test that roc_auc_score function returns an error when trying From 86327d9139ebbed5aa123d35e650b93e3c86f6d7 Mon Sep 17 00:00:00 2001 From: kchen17 Date: Thu, 1 Dec 2016 15:56:57 -0500 Subject: [PATCH 16/31] updated the documentation with equations and citations --- doc/modules/model_evaluation.rst | 42 +++++++++++++++++++++++++++++--- sklearn/metrics/base.py | 9 ++++--- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index d3cbd381b9220..4f0761c32857d 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -981,13 +981,37 @@ F1 score, ROC doesn't require optimizing a threshold for each label. The :func:`roc_auc_score` function can also be used in multi-class classification. Two averaging strategies are currently supported: the -Hand & Till (2001) one-vs-one algorithm computes the average of the pairwise -ROC AUC scores, and the Provost & Domingos (2001) one-vs-rest algorithm +[HT2001]_ one-vs-one algorithm computes the average of the pairwise +ROC AUC scores, and the [PD2000]_ one-vs-rest algorithm computes the average of the ROC AUC scores for each class against all other classes. In both cases, the predicted class labels are provided in -an array with values from 0 to `n_classes`, and the scores correspond to the +an array with values from 0 to ``n_classes``, and the scores correspond to the probability estimates that a sample belongs to a particular class. 
+**One-vs-one Algorithm** +[HT2001]_: AUC of each class against each other, computing +the AUC of all possible pairwise combinations :math:`c(c-1)` for a +:math:`c`-dimensional classifier. + +Using the uniform class distribution: + +.. math:: \frac{1}{c(c-1)}\sum_{j=1}^c\sum_{k \neq j}^c \textnormal{AUC}(j, k) + +Using the a priori class distribution: + +.. math:: \frac{1}{c(c-1)}\sum_{j=1}^c\sum_{k \neq j}^c p(j)\textnormal{AUC}(j, k) + +**One-vs-rest Algorithm** +[PD2000]_: AUC of each class against the rest. This treats +a :math:`c`-dimensional classifier as :math:`c` two-dimensional classifiers. + +Using the uniform class distribution: + +.. math:: \frac{\sum_{j=1}^c \textnormal{AUC}(j, \textnormal{rest}_j)}{c} + +Using the a priori class distribution + +.. math:: \frac{\sum_{j=1}^c p(j)\textnormal{AUC}(j, \textnormal{rest}_j)}{c} .. image:: ../auto_examples/model_selection/images/sphx_glr_plot_roc_002.png :target: ../auto_examples/model_selection/plot_roc.html @@ -1008,6 +1032,18 @@ probability estimates that a sample belongs to a particular class. for an example of using ROC to model species distribution. +.. topic:: References: + + .. [HT2001] Hand, D.J. and Till, R.J., 2001. `A simple generalisation + of the area under the ROC curve for multiple class classification problems. + `_ + Machine learning, 45(2), pp.171-186. + .. [PD2000] Provost, F. and Domingos, P., 2000. + `Well-trained PETs: Improving probability estimation trees. + `_ + CeDER Working Paper #IS-00-04, Stern School of Business, New + York University, NY 10012. + .. _zero_one_loss: Zero one loss diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 35f26752b3da7..b77cc60429b43 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -158,10 +158,11 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): binary_metric : callable, the binary metric function to use. Accepts the following as input - y_true' : array, shape = [n_samples'] - Some sub-array of y_true - y_score' : array, shape = [n_samples'] - Target scores corresponding to the probability estimates + y_true_target : array, shape = [n_samples_target] + Some sub-array of y_true for a pair of classes designated + positive and negative in the one-vs-one scheme. + y_score_target : array, shape = [n_samples_target] + Scores corresponding to the probability estimates of a sample belonging to the designated positive class label Returns From 271b882e62539bbb23870e74c2a0f45b2e798a56 Mon Sep 17 00:00:00 2001 From: kchen17 Date: Tue, 6 Dec 2016 17:11:08 -0500 Subject: [PATCH 17/31] improve the test cases for one-vs-one multiclass roc auc --- sklearn/metrics/tests/test_ranking.py | 43 ++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 0dae60c9b5f27..4529eb6ece9ed 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -392,19 +392,48 @@ def test_auc_errors(): def test_multi_auc_toydata(): - # Tests the unweighted, one-vs-one multiclass ROC AUC algorithm + # Tests the one-vs-one multiclass ROC AUC algorithm # on a small example, representative of an expected use case. 
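    # With three classes there are three unordered pairs; the unweighted
    # one-vs-one score is the mean of the three pairwise average AUCs
    # computed by hand below.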
y_true = np.array([0, 1, 0, 2]) + n_labels = len(np.unique(y_true)) y_scores = np.array( [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) - assert_almost_equal( - roc_auc_score(y_true, y_scores, multiclass="ovo"), 0.75) - # Tests the weighted, one-vs-one multiclass ROC AUC algorithm - # on the same input + # Used to compute the expected output. + # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1], [0.1, 0.3, 0.35]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0], [0.8, 0.4, 0.5]) + average_score_01 = (score_01 + score_10) / 2. + + # Consider labels 0 and 2: + score_02 = roc_auc_score([1, 1, 0], [0.1, 0.35, 0]) + score_20 = roc_auc_score([0, 0, 1], [0.1, 0.15, 0.8]) + average_score_02 = (score_02 + score_20) / 2. + + # Consider labels 1 and 2: + score_12 = roc_auc_score([1, 0], [0.4, 0.2]) + score_21 = roc_auc_score([0, 1], [0.3, 0.8]) + average_score_12 = (score_12 + score_21) / 2. + + ovo_coefficient = 2. / (n_labels * (n_labels - 1)) + # Unweighted, one-vs-one multiclass ROC AUC algorithm + sum_avg_scores = average_score_01 + average_score_02 + average_score_12 + ovo_unweighted_score = ovo_coefficient * sum_avg_scores + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovo"), + ovo_unweighted_score) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + # Each term is weighted by the posterior for the positive label. + weighted_sum_avg_scores = (0.5 * average_score_01 + + 0.5 * average_score_02 + + 0.25 * average_score_12) + ovo_weighted_score = ovo_coefficient * weighted_sum_avg_scores assert_almost_equal( roc_auc_score(y_true, y_scores, multiclass="ovo", average="weighted"), - 0.23958333333) + ovo_weighted_score) # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm # on a small example, representative of an expected use case. @@ -416,7 +445,7 @@ def test_multi_auc_toydata(): out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0]) out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1]) out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2]) - result_unweighted = (out_0 + out_1 + out_2)/3.0 + result_unweighted = (out_0 + out_1 + out_2)/3. 
assert_almost_equal( roc_auc_score(y_true, y_scores, multiclass="ovr"), From d70ae6c03fd378050c05e6b806d475303e3f8ba2 Mon Sep 17 00:00:00 2001 From: kchen17 Date: Tue, 6 Dec 2016 19:09:48 -0500 Subject: [PATCH 18/31] ovo uses bincount and ovr uses labelbinarizer --- sklearn/metrics/base.py | 15 +++++++-------- sklearn/metrics/ranking.py | 4 ++-- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index b77cc60429b43..b28902745b021 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -171,11 +171,10 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): Average the sum of pairwise binary metric scores """ n_labels = len(np.unique(y_true)) + label_counts = np.bincount(y_true) auc_scores_sum = 0 for pos in range(n_labels): - for neg in range(n_labels): - if pos == neg: - continue + for neg in range(pos + 1, n_labels): ix = np.in1d(y_true.ravel(), [pos, neg]) y_true_filtered = y_true[ix.reshape(y_true.shape)] y_score_filtered = y_score[ix] @@ -188,10 +187,10 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): class_a, y_score_filtered[:, pos]) score_class_b = binary_metric( class_b, y_score_filtered[:, neg]) - binary_avg_auc = (score_class_a + score_class_b) / 2.0 + binary_avg_score = (score_class_a + score_class_b) / 2.0 if average == "weighted": - probability_pos = np.sum(y_true == pos) / float(y_true.size) - auc_scores_sum += binary_avg_auc * probability_pos + probability_pos = label_counts[pos] / float(y_true.size) + auc_scores_sum += binary_avg_score * probability_pos else: - auc_scores_sum += binary_avg_auc - return auc_scores_sum / (n_labels * (n_labels - 1.0)) + auc_scores_sum += binary_avg_score + return 2. * auc_scores_sum / (n_labels * (n_labels - 1)) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index fd9d4546b55dc..b84e9172a1731 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -23,7 +23,7 @@ import numpy as np from scipy.sparse import csr_matrix -from ..preprocessing import MultiLabelBinarizer +from ..preprocessing import LabelBinarizer from ..utils import assert_all_finite from ..utils import check_consistent_length from ..utils import column_or_1d, check_array @@ -299,7 +299,7 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): _binary_roc_auc_score, y_true, y_score, average) else: y_true = y_true.reshape((-1, 1)) - y_true_multilabel = MultiLabelBinarizer().fit_transform(y_true) + y_true_multilabel = LabelBinarizer().fit_transform(y_true) return _average_binary_score(_binary_roc_auc_score, y_true_multilabel, y_score, average) From bf8c5fe200a01fc65f3b39fa782aa7880393c8e4 Mon Sep 17 00:00:00 2001 From: kchen17 Date: Tue, 6 Dec 2016 21:29:42 -0500 Subject: [PATCH 19/31] fixed a coefficient bug in the weighted HT2001 algorithm and refactored the implementation --- sklearn/metrics/base.py | 16 ++++++++-------- sklearn/metrics/tests/test_ranking.py | 7 ++++--- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index b28902745b021..4157fc8f7a1b1 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -171,8 +171,8 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): Average the sum of pairwise binary metric scores """ n_labels = len(np.unique(y_true)) - label_counts = np.bincount(y_true) - auc_scores_sum = 0 + apriori_label_distribution = np.bincount(y_true) / float(y_true.size) + label_scores 
= np.zeros(n_labels) for pos in range(n_labels): for neg in range(pos + 1, n_labels): ix = np.in1d(y_true.ravel(), [pos, neg]) @@ -188,9 +188,9 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): score_class_b = binary_metric( class_b, y_score_filtered[:, neg]) binary_avg_score = (score_class_a + score_class_b) / 2.0 - if average == "weighted": - probability_pos = label_counts[pos] / float(y_true.size) - auc_scores_sum += binary_avg_score * probability_pos - else: - auc_scores_sum += binary_avg_score - return 2. * auc_scores_sum / (n_labels * (n_labels - 1)) + label_scores[pos] += binary_avg_score + if average == "weighted": + label_scores = np.multiply(apriori_label_distribution, label_scores) + return 2. * np.sum(label_scores) / (n_labels - 1) + else: + return 2. * np.sum(label_scores) / (n_labels * (n_labels - 1)) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 4529eb6ece9ed..cdebcfea8565f 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -417,10 +417,10 @@ def test_multi_auc_toydata(): score_21 = roc_auc_score([0, 1], [0.3, 0.8]) average_score_12 = (score_12 + score_21) / 2. - ovo_coefficient = 2. / (n_labels * (n_labels - 1)) # Unweighted, one-vs-one multiclass ROC AUC algorithm sum_avg_scores = average_score_01 + average_score_02 + average_score_12 - ovo_unweighted_score = ovo_coefficient * sum_avg_scores + ovo_unweighted_coefficient = 2. / (n_labels * (n_labels - 1)) + ovo_unweighted_score = ovo_unweighted_coefficient * sum_avg_scores assert_almost_equal( roc_auc_score(y_true, y_scores, multiclass="ovo"), ovo_unweighted_score) @@ -430,7 +430,8 @@ def test_multi_auc_toydata(): weighted_sum_avg_scores = (0.5 * average_score_01 + 0.5 * average_score_02 + 0.25 * average_score_12) - ovo_weighted_score = ovo_coefficient * weighted_sum_avg_scores + ovo_weighted_coefficient = 2. / (n_labels - 1) + ovo_weighted_score = ovo_weighted_coefficient * weighted_sum_avg_scores assert_almost_equal( roc_auc_score(y_true, y_scores, multiclass="ovo", average="weighted"), ovo_weighted_score) From ed7e840a9e8a30f9860a1f0ce629cf7503f95265 Mon Sep 17 00:00:00 2001 From: kchen17 Date: Tue, 6 Dec 2016 21:31:35 -0500 Subject: [PATCH 20/31] update the docs with the correct equation --- doc/modules/model_evaluation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 4f0761c32857d..4e4bf43704ed4 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -999,7 +999,7 @@ Using the uniform class distribution: Using the a priori class distribution: -.. math:: \frac{1}{c(c-1)}\sum_{j=1}^c\sum_{k \neq j}^c p(j)\textnormal{AUC}(j, k) +.. math:: \frac{1}{c-1}\sum_{j=1}^c\sum_{k \neq j}^c p(j)\textnormal{AUC}(j, k) **One-vs-rest Algorithm** [PD2000]_: AUC of each class against the rest. 
This treats From b2214c8695e561f4e5b58bc56cbfd9aeec2e8588 Mon Sep 17 00:00:00 2001 From: kchen17 Date: Wed, 7 Dec 2016 15:47:32 -0500 Subject: [PATCH 21/31] updating the plot_roc example with plots for one vs one --- examples/model_selection/plot_roc.py | 73 ++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 10 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 475d7b4aba7a6..556fac0148e87 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -53,9 +53,8 @@ X = iris.data y = iris.target -# Binarize the output -y = label_binarize(y, classes=[0, 1, 2]) -n_classes = y.shape[1] +classes = np.unique(y) +n_classes = len(classes) # Add noisy features to make the problem harder random_state = np.random.RandomState(0) @@ -72,17 +71,17 @@ y_score = classifier.fit(X_train, y_train).decision_function(X_test) # Compute ROC curve and ROC area for each class + +# Binarize y_test to compute the ROC curve +y_test_binarized = label_binarize(y_test, classes=classes) + fpr = dict() tpr = dict() roc_auc = dict() for i in range(n_classes): - fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) + fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_score[:, i]) roc_auc[i] = auc(fpr[i], tpr[i]) -# Compute micro-average ROC curve and ROC area -fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) -roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) - ############################################################################## # Plot of a ROC curve for a specific class @@ -101,7 +100,11 @@ ############################################################################## -# Plot ROC curves for the multiclass problem +# Plot ROC curves for the multiclass problem using One vs. Rest classification. + +# Compute micro-average ROC curve and ROC area +fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) +roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) # Compute macro-average ROC curve and ROC area @@ -143,6 +146,56 @@ plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') -plt.title('Some extension of Receiver operating characteristic to multi-class') +plt.title('An extension of Receiver operating characteristic to multi-class ' + 'using One-vs-Rest') plt.legend(loc="lower right") plt.show() + +# TODO: roc_auc_score weighted and unweighted + + +############################################################################## +# Plot ROC curves for the multiclass problem using One vs. One classification. + +for pos in range(n_classes): + for neg in range(pos + 1, n_classes): + # Filter `y_test` and `y_score` to only consider the current + # class pair: `pos` and `neg`. 
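        # np.in1d yields a boolean mask selecting the samples whose true
        # label is either `pos` or `neg`; each label of the pair then takes
        # a turn as the positive class for its own ROC curve.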
+ class_pair_indices = np.in1d(y_test, [pos, neg]) + y_true_filtered = y_test[class_pair_indices] + y_score_filtered = y_score[class_pair_indices] + + # Compute ROC curve and ROC area with `pos` as the positive class + class_a = y_true_filtered == pos + fpr[(pos, neg)], tpr[(pos, neg)], _ = roc_curve( + class_a, y_score_filtered[:, pos]) + roc_auc[(pos, neg)] = auc(fpr[(pos, neg)], tpr[(pos, neg)]) + + # Compute ROC curve and ROC area with `neg` as the positive class + class_b = y_true_filtered == neg + fpr[(neg, pos)], tpr[(neg, pos)], _ = roc_curve( + class_b, y_score_filtered[:, neg]) + roc_auc[(neg, pos)] = auc(fpr[(neg, pos)], tpr[(neg, pos)]) + +plt.figure() +for pos in range(n_classes): + for neg in range(pos + 1, n_classes): + plt.plot(fpr[(pos, neg)], tpr[(pos, neg)], lw=lw, + label='ROC curve of class {0} against class {1} ' + '(area = {2:0.2f})'.format( + pos, neg, roc_auc[(pos, neg)])) + plt.plot(fpr[(neg, pos)], tpr[(neg, pos)], lw=lw, + label='ROC curve of class {0} against class {1} ' + '(area = {2:0.2f})'.format( + neg, pos, roc_auc[(neg, pos)])) +plt.plot([0, 1], [0, 1], 'k--', lw=lw) +plt.xlim([0.0, 1.0]) +plt.ylim([0.0, 1.05]) +plt.xlabel('False Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('An extension of Receiver operating characteristic to multi-class ' + 'using One-vs-One') +plt.legend(bbox_to_anchor=(1.8, 0.55)) +plt.show() + +# TODO: roc_auc_scores From d2aa2a028b0fa5cf5014a0245c7b5bf72727ffb1 Mon Sep 17 00:00:00 2001 From: kchen17 Date: Sat, 10 Dec 2016 16:46:18 -0500 Subject: [PATCH 22/31] updating plot_roc with roc_auc_score functions --- examples/model_selection/plot_roc.py | 49 +++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 556fac0148e87..8b02931e10eaf 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -19,16 +19,40 @@ ------------------- ROC curves are typically used in binary classification to study the output of -a classifier. In order to extend ROC curve and ROC area to multi-class -or multi-label classification, it is necessary to binarize the output. One ROC -curve can be drawn per label, but one can also draw a ROC curve by considering +a classifier. Extensions of ROC curve and ROC area to multi-class +or multi-label classification can use the One-vs-Rest or One-vs-One scheme. + +One-vs-Rest +----------- + +The output is binarized and one ROC curve can be drawn per label, +where the label is the positive class and all other labels are +the negative class. + +The ROC area can be approximated by taking the average--unweighted or weighted +by the a priori class distribution--of the one-vs-rest ROC areas. + +One can also draw a ROC curve by considering each element of the label indicator matrix as a binary prediction (micro-averaging). -Another evaluation measure for multi-class classification is +Another evaluation measure for one-vs-rest multi-class classification is macro-averaging, which gives equal weight to the classification of each label. +One-vs-One +---------- + +Two ROC curves can be drawn per pair of labels because either of the two +labels can be considered the positive class. + +The ROC area can be approximated by first computing the +approximate ROC area of each label pair as the average of the +two ROC AUC scores corresponding to that pair. 
The One-vs-One +approximation of a multi-class ROC AUC score is the average-- +unweighted or weighted by the a priori class distribution--across +all of the pairwise approximate ROC AUC scores. + .. note:: See also :func:`sklearn.metrics.roc_auc_score`, @@ -42,7 +66,7 @@ from itertools import cycle from sklearn import svm, datasets -from sklearn.metrics import roc_curve, auc +from sklearn.metrics import roc_curve, auc, roc_auc_score from sklearn.model_selection import train_test_split from sklearn.preprocessing import label_binarize from sklearn.multiclass import OneVsRestClassifier @@ -151,8 +175,12 @@ plt.legend(loc="lower right") plt.show() -# TODO: roc_auc_score weighted and unweighted - +# Compute the One-vs-Rest ROC AUC score, weighted and unweighted +unweighted_roc_auc_ovr = roc_auc_score(y_test, y_score, multiclass="ovr") +weighted_roc_auc_ovr = roc_auc_score( + y_test, y_score, multiclass="ovr", average="weighted") +print("One-vs-Rest ROC AUC scores: {0} (unweighted), {1} (weighted)".format( + unweighted_roc_auc_ovr, weighted_roc_auc_ovr)) ############################################################################## # Plot ROC curves for the multiclass problem using One vs. One classification. @@ -198,4 +226,9 @@ plt.legend(bbox_to_anchor=(1.8, 0.55)) plt.show() -# TODO: roc_auc_scores +# Compute the One-vs-One ROC AUC score, weighted and unweighted +unweighted_roc_auc_ovo = roc_auc_score(y_test, y_score, multiclass="ovo") +weighted_roc_auc_ovo = roc_auc_score( + y_test, y_score, multiclass="ovo", average="weighted") +print("One-vs-One ROC AUC scores: {0} (unweighted), {1} (weighted)".format( + unweighted_roc_auc_ovo, weighted_roc_auc_ovo)) From fde6387f649f5827d30f14f57760635c8de1039d Mon Sep 17 00:00:00 2001 From: kchen17 Date: Tue, 14 Mar 2017 16:41:03 -0400 Subject: [PATCH 23/31] updating with some style changes and including the invariant under permutation test --- sklearn/metrics/base.py | 46 +++++++++++++++------------ sklearn/metrics/ranking.py | 24 +++++++------- sklearn/metrics/tests/test_ranking.py | 41 ++++++++++++++++++++---- 3 files changed, 72 insertions(+), 39 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index 4157fc8f7a1b1..b0a104d85f606 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -13,6 +13,7 @@ # License: BSD 3 clause from __future__ import division +import itertools import numpy as np @@ -171,26 +172,29 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): Average the sum of pairwise binary metric scores """ n_labels = len(np.unique(y_true)) - apriori_label_distribution = np.bincount(y_true) / float(y_true.size) - label_scores = np.zeros(n_labels) - for pos in range(n_labels): - for neg in range(pos + 1, n_labels): - ix = np.in1d(y_true.ravel(), [pos, neg]) - y_true_filtered = y_true[ix.reshape(y_true.shape)] - y_score_filtered = y_score[ix] - - # compute score with `pos` as the positive class - class_a = y_true_filtered == pos - # compute score with `neg` as the positive class - class_b = y_true_filtered == neg - score_class_a = binary_metric( - class_a, y_score_filtered[:, pos]) - score_class_b = binary_metric( - class_b, y_score_filtered[:, neg]) - binary_avg_score = (score_class_a + score_class_b) / 2.0 - label_scores[pos] += binary_avg_score + pos_and_neg_prevalence = [] + label_scores = [] + for pos, neg in itertools.combinations(range(n_labels), 2): + pos_ix = y_true == pos + ix = np.logical_or(pos_ix, y_true == neg) + + pos_and_neg_prevalence.append(float(np.sum(ix)) / 
len(y_true)) + + y_score_filtered = y_score[ix] + + class_a = pos_ix[ix] + class_b = np.logical_not(class_a) + + score_class_a = binary_metric( + class_a, y_score_filtered[:, pos]) + score_class_b = binary_metric( + class_b, y_score_filtered[:, neg]) + binary_avg_score = (score_class_a + score_class_b) / 2. + label_scores.append(binary_avg_score) + if average == "weighted": - label_scores = np.multiply(apriori_label_distribution, label_scores) - return 2. * np.sum(label_scores) / (n_labels - 1) + label_scores = np.multiply(np.array(pos_and_neg_prevalence), + np.array(label_scores)) + return np.sum(label_scores) / (n_labels * (n_labels - 1)) else: - return 2. * np.sum(label_scores) / (n_labels * (n_labels - 1)) + return 2 * np.sum(label_scores) / (n_labels * (n_labels - 1)) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index b84e9172a1731..9862f5c660f81 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -260,7 +260,6 @@ def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", 0.75 """ - def _binary_roc_auc_score(y_true, y_score, sample_weight=None): if len(np.unique(y_true)) != 2: raise ValueError("Only one class present in y_true. ROC AUC score " @@ -270,16 +269,18 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): sample_weight=sample_weight) return auc(fpr, tpr, reorder=True) - if type_of_target(y_true) != "multiclass": - return _average_binary_score( - _binary_roc_auc_score, y_true, y_score, average, - sample_weight=sample_weight) - else: + y_type = type_of_target(y_true) + y_true = check_array(y_true, ensure_2d=False) + y_score = check_array(y_score, ensure_2d=False) + + if y_type == "multiclass" or (y_type == "binary" and + y_score.ndim == 2 and + y_score.shape[1] > 2): # validation for multiclass parameter specifications average_options = ("macro", "weighted") if average not in average_options: - raise ValueError("Parameter 'average' must be one of {0}." - "".format(average_options)) + raise ValueError("Parameter 'average' must be one of {0} for" + " multiclass problems.".format(average_options)) multiclass_options = ("ovo", "ovr") if multiclass not in multiclass_options: raise ValueError("Parameter multiclass='{0}' is not supported" @@ -290,9 +291,6 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): raise ValueError("Parameter 'sample_weight' is not supported" " for multiclass ROC AUC. 
'sample_weight' must" " be None.") - check_consistent_length(y_true, y_score) - check_array(y_true, ensure_2d=False) - check_array(y_score) if multiclass == "ovo": return _average_multiclass_ovo_score( @@ -302,6 +300,10 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): y_true_multilabel = LabelBinarizer().fit_transform(y_true) return _average_binary_score(_binary_roc_auc_score, y_true_multilabel, y_score, average) + else: + return _average_binary_score( + _binary_roc_auc_score, y_true, y_score, average, + sample_weight=sample_weight) def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index cdebcfea8565f..76bb202247179 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -391,7 +391,7 @@ def test_auc_errors(): assert_raises(ValueError, auc, [1.0, 0.0, 0.5], [0.0, 0.0, 0.0]) -def test_multi_auc_toydata(): +def test_multi_ovo_auc_toydata(): # Tests the one-vs-one multiclass ROC AUC algorithm # on a small example, representative of an expected use case. y_true = np.array([0, 1, 0, 2]) @@ -427,15 +427,17 @@ def test_multi_auc_toydata(): # Weighted, one-vs-one multiclass ROC AUC algorithm # Each term is weighted by the posterior for the positive label. - weighted_sum_avg_scores = (0.5 * average_score_01 + - 0.5 * average_score_02 + - 0.25 * average_score_12) - ovo_weighted_coefficient = 2. / (n_labels - 1) + weighted_sum_avg_scores = (0.75 * average_score_01 + + 0.75 * average_score_02 + + 0.50 * average_score_12) + ovo_weighted_coefficient = 1. / (n_labels * (n_labels - 1)) ovo_weighted_score = ovo_weighted_coefficient * weighted_sum_avg_scores assert_almost_equal( roc_auc_score(y_true, y_scores, multiclass="ovo", average="weighted"), ovo_weighted_score) + +def test_multi_ovr_auc_toydata(): # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm # on a small example, representative of an expected use case. 
y_true = np.array([0, 1, 2, 2]) @@ -460,6 +462,30 @@ def test_multi_auc_toydata(): result_weighted) +def test_multi_auc_score_under_permutation(): + y_score = np.random.rand(100, 3) + y_score[:, 2] += .1 + y_score[:, 1] -= .1 + y_true = np.argmax(y_score, axis=1) + y_true[np.random.randint(len(y_score), size=20)] = np.random.randint( + 2, size=20) + for multiclass in ['ovr', 'ovo']: + for average in ['macro', 'weighted']: + same_score_under_permutation = None + for perm in [[0, 1, 2], [0, 2, 1], [1, 0, 2], + [1, 2, 0], [2, 0, 1], [2, 1, 0]]: + inv_perm = np.zeros(3, dtype=int) + inv_perm[perm] = np.arange(3) + y_score_perm = y_score[:, inv_perm] + y_true_perm = np.take(perm, y_true) + score = roc_auc_score(y_true_perm, y_score_perm, + multiclass=multiclass, average=average) + if not same_score_under_permutation: + same_score_under_permutation = score + else: + assert_almost_equal(score, same_score_under_permutation) + + def test_auc_score_multi_error(): # Test that roc_auc_score function returns an error when trying # to compute multiclass AUC for parameters where an output @@ -468,7 +494,7 @@ def test_auc_score_multi_error(): y_pred = rng.rand(10) y_true = rng.randint(0, 3, size=10) average_error_msg = ("Parameter 'average' must be one of " + - "('macro', 'weighted').") + "('macro', 'weighted') for multiclass problems.") assert_raise_message(ValueError, average_error_msg, roc_auc_score, y_true, y_pred, average="sample") assert_raise_message(ValueError, average_error_msg, @@ -686,7 +712,8 @@ def test_score_scale_invariance(): # issue #3864 (and others), where overly aggressive rounding was causing # problems for users with very small y_score values y_true, _, probas_pred = make_prediction(binary=True) - + print(y_true.shape) + print(probas_pred.shape) roc_auc = roc_auc_score(y_true, probas_pred) roc_auc_scaled_up = roc_auc_score(y_true, 100 * probas_pred) roc_auc_scaled_down = roc_auc_score(y_true, 1e-6 * probas_pred) From 12592f43107156a4e045e45eafcf085d282cb937 Mon Sep 17 00:00:00 2001 From: kchen17 Date: Tue, 14 Mar 2017 16:58:56 -0400 Subject: [PATCH 24/31] flake8 on plot_roc --- examples/model_selection/plot_roc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 8b02931e10eaf..2124c54f93feb 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -196,13 +196,13 @@ # Compute ROC curve and ROC area with `pos` as the positive class class_a = y_true_filtered == pos fpr[(pos, neg)], tpr[(pos, neg)], _ = roc_curve( - class_a, y_score_filtered[:, pos]) + class_a, y_score_filtered[:, pos]) roc_auc[(pos, neg)] = auc(fpr[(pos, neg)], tpr[(pos, neg)]) # Compute ROC curve and ROC area with `neg` as the positive class class_b = y_true_filtered == neg fpr[(neg, pos)], tpr[(neg, pos)], _ = roc_curve( - class_b, y_score_filtered[:, neg]) + class_b, y_score_filtered[:, neg]) roc_auc[(neg, pos)] = auc(fpr[(neg, pos)], tpr[(neg, pos)]) plt.figure() @@ -211,11 +211,11 @@ plt.plot(fpr[(pos, neg)], tpr[(pos, neg)], lw=lw, label='ROC curve of class {0} against class {1} ' '(area = {2:0.2f})'.format( - pos, neg, roc_auc[(pos, neg)])) + pos, neg, roc_auc[(pos, neg)])) plt.plot(fpr[(neg, pos)], tpr[(neg, pos)], lw=lw, label='ROC curve of class {0} against class {1} ' '(area = {2:0.2f})'.format( - neg, pos, roc_auc[(neg, pos)])) + neg, pos, roc_auc[(neg, pos)])) plt.plot([0, 1], [0, 1], 'k--', lw=lw) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) From 
b4e498e13c13f92ea6bf63ef1da7edfeba7535a0 Mon Sep 17 00:00:00 2001 From: kchen17 Date: Tue, 14 Mar 2017 18:12:01 -0400 Subject: [PATCH 25/31] over-indent flake8 fix --- examples/model_selection/plot_roc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 2124c54f93feb..3382a006ed6ef 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -211,11 +211,11 @@ plt.plot(fpr[(pos, neg)], tpr[(pos, neg)], lw=lw, label='ROC curve of class {0} against class {1} ' '(area = {2:0.2f})'.format( - pos, neg, roc_auc[(pos, neg)])) + pos, neg, roc_auc[(pos, neg)])) plt.plot(fpr[(neg, pos)], tpr[(neg, pos)], lw=lw, label='ROC curve of class {0} against class {1} ' '(area = {2:0.2f})'.format( - neg, pos, roc_auc[(neg, pos)])) + neg, pos, roc_auc[(neg, pos)])) plt.plot([0, 1], [0, 1], 'k--', lw=lw) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) From 5688ade948b4356340ec05496e9f199eaae65302 Mon Sep 17 00:00:00 2001 From: kchen17 Date: Sat, 25 Mar 2017 22:50:41 -0400 Subject: [PATCH 26/31] fixed the normalization equation for ovo --- sklearn/metrics/base.py | 54 +++++++++++++-------------- sklearn/metrics/tests/test_ranking.py | 12 ++---- 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index b0a104d85f606..d2edee1902126 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -171,30 +171,30 @@ def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): score : float Average the sum of pairwise binary metric scores """ - n_labels = len(np.unique(y_true)) - pos_and_neg_prevalence = [] - label_scores = [] - for pos, neg in itertools.combinations(range(n_labels), 2): - pos_ix = y_true == pos - ix = np.logical_or(pos_ix, y_true == neg) - - pos_and_neg_prevalence.append(float(np.sum(ix)) / len(y_true)) - - y_score_filtered = y_score[ix] - - class_a = pos_ix[ix] - class_b = np.logical_not(class_a) - - score_class_a = binary_metric( - class_a, y_score_filtered[:, pos]) - score_class_b = binary_metric( - class_b, y_score_filtered[:, neg]) - binary_avg_score = (score_class_a + score_class_b) / 2. 
- label_scores.append(binary_avg_score) - - if average == "weighted": - label_scores = np.multiply(np.array(pos_and_neg_prevalence), - np.array(label_scores)) - return np.sum(label_scores) / (n_labels * (n_labels - 1)) - else: - return 2 * np.sum(label_scores) / (n_labels * (n_labels - 1)) + n_classes = len(np.unique(y_true)) + n_pairs = n_classes * (n_classes - 1) // 2 + prevalence = np.empty(n_pairs) + pair_scores = np.empty(n_pairs) + + ix = 0 + for a, b in itertools.combinations(range(n_classes), 2): + a_mask = y_true == a + ab_mask = np.logical_or(a_mask, y_true == b) + + prevalence[ix] = np.sum(ab_mask) / len(y_true) + + y_score_filtered = y_score[ab_mask] + + a_true = a_mask[ab_mask] + b_true = np.logical_not(a_true) + + a_true_score = binary_metric( + a_true, y_score_filtered[:, a]) + b_true_score = binary_metric( + b_true, y_score_filtered[:, b]) + binary_avg_score = (a_true_score + b_true_score) / 2 + pair_scores[ix] = binary_avg_score + + ix += 1 + return (np.average(pair_scores, weights=prevalence) + if average == "weighted" else np.average(pair_scores)) diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 76bb202247179..1c5a78d441482 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -427,11 +427,9 @@ def test_multi_ovo_auc_toydata(): # Weighted, one-vs-one multiclass ROC AUC algorithm # Each term is weighted by the posterior for the positive label. - weighted_sum_avg_scores = (0.75 * average_score_01 + - 0.75 * average_score_02 + - 0.50 * average_score_12) - ovo_weighted_coefficient = 1. / (n_labels * (n_labels - 1)) - ovo_weighted_score = ovo_weighted_coefficient * weighted_sum_avg_scores + pair_scores = [average_score_01, average_score_02, average_score_12] + prevalence = [0.75, 0.75, 0.50] + ovo_weighted_score = np.average(pair_scores, weights=prevalence) assert_almost_equal( roc_auc_score(y_true, y_scores, multiclass="ovo", average="weighted"), ovo_weighted_score) @@ -480,7 +478,7 @@ def test_multi_auc_score_under_permutation(): y_true_perm = np.take(perm, y_true) score = roc_auc_score(y_true_perm, y_score_perm, multiclass=multiclass, average=average) - if not same_score_under_permutation: + if same_score_under_permutation is None: same_score_under_permutation = score else: assert_almost_equal(score, same_score_under_permutation) @@ -712,8 +710,6 @@ def test_score_scale_invariance(): # issue #3864 (and others), where overly aggressive rounding was causing # problems for users with very small y_score values y_true, _, probas_pred = make_prediction(binary=True) - print(y_true.shape) - print(probas_pred.shape) roc_auc = roc_auc_score(y_true, probas_pred) roc_auc_scaled_up = roc_auc_score(y_true, 100 * probas_pred) roc_auc_scaled_down = roc_auc_score(y_true, 1e-6 * probas_pred) From a784dbc24a5eec1fcbb654acdf81bd32f2f4f48a Mon Sep 17 00:00:00 2001 From: kchen17 Date: Sun, 26 Mar 2017 10:26:41 -0400 Subject: [PATCH 27/31] beginning the update to examples, needs to be tested --- examples/model_selection/plot_roc.py | 77 +++++++++++++--------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 3382a006ed6ef..3187c0e80df87 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -25,9 +25,9 @@ One-vs-Rest ----------- -The output is binarized and one ROC curve can be drawn per label, -where the label is the positive class and all other labels are -the 
negative class. +The output is binarized and one ROC curve is drawn per label, +where label is set to be the positive class and all other labels (the "rest") +are considered the negative class. The ROC area can be approximated by taking the average--unweighted or weighted by the a priori class distribution--of the one-vs-rest ROC areas. @@ -44,14 +44,13 @@ ---------- Two ROC curves can be drawn per pair of labels because either of the two -labels can be considered the positive class. +labels can be considered the positive class (and the other the negative +class). The ROC area of a label pair is approximated taking the average of these +two ROC AUC scores. -The ROC area can be approximated by first computing the -approximate ROC area of each label pair as the average of the -two ROC AUC scores corresponding to that pair. The One-vs-One -approximation of a multi-class ROC AUC score is the average-- -unweighted or weighted by the a priori class distribution--across -all of the pairwise approximate ROC AUC scores. +The One-vs-One approximation of a multi-class ROC AUC score is the average-- +unweighted or weighted by class prevalence--across all of the pairwise +approximate ROC AUC scores. .. note:: @@ -63,7 +62,7 @@ import numpy as np import matplotlib.pyplot as plt -from itertools import cycle +from itertools import combinations, cycle from sklearn import svm, datasets from sklearn.metrics import roc_curve, auc, roc_auc_score @@ -185,37 +184,35 @@ ############################################################################## # Plot ROC curves for the multiclass problem using One vs. One classification. -for pos in range(n_classes): - for neg in range(pos + 1, n_classes): - # Filter `y_test` and `y_score` to only consider the current - # class pair: `pos` and `neg`. - class_pair_indices = np.in1d(y_test, [pos, neg]) - y_true_filtered = y_test[class_pair_indices] - y_score_filtered = y_score[class_pair_indices] - - # Compute ROC curve and ROC area with `pos` as the positive class - class_a = y_true_filtered == pos - fpr[(pos, neg)], tpr[(pos, neg)], _ = roc_curve( - class_a, y_score_filtered[:, pos]) - roc_auc[(pos, neg)] = auc(fpr[(pos, neg)], tpr[(pos, neg)]) - - # Compute ROC curve and ROC area with `neg` as the positive class - class_b = y_true_filtered == neg - fpr[(neg, pos)], tpr[(neg, pos)], _ = roc_curve( - class_b, y_score_filtered[:, neg]) - roc_auc[(neg, pos)] = auc(fpr[(neg, pos)], tpr[(neg, pos)]) +for a, b in combinations(range(n_classes), 2): + # Filter `y_test` and `y_score` to only consider the current + # `a` and `b` class pair. 
+ ab_mask = np.logical_or(y_test == a, y_true == b) + y_true_filtered = y_test[ab_mask] + y_score_filtered = y_score[ab_mask] + + # Compute ROC curve and ROC area with `a` as the positive class + class_a = y_true_filtered == a + fpr[(a, b)], tpr[(a, b)], _ = roc_curve( + class_a, y_score_filtered[:, a]) + roc_auc[(a, b)] = auc(fpr[(a, b)], tpr[(a, b)]) + + # Compute ROC curve and ROC area with `b` as the positive class + class_b = y_true_filtered == b + fpr[(b, a)], tpr[(b, a)], _ = roc_curve( + class_b, y_score_filtered[:, b]) + roc_auc[(b, a)] = auc(fpr[(b, a)], tpr[(b, a)]) plt.figure() -for pos in range(n_classes): - for neg in range(pos + 1, n_classes): - plt.plot(fpr[(pos, neg)], tpr[(pos, neg)], lw=lw, - label='ROC curve of class {0} against class {1} ' - '(area = {2:0.2f})'.format( - pos, neg, roc_auc[(pos, neg)])) - plt.plot(fpr[(neg, pos)], tpr[(neg, pos)], lw=lw, - label='ROC curve of class {0} against class {1} ' - '(area = {2:0.2f})'.format( - neg, pos, roc_auc[(neg, pos)])) +for a, b in combinations(range(n_classes), 2): + plt.plot(fpr[(a, b)], tpr[(a, b)], lw=lw, + label='ROC curve of class {0} against class {1} ' + '(area = {2:0.2f})'.format( + a, b, roc_auc[(a, b)])) + plt.plot(fpr[(b, a)], tpr[(b, a)], lw=lw, + label='ROC curve of class {0} against class {1} ' + '(area = {2:0.2f})'.format( + b, a, roc_auc[(b, a)])) plt.plot([0, 1], [0, 1], 'k--', lw=lw) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) From 0138a757e53dce319d6d0c2263f6f02450d1c648 Mon Sep 17 00:00:00 2001 From: Kathy SSH Date: Thu, 27 Apr 2017 13:43:03 +0000 Subject: [PATCH 28/31] updating the documentation for model_evaluation with new citations --- doc/modules/model_evaluation.rst | 36 +++++++++++++++++++------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 4e4bf43704ed4..c057580877f11 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -980,36 +980,36 @@ Compared to metrics such as the subset accuracy, the Hamming loss, or the F1 score, ROC doesn't require optimizing a threshold for each label. The :func:`roc_auc_score` function can also be used in multi-class -classification. Two averaging strategies are currently supported: the -[HT2001]_ one-vs-one algorithm computes the average of the pairwise -ROC AUC scores, and the [PD2000]_ one-vs-rest algorithm +classification. [F2009]_ Two averaging strategies are currently supported: the +one-vs-one algorithm computes the average of the pairwise +ROC AUC scores, and the one-vs-rest algorithm computes the average of the ROC AUC scores for each class against all other classes. In both cases, the predicted class labels are provided in an array with values from 0 to ``n_classes``, and the scores correspond to the probability estimates that a sample belongs to a particular class. **One-vs-one Algorithm** -[HT2001]_: AUC of each class against each other, computing +The AUC of each class against each other, computing the AUC of all possible pairwise combinations :math:`c(c-1)` for a :math:`c`-dimensional classifier. -Using the uniform class distribution: +[HT2001]_ Using the uniform class distribution: .. math:: \frac{1}{c(c-1)}\sum_{j=1}^c\sum_{k \neq j}^c \textnormal{AUC}(j, k) -Using the a priori class distribution: +[F2009]_ Weighted by the prevalence of classes `j` and `k`: -.. math:: \frac{1}{c-1}\sum_{j=1}^c\sum_{k \neq j}^c p(j)\textnormal{AUC}(j, k) +.. 
math:: \frac{1}{c-1}\sum_{j=1}^c\sum_{k \neq j}^c p(j \cup k)\textnormal{AUC}(j, k) **One-vs-rest Algorithm** -[PD2000]_: AUC of each class against the rest. This treats +AUC of each class against the rest. This treats a :math:`c`-dimensional classifier as :math:`c` two-dimensional classifiers. -Using the uniform class distribution: +[F2006]_ Using the uniform class distribution: .. math:: \frac{\sum_{j=1}^c \textnormal{AUC}(j, \textnormal{rest}_j)}{c} -Using the a priori class distribution +[F2001]_ Weighted by the a priori class distribution: .. math:: \frac{\sum_{j=1}^c p(j)\textnormal{AUC}(j, \textnormal{rest}_j)}{c} @@ -1034,15 +1034,21 @@ Using the a priori class distribution .. topic:: References: + .. [F2001] Fawcett, T., 2001. `Using rule sets to maximize + ROC performance `_ + In Data Mining, 2001. + Proceedings IEEE International Conference, pp. 131-138. + .. [F2006] Fawcett, T., 2006. `An introduction to ROC analysis. + `_ + Pattern Recognition Letters, 27(8), pp. 861-874. + .. [F2009] Ferri, C., Hernandez-Orallo, J., and Modroiu, R., 2009. + `An experimental comparison of performance measures for classification. + `_ + Pattern Recognition Letters, 30(1), pp. 27-38. .. [HT2001] Hand, D.J. and Till, R.J., 2001. `A simple generalisation of the area under the ROC curve for multiple class classification problems. `_ Machine learning, 45(2), pp.171-186. - .. [PD2000] Provost, F. and Domingos, P., 2000. - `Well-trained PETs: Improving probability estimation trees. - `_ - CeDER Working Paper #IS-00-04, Stern School of Business, New - York University, NY 10012. .. _zero_one_loss: From ad5e93ba22c2dac9719d5adff79f848bbff97837 Mon Sep 17 00:00:00 2001 From: Kathy SSH Date: Thu, 27 Apr 2017 14:35:33 +0000 Subject: [PATCH 29/31] fix flake8 error in plot_roc --- examples/model_selection/plot_roc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 3187c0e80df87..fefd1d9dc1dca 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -45,8 +45,8 @@ Two ROC curves can be drawn per pair of labels because either of the two labels can be considered the positive class (and the other the negative -class). The ROC area of a label pair is approximated taking the average of these -two ROC AUC scores. +class). The ROC area of a label pair is approximated taking the average of +these two ROC AUC scores. The One-vs-One approximation of a multi-class ROC AUC score is the average-- unweighted or weighted by class prevalence--across all of the pairwise @@ -187,7 +187,7 @@ for a, b in combinations(range(n_classes), 2): # Filter `y_test` and `y_score` to only consider the current # `a` and `b` class pair. 
- ab_mask = np.logical_or(y_test == a, y_true == b) + ab_mask = np.logical_or(y_test == a, y_test == b) y_true_filtered = y_test[ab_mask] y_score_filtered = y_score[ab_mask] From 165513a34ef3c5c065e6fe11a13f91c8e37765a5 Mon Sep 17 00:00:00 2001 From: Kathy SSH Date: Thu, 27 Apr 2017 14:35:46 +0000 Subject: [PATCH 30/31] update with sample weights in ovr case --- sklearn/metrics/ranking.py | 9 +++++---- sklearn/metrics/tests/test_ranking.py | 17 +++++++++-------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 9862f5c660f81..6bae5c6759cb6 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -289,8 +289,8 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): multiclass, multiclass_options)) if sample_weight is not None: raise ValueError("Parameter 'sample_weight' is not supported" - " for multiclass ROC AUC. 'sample_weight' must" - " be None.") + " for multiclass one-vs-one ROC AUC." + " 'sample_weight' must be None in this case.") if multiclass == "ovo": return _average_multiclass_ovo_score( @@ -298,8 +298,9 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): else: y_true = y_true.reshape((-1, 1)) y_true_multilabel = LabelBinarizer().fit_transform(y_true) - return _average_binary_score(_binary_roc_auc_score, - y_true_multilabel, y_score, average) + return _average_binary_score( + _binary_roc_auc_score, y_true_multilabel, y_score, average, + sample_weight=sample_weight) else: return _average_binary_score( _binary_roc_auc_score, y_true, y_score, average, diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 1c5a78d441482..12eea9a97f2dc 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -446,7 +446,7 @@ def test_multi_ovr_auc_toydata(): out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0]) out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1]) out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2]) - result_unweighted = (out_0 + out_1 + out_2)/3. + result_unweighted = (out_0 + out_1 + out_2) / 3. assert_almost_equal( roc_auc_score(y_true, y_scores, multiclass="ovr"), @@ -491,22 +491,23 @@ def test_auc_score_multi_error(): rng = check_random_state(404) y_pred = rng.rand(10) y_true = rng.randint(0, 3, size=10) - average_error_msg = ("Parameter 'average' must be one of " + + average_error_msg = ("Parameter 'average' must be one of " "('macro', 'weighted') for multiclass problems.") assert_raise_message(ValueError, average_error_msg, roc_auc_score, y_true, y_pred, average="sample") assert_raise_message(ValueError, average_error_msg, roc_auc_score, y_true, y_pred, average="micro") - multiclass_error_msg = ("Parameter multiclass='invalid' is not " + - "supported for multiclass ROC AUC. 'multiclass' " + + multiclass_error_msg = ("Parameter multiclass='invalid' is not " + "supported for multiclass ROC AUC. 'multiclass' " "must be one of ('ovo', 'ovr').") assert_raise_message(ValueError, multiclass_error_msg, roc_auc_score, y_true, y_pred, multiclass="invalid") - sample_weight_error_msg = ("Parameter 'sample_weight' is not supported " + - "for multiclass ROC AUC. 'sample_weight' " + - "must be None.") + sample_weight_error_msg = ("Parameter 'sample_weight' is not supported " + "for multiclass one-vs-one ROC AUC. 
" + "'sample_weight' must be None in this case.") assert_raise_message(ValueError, sample_weight_error_msg, - roc_auc_score, y_true, y_pred, sample_weight=[]) + roc_auc_score, y_true, y_pred, + multiclass="ovo", sample_weight=[]) def test_auc_score_non_binary_class(): From 9530511e172816cc706646363f7f2a15d439ee9e Mon Sep 17 00:00:00 2001 From: kchen17 Date: Wed, 7 Jun 2017 10:14:49 -0400 Subject: [PATCH 31/31] modifications to plot_roc example to improve readability, fixed one bug --- examples/model_selection/plot_roc.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index fefd1d9dc1dca..3a233eb5b79ae 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -126,7 +126,8 @@ # Plot ROC curves for the multiclass problem using One vs. Rest classification. # Compute micro-average ROC curve and ROC area -fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) +fpr["micro"], tpr["micro"], _ = roc_curve( + y_test_binarized.ravel(), y_score.ravel()) roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) # Compute macro-average ROC curve and ROC area @@ -169,7 +170,7 @@ plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') -plt.title('An extension of Receiver operating characteristic to multi-class ' +plt.title('An extension of ROC to multi-class ' 'using One-vs-Rest') plt.legend(loc="lower right") plt.show() @@ -206,11 +207,11 @@ plt.figure() for a, b in combinations(range(n_classes), 2): plt.plot(fpr[(a, b)], tpr[(a, b)], lw=lw, - label='ROC curve of class {0} against class {1} ' + label='ROC curve: class {0} vs. {1} ' '(area = {2:0.2f})'.format( a, b, roc_auc[(a, b)])) plt.plot(fpr[(b, a)], tpr[(b, a)], lw=lw, - label='ROC curve of class {0} against class {1} ' + label='ROC curve: class {0} vs. {1} ' '(area = {2:0.2f})'.format( b, a, roc_auc[(b, a)])) plt.plot([0, 1], [0, 1], 'k--', lw=lw) @@ -218,9 +219,9 @@ plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') -plt.title('An extension of Receiver operating characteristic to multi-class ' +plt.title('An extension of ROC to multi-class ' 'using One-vs-One') -plt.legend(bbox_to_anchor=(1.8, 0.55)) +plt.legend(bbox_to_anchor=(1.1, 0.30)) plt.show() # Compute the One-vs-One ROC AUC score, weighted and unweighted
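A self-contained sketch (not taken from the patches above) of the pairwise averaging that `_average_multiclass_ovo_score` performs, under the same assumption that labels are coded 0 to n_classes - 1 so that they index the columns of `y_score`; it can be used to sanity-check the toy values in the tests:

    import itertools
    import numpy as np
    from sklearn.metrics import roc_auc_score

    def ovo_roc_auc(y_true, y_score, average="macro"):
        # Hand & Till style average of pairwise binary ROC AUC scores.
        y_true = np.asarray(y_true)
        y_score = np.asarray(y_score)
        pair_scores, prevalence = [], []
        for a, b in itertools.combinations(np.unique(y_true), 2):
            ab_mask = np.logical_or(y_true == a, y_true == b)
            prevalence.append(ab_mask.mean())
            a_true = y_true[ab_mask] == a
            # Each class of the pair takes a turn as the positive class.
            score_a = roc_auc_score(a_true, y_score[ab_mask, a])
            score_b = roc_auc_score(~a_true, y_score[ab_mask, b])
            pair_scores.append((score_a + score_b) / 2.)
        weights = prevalence if average == "weighted" else None
        return np.average(pair_scores, weights=weights)

On the iris example, `ovo_roc_auc(y_test, y_score, average="weighted")` should agree with `roc_auc_score(y_test, y_score, multiclass="ovo", average="weighted")` up to floating point error.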