diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 1acfe10280e86..fc66096aca45d 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -39,6 +39,12 @@ if [[ "$DISTRIB" == "conda" ]]; then # Configure the conda environment and put it in the path using the # provided versions + if [[ "$USE_PYTEST" == "true" ]]; then + TEST_RUNNER_PACKAGE=pytest + else + TEST_RUNNER_PACKAGE=nose + fi + if [[ "$INSTALL_MKL" == "true" ]]; then conda create -n testenv --yes python=$PYTHON_VERSION pip \ pytest pytest-cov numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ diff --git a/doc/README.md b/doc/README.md index 18d4bde4f5862..e60a66caf719c 100644 --- a/doc/README.md +++ b/doc/README.md @@ -1,5 +1,6 @@ # Documentation for scikit-learn + This directory contains the full manual and web site as displayed at http://scikit-learn.org. See http://scikit-learn.org/dev/developers/contributing.html#documentation for diff --git a/doc/datasets/conftest.py b/doc/datasets/conftest.py new file mode 100644 index 0000000000000..0ccc0bced9ee7 --- /dev/null +++ b/doc/datasets/conftest.py @@ -0,0 +1,75 @@ +from os.path import exists +from os.path import join + +import numpy as np + +from sklearn.utils.testing import SkipTest +from sklearn.utils.testing import check_skip_network +from sklearn.datasets import get_data_home +from sklearn.utils.testing import install_mldata_mock +from sklearn.utils.testing import uninstall_mldata_mock + + +def setup_labeled_faces(): + data_home = get_data_home() + if not exists(join(data_home, 'lfw_home')): + raise SkipTest("Skipping dataset loading doctests") + + +def setup_mldata(): + # setup mock urllib2 module to avoid downloading from mldata.org + install_mldata_mock({ + 'mnist-original': { + 'data': np.empty((70000, 784)), + 'label': np.repeat(np.arange(10, dtype='d'), 7000), + }, + 'iris': { + 'data': np.empty((150, 4)), + }, + 'datasets-uci-iris': { + 'double0': np.empty((150, 4)), + 'class': np.empty((150,)), + }, + }) + + +def teardown_mldata(): + uninstall_mldata_mock() + + +def setup_rcv1(): + check_skip_network() + # skip the test in rcv1.rst if the dataset is not already loaded + rcv1_dir = join(get_data_home(), "RCV1") + if not exists(rcv1_dir): + raise SkipTest("Download RCV1 dataset to run this test.") + + +def setup_twenty_newsgroups(): + data_home = get_data_home() + if not exists(join(data_home, '20news_home')): + raise SkipTest("Skipping dataset loading doctests") + + +def setup_working_with_text_data(): + check_skip_network() + + +def pytest_runtest_setup(item): + fname = item.fspath.strpath + if fname.endswith('datasets/labeled_faces.rst'): + setup_labeled_faces() + elif fname.endswith('datasets/mldata.rst'): + setup_mldata() + elif fname.endswith('datasets/rcv1.rst'): + setup_rcv1() + elif fname.endswith('datasets/twenty_newsgroups.rst'): + setup_twenty_newsgroups() + elif fname.endswith('datasets/working_with_text_data.rst'): + setup_working_with_text_data() + + +def pytest_runtest_teardown(item): + fname = item.fspath.strpath + if fname.endswith('datasets/mldata.rst'): + teardown_mldata() diff --git a/doc/datasets/mldata_fixture.py b/doc/datasets/mldata_fixture.py new file mode 100644 index 0000000000000..0ee5cccaa0f5e --- /dev/null +++ b/doc/datasets/mldata_fixture.py @@ -0,0 +1,30 @@ +"""Fixture module to skip the datasets loading when offline + +Mock urllib2 access to mldata.org and create a temporary data folder. 
+""" + +import numpy as np + +from sklearn.utils.testing import install_mldata_mock +from sklearn.utils.testing import uninstall_mldata_mock + + +def setup_module(): + # setup mock urllib2 module to avoid downloading from mldata.org + install_mldata_mock({ + 'mnist-original': { + 'data': np.empty((70000, 784)), + 'label': np.repeat(np.arange(10, dtype='d'), 7000), + }, + 'iris': { + 'data': np.empty((150, 4)), + }, + 'datasets-uci-iris': { + 'double0': np.empty((150, 4)), + 'class': np.empty((150,)), + }, + }) + + +def teardown_module(): + uninstall_mldata_mock() diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 9e970478dcf71..c95a996debb58 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1150,7 +1150,7 @@ With :math:`P'(j) = |V_j| / N`. The mutual information (MI) between :math:`U` and :math:`V` is calculated by: .. math:: \text{MI}(U, V) = \sum_{i=1}^{|U|}\sum_{j=1}^{|V|}P(i, j)\log\left(\frac{P(i,j)}{P(i)P'(j)}\right) - + where :math:`P(i, j) = |U_i \cap V_j| / N` is the probability that an object picked at random falls into both classes :math:`U_i` and :math:`V_j`. diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 7505ec1819554..fb8d7bf5ee406 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -309,15 +309,16 @@ Some also work in the multilabel case: precision_recall_fscore_support precision_score recall_score + roc_auc_score zero_one_loss + And some work with binary and multilabel (but not multiclass) problems: .. autosummary:: :template: function.rst average_precision_score - roc_auc_score In the following sub-sections, we will describe each of those functions, @@ -1169,10 +1170,41 @@ In multi-label classification, the :func:`roc_auc_score` function is extended by averaging over the labels as :ref:`above `. Compared to metrics such as the subset accuracy, the Hamming loss, or the -F1 score, ROC doesn't require optimizing a threshold for each label. The -:func:`roc_auc_score` function can also be used in multi-class classification, -if the predicted outputs have been binarized. +F1 score, ROC doesn't require optimizing a threshold for each label. + +The :func:`roc_auc_score` function can also be used in multi-class +classification. [F2009]_ Two averaging strategies are currently supported: the +one-vs-one algorithm computes the average of the pairwise +ROC AUC scores, and the one-vs-rest algorithm +computes the average of the ROC AUC scores for each class against +all other classes. In both cases, the predicted class labels are provided in +an array with values from 0 to ``n_classes``, and the scores correspond to the +probability estimates that a sample belongs to a particular class. + +**One-vs-one Algorithm** +The AUC of each class against each other, computing +the AUC of all possible pairwise combinations :math:`c(c-1)` for a +:math:`c`-dimensional classifier. + +[HT2001]_ Using the uniform class distribution: + +.. math:: \frac{1}{c(c-1)}\sum_{j=1}^c\sum_{k \neq j}^c \textnormal{AUC}(j, k) + +[F2009]_ Weighted by the prevalence of classes `j` and `k`: + +.. math:: \frac{1}{c-1}\sum_{j=1}^c\sum_{k \neq j}^c p(j \cup k)\textnormal{AUC}(j, k) +**One-vs-rest Algorithm** +AUC of each class against the rest. This treats +a :math:`c`-dimensional classifier as :math:`c` two-dimensional classifiers. + +[F2006]_ Using the uniform class distribution: + +.. 
math:: \frac{\sum_{j=1}^c \textnormal{AUC}(j, \textnormal{rest}_j)}{c} + +[F2001]_ Weighted by the a priori class distribution: + +.. math:: \frac{\sum_{j=1}^c p(j)\textnormal{AUC}(j, \textnormal{rest}_j)}{c} .. image:: ../auto_examples/model_selection/images/sphx_glr_plot_roc_002.png :target: ../auto_examples/model_selection/plot_roc.html @@ -1193,6 +1225,24 @@ if the predicted outputs have been binarized. for an example of using ROC to model species distribution. +.. topic:: References: + + .. [F2001] Fawcett, T., 2001. `Using rule sets to maximize + ROC performance `_ + In Data Mining, 2001. + Proceedings IEEE International Conference, pp. 131-138. + .. [F2006] Fawcett, T., 2006. `An introduction to ROC analysis. + `_ + Pattern Recognition Letters, 27(8), pp. 861-874. + .. [F2009] Ferri, C., Hernandez-Orallo, J., and Modroiu, R., 2009. + `An experimental comparison of performance measures for classification. + `_ + Pattern Recognition Letters, 30(1), pp. 27-38. + .. [HT2001] Hand, D.J. and Till, R.J., 2001. `A simple generalisation + of the area under the ROC curve for multiple class classification problems. + `_ + Machine learning, 45(2), pp.171-186. + .. _zero_one_loss: Zero one loss diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst index 0b95cb168bf91..1c64290cbfb53 100644 --- a/doc/modules/multiclass.rst +++ b/doc/modules/multiclass.rst @@ -436,4 +436,4 @@ Regressor Chain Regressor chains (see :class:`RegressorChain`) is analogous to ClassifierChain as a way of combining a number of regressions into a single multi-target model that is capable of exploiting -correlations among targets. \ No newline at end of file +correlations among targets. diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index e21dbe4c1ada8..ee7c986318da4 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -302,6 +302,7 @@ This can be confirmed on a independent testing set with similar remarks:: ... # doctest: +ELLIPSIS +SKIP array([ 0.01..., 0.25..., 0.46..., 0.60... , 0.94...]) + Mapping to a Gaussian distribution ---------------------------------- diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index caae4c9a1645d..36afb823048df 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -67,6 +67,7 @@ Preprocessing :issue:`10210` by :user:`Eric Chang ` and :user:`Maniteja Nandana `. +>>>>>>> upstream/master Model evaluation diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 475d7b4aba7a6..3a233eb5b79ae 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -19,16 +19,39 @@ ------------------- ROC curves are typically used in binary classification to study the output of -a classifier. In order to extend ROC curve and ROC area to multi-class -or multi-label classification, it is necessary to binarize the output. One ROC -curve can be drawn per label, but one can also draw a ROC curve by considering +a classifier. Extensions of ROC curve and ROC area to multi-class +or multi-label classification can use the One-vs-Rest or One-vs-One scheme. + +One-vs-Rest +----------- + +The output is binarized and one ROC curve is drawn per label, +where label is set to be the positive class and all other labels (the "rest") +are considered the negative class. + +The ROC area can be approximated by taking the average--unweighted or weighted +by the a priori class distribution--of the one-vs-rest ROC areas. 
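A minimal sketch of the two one-vs-rest averages defined above, computed with the existing binary roc_auc_score and label_binarize only; the toy labels and score matrix are invented for illustration and are not part of this patch.

import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

y_true = np.array([0, 1, 2, 2, 1, 0])
y_score = np.array([[0.7, 0.2, 0.1],
                    [0.2, 0.5, 0.3],
                    [0.1, 0.2, 0.7],
                    [0.2, 0.3, 0.5],
                    [0.3, 0.4, 0.3],
                    [0.6, 0.3, 0.1]])

classes = np.unique(y_true)
y_bin = label_binarize(y_true, classes=classes)

# AUC(j, rest_j): each class j against all the other classes pooled together
per_class_auc = np.array([roc_auc_score(y_bin[:, j], y_score[:, j])
                          for j in range(len(classes))])

# Uniform average over classes ([F2006]) and average weighted by the
# a priori class distribution p(j) ([F2001])
prevalence = y_bin.mean(axis=0)
uniform_ovr = per_class_auc.mean()
weighted_ovr = np.average(per_class_auc, weights=prevalence)
print(uniform_ovr, weighted_ovr)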
+ +One can also draw a ROC curve by considering each element of the label indicator matrix as a binary prediction (micro-averaging). -Another evaluation measure for multi-class classification is +Another evaluation measure for one-vs-rest multi-class classification is macro-averaging, which gives equal weight to the classification of each label. +One-vs-One +---------- + +Two ROC curves can be drawn per pair of labels because either of the two +labels can be considered the positive class (and the other the negative +class). The ROC area of a label pair is approximated taking the average of +these two ROC AUC scores. + +The One-vs-One approximation of a multi-class ROC AUC score is the average-- +unweighted or weighted by class prevalence--across all of the pairwise +approximate ROC AUC scores. + .. note:: See also :func:`sklearn.metrics.roc_auc_score`, @@ -39,10 +62,10 @@ import numpy as np import matplotlib.pyplot as plt -from itertools import cycle +from itertools import combinations, cycle from sklearn import svm, datasets -from sklearn.metrics import roc_curve, auc +from sklearn.metrics import roc_curve, auc, roc_auc_score from sklearn.model_selection import train_test_split from sklearn.preprocessing import label_binarize from sklearn.multiclass import OneVsRestClassifier @@ -53,9 +76,8 @@ X = iris.data y = iris.target -# Binarize the output -y = label_binarize(y, classes=[0, 1, 2]) -n_classes = y.shape[1] +classes = np.unique(y) +n_classes = len(classes) # Add noisy features to make the problem harder random_state = np.random.RandomState(0) @@ -72,17 +94,17 @@ y_score = classifier.fit(X_train, y_train).decision_function(X_test) # Compute ROC curve and ROC area for each class + +# Binarize y_test to compute the ROC curve +y_test_binarized = label_binarize(y_test, classes=classes) + fpr = dict() tpr = dict() roc_auc = dict() for i in range(n_classes): - fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) + fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_score[:, i]) roc_auc[i] = auc(fpr[i], tpr[i]) -# Compute micro-average ROC curve and ROC area -fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) -roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) - ############################################################################## # Plot of a ROC curve for a specific class @@ -101,7 +123,12 @@ ############################################################################## -# Plot ROC curves for the multiclass problem +# Plot ROC curves for the multiclass problem using One vs. Rest classification. 
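The micro- and macro-averaged curves used in this section are computed in a part of the example that this hunk elides. As a rough sketch only, reusing the fpr, tpr, roc_auc dicts and n_classes defined above, the macro-average interpolates every per-class curve onto a shared false-positive-rate grid and averages the true positive rates:

# Collect all FPR breakpoints, then interpolate each class's TPR on that grid
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes

fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])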
+ +# Compute micro-average ROC curve and ROC area +fpr["micro"], tpr["micro"], _ = roc_curve( + y_test_binarized.ravel(), y_score.ravel()) +roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) # Compute macro-average ROC curve and ROC area @@ -143,6 +170,63 @@ plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') -plt.title('Some extension of Receiver operating characteristic to multi-class') +plt.title('An extension of ROC to multi-class ' + 'using One-vs-Rest') plt.legend(loc="lower right") plt.show() + +# Compute the One-vs-Rest ROC AUC score, weighted and unweighted +unweighted_roc_auc_ovr = roc_auc_score(y_test, y_score, multiclass="ovr") +weighted_roc_auc_ovr = roc_auc_score( + y_test, y_score, multiclass="ovr", average="weighted") +print("One-vs-Rest ROC AUC scores: {0} (unweighted), {1} (weighted)".format( + unweighted_roc_auc_ovr, weighted_roc_auc_ovr)) + +############################################################################## +# Plot ROC curves for the multiclass problem using One vs. One classification. + +for a, b in combinations(range(n_classes), 2): + # Filter `y_test` and `y_score` to only consider the current + # `a` and `b` class pair. + ab_mask = np.logical_or(y_test == a, y_test == b) + y_true_filtered = y_test[ab_mask] + y_score_filtered = y_score[ab_mask] + + # Compute ROC curve and ROC area with `a` as the positive class + class_a = y_true_filtered == a + fpr[(a, b)], tpr[(a, b)], _ = roc_curve( + class_a, y_score_filtered[:, a]) + roc_auc[(a, b)] = auc(fpr[(a, b)], tpr[(a, b)]) + + # Compute ROC curve and ROC area with `b` as the positive class + class_b = y_true_filtered == b + fpr[(b, a)], tpr[(b, a)], _ = roc_curve( + class_b, y_score_filtered[:, b]) + roc_auc[(b, a)] = auc(fpr[(b, a)], tpr[(b, a)]) + +plt.figure() +for a, b in combinations(range(n_classes), 2): + plt.plot(fpr[(a, b)], tpr[(a, b)], lw=lw, + label='ROC curve: class {0} vs. {1} ' + '(area = {2:0.2f})'.format( + a, b, roc_auc[(a, b)])) + plt.plot(fpr[(b, a)], tpr[(b, a)], lw=lw, + label='ROC curve: class {0} vs. 
{1} ' + '(area = {2:0.2f})'.format( + b, a, roc_auc[(b, a)])) +plt.plot([0, 1], [0, 1], 'k--', lw=lw) +plt.xlim([0.0, 1.0]) +plt.ylim([0.0, 1.05]) +plt.xlabel('False Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('An extension of ROC to multi-class ' + 'using One-vs-One') +plt.legend(bbox_to_anchor=(1.1, 0.30)) +plt.show() + +# Compute the One-vs-One ROC AUC score, weighted and unweighted +unweighted_roc_auc_ovo = roc_auc_score(y_test, y_score, multiclass="ovo") +weighted_roc_auc_ovo = roc_auc_score( + y_test, y_score, multiclass="ovo", average="weighted") +print("One-vs-One ROC AUC scores: {0} (unweighted), {1} (weighted)".format( + unweighted_roc_auc_ovo, weighted_roc_auc_ovo)) diff --git a/setup.cfg b/setup.cfg index f96e9cf9f85ab..45f28594f4006 100644 --- a/setup.cfg +++ b/setup.cfg @@ -8,6 +8,13 @@ addopts = --doctest-modules --disable-pytest-warnings +[tool:pytest] +# disable-pytest-warnings should be removed once we drop nose and we +# rewrite tests using yield with parametrize +addopts = + --doctest-modules + --disable-pytest-warnings + [wheelhouse_uploader] artifact_indexes= # OSX wheels built by travis (only for specific tags): diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 2c23fcc636deb..d301ebe289ecb 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -633,6 +633,7 @@ def load_linnerud(return_X_y=False): 'targets', the two multivariate datasets, with 'data' corresponding to the exercise and 'targets' corresponding to the physiological measurements, as well as 'feature_names' and 'target_names'. + In addition, you will also have access to 'data_filename', the physical location of linnerud data csv dataset, and 'target_filename', the physical location of diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 4c59b33e675e6..7b6acaaa191ce 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -173,6 +173,25 @@ def test_regressor_parameter_checks(): .fit, X, y) +def test_regressor_parameter_checks(): + # Check input parameter validation for GradientBoostingRegressor + assert_raise_message(ValueError, "alpha must be in (0.0, 1.0) but was 1.2", + GradientBoostingRegressor(loss='huber', alpha=1.2) + .fit, X, y) + assert_raise_message(ValueError, "alpha must be in (0.0, 1.0) but was 1.2", + GradientBoostingRegressor(loss='quantile', alpha=1.2) + .fit, X, y) + assert_raise_message(ValueError, "Invalid value for max_features: " + "'invalid'. Allowed string values are 'auto', 'sqrt'" + " or 'log2'.", + GradientBoostingRegressor(max_features='invalid').fit, + X, y) + assert_raise_message(ValueError, "n_iter_no_change should either be None" + " or an integer. 
'invalid' was passed", + GradientBoostingRegressor(n_iter_no_change='invalid') + .fit, X, y) + + def test_loss_function(): assert_raises(ValueError, GradientBoostingClassifier(loss='ls').fit, X, y) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index b8bbab30930b4..79ff07c7d9537 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -13,11 +13,13 @@ # License: BSD 3 clause from __future__ import division +import itertools import numpy as np from ..utils import check_array, check_consistent_length from ..utils.multiclass import type_of_target +from ..preprocessing import LabelBinarizer def _average_binary_score(binary_metric, y_true, y_score, average, @@ -33,7 +35,8 @@ def _average_binary_score(binary_metric, y_true, y_score, average, Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions. - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, + default 'macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -122,3 +125,127 @@ def _average_binary_score(binary_metric, y_true, y_score, average, return np.average(score, weights=average_weight) else: return score + + +def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): + """Uses the binary metric for one-vs-one multiclass classification, + where the score is computed according to the Hand & Till (2001) algorithm. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True multiclass labels. + Assumes labels have been recoded to 0 to n_classes. + + y_score : array, shape = [n_samples, n_classes] + Target scores corresponding to probability estimates of a sample + belonging to a particular class + + average : 'macro' or 'weighted', default='macro' + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the prevalence + of the classes. + + binary_metric : callable, the binary metric function to use. + Accepts the following as input + y_true_target : array, shape = [n_samples_target] + Some sub-array of y_true for a pair of classes designated + positive and negative in the one-vs-one scheme. 
+ y_score_target : array, shape = [n_samples_target] + Scores corresponding to the probability estimates + of a sample belonging to the designated positive class label + + Returns + ------- + score : float + Average the sum of pairwise binary metric scores + """ + n_classes = len(np.unique(y_true)) + n_pairs = n_classes * (n_classes - 1) // 2 + prevalence = np.empty(n_pairs) + pair_scores = np.empty(n_pairs) + + ix = 0 + for a, b in itertools.combinations(range(n_classes), 2): + a_mask = y_true == a + ab_mask = np.logical_or(a_mask, y_true == b) + + prevalence[ix] = np.sum(ab_mask) / len(y_true) + + y_score_filtered = y_score[ab_mask] + + a_true = a_mask[ab_mask] + b_true = np.logical_not(a_true) + + a_true_score = binary_metric( + a_true, y_score_filtered[:, a]) + b_true_score = binary_metric( + b_true, y_score_filtered[:, b]) + binary_avg_score = (a_true_score + b_true_score) / 2 + pair_scores[ix] = binary_avg_score + + ix += 1 + return (np.average(pair_scores, weights=prevalence) + if average == "weighted" else np.average(pair_scores)) + + +def _average_multiclass_ovr_score(binary_metric, y_true, y_score, average): + """Uses the binary metric for one-vs-rest multi-class classification, + where the score is computed according to the Provost & Domingos (2001) + definition of the AUC in multi-class settings (when `average` parameter is + set to `weighted`). + + For each class, the ROC curve is generated and the AUC computed. + The output is the average of the individual AUCs weighted by the prevalence + of the classes in the data. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True multiclass labels. + Assumes labels have been recoded to 0 to n_classes. + + y_score : array, shape = [n_samples, n_classes] + Target scores corresponding to probability estimates of a sample + belonging to a particular class. + + average : 'macro' or 'weighted', default='macro' + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the prevalence + of the classes in the dataset. + + binary_metric : callable, the binary metric function to use. + Accepts the following as input + y_true_target : array, shape = [n_samples_target] + Some sub-array of y_true for a pair of classes designated + positive and negative in the one-vs-one scheme. 
+ y_score_target : array, shape = [n_samples_target] + Scores corresponding to the probability estimates + of a sample belonging to the designated positive class label + + Returns + ------- + score : float + Average of binary metric scores + """ + n_classes = len(np.unique(y_true)) + scores = np.zeros((n_classes,)) + + y_true_multilabel = LabelBinarizer().fit_transform(y_true) + prevalence = np.sum(y_true_multilabel, axis=0) / y_true_multilabel.shape[0] + + for c in range(n_classes): + y_true_c = y_true_multilabel.take([c], axis=1).ravel() + y_score_c = y_score.take([c], axis=1).ravel() + scores[c] = binary_metric(y_true_c, y_score_c) + + return (np.average(scores, weights=prevalence) + if average == "weighted" else np.average(scores)) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 1d8d37954b99c..fe6289481a371 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -24,6 +24,7 @@ from scipy.sparse import csr_matrix from scipy.stats import rankdata +from ..preprocessing import LabelBinarizer from ..utils import assert_all_finite from ..utils import check_consistent_length from ..utils import column_or_1d, check_array @@ -33,7 +34,8 @@ from ..exceptions import UndefinedMetricWarning from ..preprocessing import label_binarize -from .base import _average_binary_score +from .base import _average_binary_score, _average_multiclass_ovo_score, \ + _average_multiclass_ovr_score def auc(x, y, reorder='deprecated'): @@ -157,7 +159,8 @@ def average_precision_score(y_true, y_score, average="macro", class, confidence values, or non-thresholded measure of decisions (as returned by "decision_function" on some classifiers). - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, + default 'macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -217,28 +220,39 @@ def _binary_uninterpolated_average_precision( sample_weight=sample_weight) -def roc_auc_score(y_true, y_score, average="macro", sample_weight=None): - """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) - from prediction scores. - - Note: this implementation is restricted to the binary classification task - or multilabel classification task in label indicator format. +def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", + sample_weight=None): + """Compute Area Under the Curve (AUC) from prediction scores Read more in the :ref:`User Guide `. Parameters ---------- y_true : array, shape = [n_samples] or [n_samples, n_classes] - True binary labels or binary label indicators. + True binary labels in binary label indicators. + The multiclass case expects shape = [n_samples] and labels + with values from 0 to (n_classes-1), inclusive. y_score : array, shape = [n_samples] or [n_samples, n_classes] Target scores, can either be probability estimates of the positive class, confidence values, or non-thresholded measure of decisions - (as returned by "decision_function" on some classifiers). For binary - y_true, y_score is supposed to be the score of the class with greater - label. - - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + (as returned by "decision_function" on some classifiers). + The multiclass case expects shape = [n_samples, n_classes] + where the scores correspond to probability estimates. 
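A usage sketch of the multiclass behaviour documented here. The multiclass keyword exists only on this branch, so the snippet assumes the patched roc_auc_score; the toy labels (coded 0 to n_classes-1) and row-normalised probabilities are invented for illustration.

import numpy as np
from sklearn.metrics import roc_auc_score  # patched version from this branch

y_true = np.array([0, 1, 0, 2])                  # labels coded 0..2
y_prob = np.array([[0.7, 0.2, 0.1],
                   [0.2, 0.6, 0.2],
                   [0.6, 0.3, 0.1],
                   [0.1, 0.2, 0.7]])             # each row sums to 1.0

# One-vs-rest: unweighted (macro) and prevalence-weighted averages
print(roc_auc_score(y_true, y_prob, multiclass="ovr"))
print(roc_auc_score(y_true, y_prob, multiclass="ovr", average="weighted"))

# One-vs-one (Hand and Till): unweighted and prevalence-weighted averages
print(roc_auc_score(y_true, y_prob, multiclass="ovo"))
print(roc_auc_score(y_true, y_prob, multiclass="ovo", average="weighted"))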
+ + multiclass : string, 'ovr' or 'ovo', default 'ovr' + Note: multiclass ROC AUC currently only handles the 'macro' and + 'weighted' averages. + + ``'ovr'``: + Calculate metrics for the multiclass case using the one-vs-rest + approach. + ``'ovo'``: + Calculate metrics for the multiclass case using the one-vs-one + approach. + + average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, + default 'macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -295,13 +309,51 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): return auc(fpr, tpr) y_type = type_of_target(y_true) - if y_type == "binary": - labels = np.unique(y_true) - y_true = label_binarize(y_true, labels)[:, 0] + y_true = check_array(y_true, ensure_2d=False) + y_score = check_array(y_score, ensure_2d=False) - return _average_binary_score( - _binary_roc_auc_score, y_true, y_score, average, - sample_weight=sample_weight) + if y_type == "multiclass" or (y_type == "binary" and + y_score.ndim == 2 and + y_score.shape[1] > 2): + # validation of the input y_score + if not np.allclose(1, y_score.sum(axis=1)): + raise ValueError("Target scores should sum up to 1.0 for all" + "samples.") + # validation for multiclass parameter specifications + average_options = ("macro", "weighted") + if average not in average_options: + raise ValueError("Parameter 'average' must be one of {0} for" + " multiclass problems.".format(average_options)) + multiclass_options = ("ovo", "ovr") + if multiclass not in multiclass_options: + raise ValueError("Parameter multiclass='{0}' is not supported" + " for multiclass ROC AUC. 'multiclass' must be" + " one of {1}.".format( + multiclass, multiclass_options)) + if sample_weight is not None: + # TODO: check if only in ovo case, if yes, do not raise when ovr + raise ValueError("Parameter 'sample_weight' is not supported" + " for multiclass one-vs-one ROC AUC." 
+ " 'sample_weight' must be None in this case.") + + if multiclass == "ovo": + # Hand & Till (2001) implementation + return _average_multiclass_ovo_score( + _binary_roc_auc_score, y_true, y_score, average) + elif multiclass == "ovr" and average == "weighted": + # Provost & Domingos (2001) implementation + return _average_multiclass_ovr_score( + _binary_roc_auc_score, y_true, y_score, average) + else: + y_true = y_true.reshape((-1, 1)) + y_true_multilabel = LabelBinarizer().fit_transform(y_true) + return _average_binary_score( + _binary_roc_auc_score, y_true_multilabel, y_score, average, + sample_weight=sample_weight) + else: + return _average_binary_score( + _binary_roc_auc_score, y_true, y_score, average, + sample_weight=sample_weight) def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index ad323a5483621..4f248a67b7b47 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -23,6 +23,7 @@ from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_true from sklearn.utils.testing import ignore_warnings +from sklearn.utils.testing import _named_check from sklearn.metrics import accuracy_score from sklearn.metrics import balanced_accuracy_score @@ -1045,9 +1046,11 @@ def test_sample_weight_invariance(n_samples=50): continue metric = ALL_METRICS[name] if name in THRESHOLDED_METRICS: - yield check_sample_weight_invariance, name, metric, y_true, y_score + yield _named_check(check_sample_weight_invariance, name), name,\ + metric, y_true, y_score else: - yield check_sample_weight_invariance, name, metric, y_true, y_pred + yield _named_check(check_sample_weight_invariance, name), name,\ + metric, y_true, y_pred # multiclass random_state = check_random_state(0) @@ -1062,9 +1065,11 @@ def test_sample_weight_invariance(n_samples=50): continue metric = ALL_METRICS[name] if name in THRESHOLDED_METRICS: - yield check_sample_weight_invariance, name, metric, y_true, y_score + yield _named_check(check_sample_weight_invariance, name), name,\ + metric, y_true, y_score else: - yield check_sample_weight_invariance, name, metric, y_true, y_pred + yield _named_check(check_sample_weight_invariance, name), name,\ + metric, y_true, y_pred # multilabel indicator _, ya = make_multilabel_classification(n_features=1, n_classes=20, diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index a17935ae7de17..f66c39fbe256b 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -442,6 +442,125 @@ def test_deprecated_auc_reorder(): [1, 2], [2, 3], reorder=True) +def test_multi_ovo_auc_toydata(): + # Tests the one-vs-one multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_true = np.array([0, 1, 0, 2]) + n_labels = len(np.unique(y_true)) + y_scores = np.array( + [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) + + # Used to compute the expected output. + # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1], [0.1, 0.3, 0.35]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0], [0.8, 0.4, 0.5]) + average_score_01 = (score_01 + score_10) / 2. 
+ + # Consider labels 0 and 2: + score_02 = roc_auc_score([1, 1, 0], [0.1, 0.35, 0]) + score_20 = roc_auc_score([0, 0, 1], [0.1, 0.15, 0.8]) + average_score_02 = (score_02 + score_20) / 2. + + # Consider labels 1 and 2: + score_12 = roc_auc_score([1, 0], [0.4, 0.2]) + score_21 = roc_auc_score([0, 1], [0.3, 0.8]) + average_score_12 = (score_12 + score_21) / 2. + + # Unweighted, one-vs-one multiclass ROC AUC algorithm + sum_avg_scores = average_score_01 + average_score_02 + average_score_12 + ovo_unweighted_coefficient = 2. / (n_labels * (n_labels - 1)) + ovo_unweighted_score = ovo_unweighted_coefficient * sum_avg_scores + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovo"), + ovo_unweighted_score) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + # Each term is weighted by the prevalence for the positive label. + pair_scores = [average_score_01, average_score_02, average_score_12] + prevalence = [0.75, 0.75, 0.50] + ovo_weighted_score = np.average(pair_scores, weights=prevalence) + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovo", average="weighted"), + ovo_weighted_score) + + +def test_multi_ovr_auc_toydata(): + # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_true = np.array([0, 1, 2, 2]) + y_scores = np.array( + [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]]) + # Compute the expected result by individually computing the 'one-vs-rest' + # ROC AUC scores for classes 0, 1, and 2. + out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0]) + out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1]) + out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2]) + result_unweighted = (out_0 + out_1 + out_2) / 3. + + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovr"), + result_unweighted) + + # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm + # on the same input (Provost & Domingos, 2001) + result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovr", average="weighted"), + result_weighted) + + +def test_multi_auc_score_under_permutation(): + y_score = np.random.rand(100, 3) + y_score[:, 2] += .1 + y_score[:, 1] -= .1 + y_true = np.argmax(y_score, axis=1) + y_true[np.random.randint(len(y_score), size=20)] = np.random.randint( + 2, size=20) + for multiclass in ['ovr', 'ovo']: + for average in ['macro', 'weighted']: + same_score_under_permutation = None + for perm in [[0, 1, 2], [0, 2, 1], [1, 0, 2], + [1, 2, 0], [2, 0, 1], [2, 1, 0]]: + inv_perm = np.zeros(3, dtype=int) + inv_perm[perm] = np.arange(3) + y_score_perm = y_score[:, inv_perm] + y_true_perm = np.take(perm, y_true) + score = roc_auc_score(y_true_perm, y_score_perm, + multiclass=multiclass, average=average) + if same_score_under_permutation is None: + same_score_under_permutation = score + else: + assert_almost_equal(score, same_score_under_permutation) + + +def test_auc_score_multi_error(): + # Test that roc_auc_score function returns an error when trying + # to compute multiclass AUC for parameters where an output + # is not defined. 
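The permutation test above relies on an inverse-permutation identity that is easy to misread: relabelling classes through perm while reindexing score columns through inv_perm keeps each sample's score aligned with its new label. A small standalone check with made-up numbers:

import numpy as np

perm = np.array([2, 0, 1])            # class c is renamed to perm[c]
inv_perm = np.zeros(3, dtype=int)
inv_perm[perm] = np.arange(3)         # inv_perm[perm[c]] == c

y_true = np.array([0, 1, 2, 1])
y_score = np.array([[0.6, 0.3, 0.1],
                    [0.2, 0.5, 0.3],
                    [0.1, 0.2, 0.7],
                    [0.3, 0.4, 0.3]])

y_true_perm = np.take(perm, y_true)   # relabelled targets
y_score_perm = y_score[:, inv_perm]   # columns follow the new labels

# Column perm[c] of the permuted scores equals column c of the originals,
# so every sample still carries the same score for its own class.
for c in range(3):
    assert np.allclose(y_score_perm[:, perm[c]], y_score[:, c])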
+ rng = check_random_state(404) + y_pred = rng.rand(10) + y_true = rng.randint(0, 3, size=10) + average_error_msg = ("Parameter 'average' must be one of " + "('macro', 'weighted') for multiclass problems.") + assert_raise_message(ValueError, average_error_msg, + roc_auc_score, y_true, y_pred, average="sample") + assert_raise_message(ValueError, average_error_msg, + roc_auc_score, y_true, y_pred, average="micro") + multiclass_error_msg = ("Parameter multiclass='invalid' is not " + "supported for multiclass ROC AUC. 'multiclass' " + "must be one of ('ovo', 'ovr').") + assert_raise_message(ValueError, multiclass_error_msg, + roc_auc_score, y_true, y_pred, multiclass="invalid") + sample_weight_error_msg = ("Parameter 'sample_weight' is not supported " + "for multiclass one-vs-one ROC AUC. " + "'sample_weight' must be None in this case.") + assert_raise_message(ValueError, sample_weight_error_msg, + roc_auc_score, y_true, y_pred, + multiclass="ovo", sample_weight=[]) + + def test_auc_score_non_binary_class(): # Test that roc_auc_score function returns an error when trying # to compute AUC for non-binary class values. @@ -457,10 +576,6 @@ def test_auc_score_non_binary_class(): y_true = -np.ones(10, dtype="int") assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) - # y_true contains three different class values - y_true = rng.randint(0, 3, size=10) - assert_raise_message(ValueError, "multiclass format is not supported", - roc_auc_score, y_true, y_pred) clean_warning_registry() with warnings.catch_warnings(record=True): @@ -477,11 +592,6 @@ def test_auc_score_non_binary_class(): assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) - # y_true contains three different class values - y_true = rng.randint(0, 3, size=10) - assert_raise_message(ValueError, "multiclass format is not supported", - roc_auc_score, y_true, y_pred) - def test_binary_clf_curve(): rng = check_random_state(404) @@ -491,6 +601,7 @@ def test_binary_clf_curve(): assert_raise_message(ValueError, msg, precision_recall_curve, y_true, y_pred) + def test_precision_recall_curve(): y_true, _, probas_pred = make_prediction(binary=True) _test_precision_recall_curve(y_true, probas_pred) @@ -675,7 +786,6 @@ def test_score_scale_invariance(): # issue #3864 (and others), where overly aggressive rounding was causing # problems for users with very small y_score values y_true, _, probas_pred = make_prediction(binary=True) - roc_auc = roc_auc_score(y_true, probas_pred) roc_auc_scaled_up = roc_auc_score(y_true, 100 * probas_pred) roc_auc_scaled_down = roc_auc_score(y_true, 1e-6 * probas_pred) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 9ab6b9df88ab5..14644ccf02207 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -704,6 +704,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False): # NOTE test_sample counts (weights) remain the same for all candidates test_sample_counts = np.array(test_sample_counts[:n_splits], dtype=np.int) + iid = self.iid if self.iid == 'warn': if len(np.unique(test_sample_counts)) > 1: diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index a9bedd53e2ef3..a0582a9c7132c 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -2062,6 +2062,7 @@ def train_test_split(*arrays, **options): # Tell nose that train_test_split is not a test train_test_split.__test__ = False + def 
_build_repr(self): # XXX This is copied from BaseEstimator's get_params cls = self.__class__ diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 74ab4503eb34a..93f243caad643 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2170,11 +2170,17 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): See also -------- quantile_transform : Equivalent function without the estimator API. +<<<<<<< HEAD + StandardScaler : perform standardization that is faster, but less robust + to outliers. + RobustScaler : perform robust standardization that removes the influence +======= PowerTransformer : Perform mapping to a normal distribution using a power transform. StandardScaler : Perform standardization that is faster, but less robust to outliers. RobustScaler : Perform robust standardization that removes the influence +>>>>>>> upstream/master of outliers but does not put outliers and inliers on the same scale. Notes @@ -2558,11 +2564,17 @@ def quantile_transform(X, axis=0, n_quantiles=1000, QuantileTransformer : Performs quantile-based scaling using the ``Transformer`` API (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`). +<<<<<<< HEAD + scale : perform standardization that is faster, but less robust + to outliers. + robust_scale : perform robust standardization that removes the influence +======= power_transform : Maps data to a normal distribution using a power transformation. scale : Performs standardization that is faster, but less robust to outliers. robust_scale : Performs robust standardization that removes the influence +>>>>>>> upstream/master of outliers but does not put outliers and inliers on the same scale. Notes diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 398c12cbddb42..101b9deb39363 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -34,7 +34,6 @@ from sklearn.utils.testing import assert_dict_equal from sklearn.discriminant_analysis import LinearDiscriminantAnalysis - from sklearn.base import (clone, TransformerMixin, ClusterMixin, BaseEstimator, is_classifier, is_regressor) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 3c81a2f86d35b..1e607deb0e51d 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -143,7 +143,6 @@ def sparse_min_max(X, axis): else: from scipy.sparse.linalg import lsqr as sparse_lsqr # noqa - try: # SciPy >= 0.19 from scipy.special import comb, logsumexp except ImportError: diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 7d17680c9cee9..43046129a042b 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -827,6 +827,7 @@ def check_docstring_parameters(func, doc=None, ignore=None, class_name=None): incorrect : list A list of string describing the incorrect results. """ + from numpydoc import docscrape incorrect = [] ignore = [] if ignore is None else ignore
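To close the loop on the one-vs-one path added in sklearn/metrics/base.py and exercised in test_ranking.py, here is a by-hand reconstruction of the unweighted Hand and Till average using only the released binary roc_auc_score. The toy arrays match the earlier usage sketch, so the result can be compared with roc_auc_score(y_true, y_prob, multiclass="ovo") on this branch.

from itertools import combinations

import numpy as np
from sklearn.metrics import roc_auc_score  # used only on binary sub-problems

y_true = np.array([0, 1, 0, 2])
y_prob = np.array([[0.7, 0.2, 0.1],
                   [0.2, 0.6, 0.2],
                   [0.6, 0.3, 0.1],
                   [0.1, 0.2, 0.7]])

pair_scores = []
for a, b in combinations(np.unique(y_true), 2):
    mask = np.isin(y_true, [a, b])                 # keep only classes a and b
    y_ab, p_ab = y_true[mask], y_prob[mask]
    auc_a = roc_auc_score(y_ab == a, p_ab[:, a])   # a as the positive class
    auc_b = roc_auc_score(y_ab == b, p_ab[:, b])   # b as the positive class
    pair_scores.append((auc_a + auc_b) / 2.0)      # AUC(a, b) averaged both ways

# Uniform average over the c(c-1)/2 unordered pairs (Hand and Till, 2001)
print(np.mean(pair_scores))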