diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 1acfe10280e86..fc66096aca45d 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -39,6 +39,12 @@ if [[ "$DISTRIB" == "conda" ]]; then # Configure the conda environment and put it in the path using the # provided versions + if [[ "$USE_PYTEST" == "true" ]]; then + TEST_RUNNER_PACKAGE=pytest + else + TEST_RUNNER_PACKAGE=nose + fi + if [[ "$INSTALL_MKL" == "true" ]]; then conda create -n testenv --yes python=$PYTHON_VERSION pip \ pytest pytest-cov numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION \ diff --git a/doc/README.md b/doc/README.md index 18d4bde4f5862..e60a66caf719c 100644 --- a/doc/README.md +++ b/doc/README.md @@ -1,5 +1,6 @@ # Documentation for scikit-learn + This directory contains the full manual and web site as displayed at http://scikit-learn.org. See http://scikit-learn.org/dev/developers/contributing.html#documentation for diff --git a/doc/datasets/conftest.py b/doc/datasets/conftest.py new file mode 100644 index 0000000000000..0ccc0bced9ee7 --- /dev/null +++ b/doc/datasets/conftest.py @@ -0,0 +1,75 @@ +from os.path import exists +from os.path import join + +import numpy as np + +from sklearn.utils.testing import SkipTest +from sklearn.utils.testing import check_skip_network +from sklearn.datasets import get_data_home +from sklearn.utils.testing import install_mldata_mock +from sklearn.utils.testing import uninstall_mldata_mock + + +def setup_labeled_faces(): + data_home = get_data_home() + if not exists(join(data_home, 'lfw_home')): + raise SkipTest("Skipping dataset loading doctests") + + +def setup_mldata(): + # setup mock urllib2 module to avoid downloading from mldata.org + install_mldata_mock({ + 'mnist-original': { + 'data': np.empty((70000, 784)), + 'label': np.repeat(np.arange(10, dtype='d'), 7000), + }, + 'iris': { + 'data': np.empty((150, 4)), + }, + 'datasets-uci-iris': { + 'double0': np.empty((150, 4)), + 'class': np.empty((150,)), + }, + }) + + +def teardown_mldata(): + uninstall_mldata_mock() + + +def setup_rcv1(): + check_skip_network() + # skip the test in rcv1.rst if the dataset is not already loaded + rcv1_dir = join(get_data_home(), "RCV1") + if not exists(rcv1_dir): + raise SkipTest("Download RCV1 dataset to run this test.") + + +def setup_twenty_newsgroups(): + data_home = get_data_home() + if not exists(join(data_home, '20news_home')): + raise SkipTest("Skipping dataset loading doctests") + + +def setup_working_with_text_data(): + check_skip_network() + + +def pytest_runtest_setup(item): + fname = item.fspath.strpath + if fname.endswith('datasets/labeled_faces.rst'): + setup_labeled_faces() + elif fname.endswith('datasets/mldata.rst'): + setup_mldata() + elif fname.endswith('datasets/rcv1.rst'): + setup_rcv1() + elif fname.endswith('datasets/twenty_newsgroups.rst'): + setup_twenty_newsgroups() + elif fname.endswith('datasets/working_with_text_data.rst'): + setup_working_with_text_data() + + +def pytest_runtest_teardown(item): + fname = item.fspath.strpath + if fname.endswith('datasets/mldata.rst'): + teardown_mldata() diff --git a/doc/datasets/mldata_fixture.py b/doc/datasets/mldata_fixture.py new file mode 100644 index 0000000000000..0ee5cccaa0f5e --- /dev/null +++ b/doc/datasets/mldata_fixture.py @@ -0,0 +1,30 @@ +"""Fixture module to skip the datasets loading when offline + +Mock urllib2 access to mldata.org and create a temporary data folder. 
+""" + +import numpy as np + +from sklearn.utils.testing import install_mldata_mock +from sklearn.utils.testing import uninstall_mldata_mock + + +def setup_module(): + # setup mock urllib2 module to avoid downloading from mldata.org + install_mldata_mock({ + 'mnist-original': { + 'data': np.empty((70000, 784)), + 'label': np.repeat(np.arange(10, dtype='d'), 7000), + }, + 'iris': { + 'data': np.empty((150, 4)), + }, + 'datasets-uci-iris': { + 'double0': np.empty((150, 4)), + 'class': np.empty((150,)), + }, + }) + + +def teardown_module(): + uninstall_mldata_mock() diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst index 9e970478dcf71..c95a996debb58 100644 --- a/doc/modules/clustering.rst +++ b/doc/modules/clustering.rst @@ -1150,7 +1150,7 @@ With :math:`P'(j) = |V_j| / N`. The mutual information (MI) between :math:`U` and :math:`V` is calculated by: .. math:: \text{MI}(U, V) = \sum_{i=1}^{|U|}\sum_{j=1}^{|V|}P(i, j)\log\left(\frac{P(i,j)}{P(i)P'(j)}\right) - + where :math:`P(i, j) = |U_i \cap V_j| / N` is the probability that an object picked at random falls into both classes :math:`U_i` and :math:`V_j`. diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 7505ec1819554..fb8d7bf5ee406 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -309,15 +309,16 @@ Some also work in the multilabel case: precision_recall_fscore_support precision_score recall_score + roc_auc_score zero_one_loss + And some work with binary and multilabel (but not multiclass) problems: .. autosummary:: :template: function.rst average_precision_score - roc_auc_score In the following sub-sections, we will describe each of those functions, @@ -1169,10 +1170,41 @@ In multi-label classification, the :func:`roc_auc_score` function is extended by averaging over the labels as :ref:`above `. Compared to metrics such as the subset accuracy, the Hamming loss, or the -F1 score, ROC doesn't require optimizing a threshold for each label. The -:func:`roc_auc_score` function can also be used in multi-class classification, -if the predicted outputs have been binarized. +F1 score, ROC doesn't require optimizing a threshold for each label. + +The :func:`roc_auc_score` function can also be used in multi-class +classification. [F2009]_ Two averaging strategies are currently supported: the +one-vs-one algorithm computes the average of the pairwise +ROC AUC scores, and the one-vs-rest algorithm +computes the average of the ROC AUC scores for each class against +all other classes. In both cases, the predicted class labels are provided in +an array with values from 0 to ``n_classes``, and the scores correspond to the +probability estimates that a sample belongs to a particular class. + +**One-vs-one Algorithm** +The AUC of each class against each other, computing +the AUC of all possible pairwise combinations :math:`c(c-1)` for a +:math:`c`-dimensional classifier. + +[HT2001]_ Using the uniform class distribution: + +.. math:: \frac{1}{c(c-1)}\sum_{j=1}^c\sum_{k \neq j}^c \textnormal{AUC}(j, k) + +[F2009]_ Weighted by the prevalence of classes `j` and `k`: + +.. math:: \frac{1}{c-1}\sum_{j=1}^c\sum_{k \neq j}^c p(j \cup k)\textnormal{AUC}(j, k) +**One-vs-rest Algorithm** +AUC of each class against the rest. This treats +a :math:`c`-dimensional classifier as :math:`c` two-dimensional classifiers. + +[F2006]_ Using the uniform class distribution: + +.. 
math:: \frac{\sum_{j=1}^c \textnormal{AUC}(j, \textnormal{rest}_j)}{c} + +[F2001]_ Weighted by the a priori class distribution: + +.. math:: \frac{\sum_{j=1}^c p(j)\textnormal{AUC}(j, \textnormal{rest}_j)}{c} .. image:: ../auto_examples/model_selection/images/sphx_glr_plot_roc_002.png :target: ../auto_examples/model_selection/plot_roc.html @@ -1193,6 +1225,24 @@ if the predicted outputs have been binarized. for an example of using ROC to model species distribution. +.. topic:: References: + + .. [F2001] Fawcett, T., 2001. `Using rule sets to maximize + ROC performance `_ + In Data Mining, 2001. + Proceedings IEEE International Conference, pp. 131-138. + .. [F2006] Fawcett, T., 2006. `An introduction to ROC analysis. + `_ + Pattern Recognition Letters, 27(8), pp. 861-874. + .. [F2009] Ferri, C., Hernandez-Orallo, J., and Modroiu, R., 2009. + `An experimental comparison of performance measures for classification. + `_ + Pattern Recognition Letters, 30(1), pp. 27-38. + .. [HT2001] Hand, D.J. and Till, R.J., 2001. `A simple generalisation + of the area under the ROC curve for multiple class classification problems. + `_ + Machine learning, 45(2), pp.171-186. + .. _zero_one_loss: Zero one loss diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst index 0b95cb168bf91..1c64290cbfb53 100644 --- a/doc/modules/multiclass.rst +++ b/doc/modules/multiclass.rst @@ -436,4 +436,4 @@ Regressor Chain Regressor chains (see :class:`RegressorChain`) is analogous to ClassifierChain as a way of combining a number of regressions into a single multi-target model that is capable of exploiting -correlations among targets. \ No newline at end of file +correlations among targets. diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index e21dbe4c1ada8..ee7c986318da4 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -302,6 +302,7 @@ This can be confirmed on a independent testing set with similar remarks:: ... # doctest: +ELLIPSIS +SKIP array([ 0.01..., 0.25..., 0.46..., 0.60... , 0.94...]) + Mapping to a Gaussian distribution ---------------------------------- diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index caae4c9a1645d..36afb823048df 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -67,6 +67,7 @@ Preprocessing :issue:`10210` by :user:`Eric Chang ` and :user:`Maniteja Nandana `. +>>>>>>> upstream/master Model evaluation diff --git a/examples/model_selection/plot_roc.py b/examples/model_selection/plot_roc.py index 475d7b4aba7a6..3a233eb5b79ae 100644 --- a/examples/model_selection/plot_roc.py +++ b/examples/model_selection/plot_roc.py @@ -19,16 +19,39 @@ ------------------- ROC curves are typically used in binary classification to study the output of -a classifier. In order to extend ROC curve and ROC area to multi-class -or multi-label classification, it is necessary to binarize the output. One ROC -curve can be drawn per label, but one can also draw a ROC curve by considering +a classifier. Extensions of ROC curve and ROC area to multi-class +or multi-label classification can use the One-vs-Rest or One-vs-One scheme. + +One-vs-Rest +----------- + +The output is binarized and one ROC curve is drawn per label, +where label is set to be the positive class and all other labels (the "rest") +are considered the negative class. + +The ROC area can be approximated by taking the average--unweighted or weighted +by the a priori class distribution--of the one-vs-rest ROC areas. 
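A minimal sketch of the two one-vs-rest averages defined above, computed with the existing binary roc_auc_score and label_binarize only; the toy labels and score matrix are invented for illustration and are not part of this patch.

import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

y_true = np.array([0, 1, 2, 2, 1, 0])
y_score = np.array([[0.7, 0.2, 0.1],
                    [0.2, 0.5, 0.3],
                    [0.1, 0.2, 0.7],
                    [0.2, 0.3, 0.5],
                    [0.3, 0.4, 0.3],
                    [0.6, 0.3, 0.1]])

classes = np.unique(y_true)
y_bin = label_binarize(y_true, classes=classes)

# AUC(j, rest_j): each class j against all the other classes pooled together
per_class_auc = np.array([roc_auc_score(y_bin[:, j], y_score[:, j])
                          for j in range(len(classes))])

# Uniform average over classes ([F2006]) and average weighted by the
# a priori class distribution p(j) ([F2001])
prevalence = y_bin.mean(axis=0)
uniform_ovr = per_class_auc.mean()
weighted_ovr = np.average(per_class_auc, weights=prevalence)
print(uniform_ovr, weighted_ovr)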
+ +One can also draw a ROC curve by considering each element of the label indicator matrix as a binary prediction (micro-averaging). -Another evaluation measure for multi-class classification is +Another evaluation measure for one-vs-rest multi-class classification is macro-averaging, which gives equal weight to the classification of each label. +One-vs-One +---------- + +Two ROC curves can be drawn per pair of labels because either of the two +labels can be considered the positive class (and the other the negative +class). The ROC area of a label pair is approximated taking the average of +these two ROC AUC scores. + +The One-vs-One approximation of a multi-class ROC AUC score is the average-- +unweighted or weighted by class prevalence--across all of the pairwise +approximate ROC AUC scores. + .. note:: See also :func:`sklearn.metrics.roc_auc_score`, @@ -39,10 +62,10 @@ import numpy as np import matplotlib.pyplot as plt -from itertools import cycle +from itertools import combinations, cycle from sklearn import svm, datasets -from sklearn.metrics import roc_curve, auc +from sklearn.metrics import roc_curve, auc, roc_auc_score from sklearn.model_selection import train_test_split from sklearn.preprocessing import label_binarize from sklearn.multiclass import OneVsRestClassifier @@ -53,9 +76,8 @@ X = iris.data y = iris.target -# Binarize the output -y = label_binarize(y, classes=[0, 1, 2]) -n_classes = y.shape[1] +classes = np.unique(y) +n_classes = len(classes) # Add noisy features to make the problem harder random_state = np.random.RandomState(0) @@ -72,17 +94,17 @@ y_score = classifier.fit(X_train, y_train).decision_function(X_test) # Compute ROC curve and ROC area for each class + +# Binarize y_test to compute the ROC curve +y_test_binarized = label_binarize(y_test, classes=classes) + fpr = dict() tpr = dict() roc_auc = dict() for i in range(n_classes): - fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) + fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_score[:, i]) roc_auc[i] = auc(fpr[i], tpr[i]) -# Compute micro-average ROC curve and ROC area -fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) -roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) - ############################################################################## # Plot of a ROC curve for a specific class @@ -101,7 +123,12 @@ ############################################################################## -# Plot ROC curves for the multiclass problem +# Plot ROC curves for the multiclass problem using One vs. Rest classification. 
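The micro- and macro-averaged curves used in this section are computed in a part of the example that this hunk elides. As a rough sketch only, reusing the fpr, tpr, roc_auc dicts and n_classes defined above, the macro-average interpolates every per-class curve onto a shared false-positive-rate grid and averages the true positive rates:

# Collect all FPR breakpoints, then interpolate each class's TPR on that grid
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
mean_tpr /= n_classes

fpr["macro"], tpr["macro"] = all_fpr, mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])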
+ +# Compute micro-average ROC curve and ROC area +fpr["micro"], tpr["micro"], _ = roc_curve( + y_test_binarized.ravel(), y_score.ravel()) +roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) # Compute macro-average ROC curve and ROC area @@ -143,6 +170,63 @@ plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') -plt.title('Some extension of Receiver operating characteristic to multi-class') +plt.title('An extension of ROC to multi-class ' + 'using One-vs-Rest') plt.legend(loc="lower right") plt.show() + +# Compute the One-vs-Rest ROC AUC score, weighted and unweighted +unweighted_roc_auc_ovr = roc_auc_score(y_test, y_score, multiclass="ovr") +weighted_roc_auc_ovr = roc_auc_score( + y_test, y_score, multiclass="ovr", average="weighted") +print("One-vs-Rest ROC AUC scores: {0} (unweighted), {1} (weighted)".format( + unweighted_roc_auc_ovr, weighted_roc_auc_ovr)) + +############################################################################## +# Plot ROC curves for the multiclass problem using One vs. One classification. + +for a, b in combinations(range(n_classes), 2): + # Filter `y_test` and `y_score` to only consider the current + # `a` and `b` class pair. + ab_mask = np.logical_or(y_test == a, y_test == b) + y_true_filtered = y_test[ab_mask] + y_score_filtered = y_score[ab_mask] + + # Compute ROC curve and ROC area with `a` as the positive class + class_a = y_true_filtered == a + fpr[(a, b)], tpr[(a, b)], _ = roc_curve( + class_a, y_score_filtered[:, a]) + roc_auc[(a, b)] = auc(fpr[(a, b)], tpr[(a, b)]) + + # Compute ROC curve and ROC area with `b` as the positive class + class_b = y_true_filtered == b + fpr[(b, a)], tpr[(b, a)], _ = roc_curve( + class_b, y_score_filtered[:, b]) + roc_auc[(b, a)] = auc(fpr[(b, a)], tpr[(b, a)]) + +plt.figure() +for a, b in combinations(range(n_classes), 2): + plt.plot(fpr[(a, b)], tpr[(a, b)], lw=lw, + label='ROC curve: class {0} vs. {1} ' + '(area = {2:0.2f})'.format( + a, b, roc_auc[(a, b)])) + plt.plot(fpr[(b, a)], tpr[(b, a)], lw=lw, + label='ROC curve: class {0} vs. 
{1} ' + '(area = {2:0.2f})'.format( + b, a, roc_auc[(b, a)])) +plt.plot([0, 1], [0, 1], 'k--', lw=lw) +plt.xlim([0.0, 1.0]) +plt.ylim([0.0, 1.05]) +plt.xlabel('False Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('An extension of ROC to multi-class ' + 'using One-vs-One') +plt.legend(bbox_to_anchor=(1.1, 0.30)) +plt.show() + +# Compute the One-vs-One ROC AUC score, weighted and unweighted +unweighted_roc_auc_ovo = roc_auc_score(y_test, y_score, multiclass="ovo") +weighted_roc_auc_ovo = roc_auc_score( + y_test, y_score, multiclass="ovo", average="weighted") +print("One-vs-One ROC AUC scores: {0} (unweighted), {1} (weighted)".format( + unweighted_roc_auc_ovo, weighted_roc_auc_ovo)) diff --git a/setup.cfg b/setup.cfg index f96e9cf9f85ab..45f28594f4006 100644 --- a/setup.cfg +++ b/setup.cfg @@ -8,6 +8,13 @@ addopts = --doctest-modules --disable-pytest-warnings +[tool:pytest] +# disable-pytest-warnings should be removed once we drop nose and we +# rewrite tests using yield with parametrize +addopts = + --doctest-modules + --disable-pytest-warnings + [wheelhouse_uploader] artifact_indexes= # OSX wheels built by travis (only for specific tags): diff --git a/sklearn/datasets/base.py b/sklearn/datasets/base.py index 2c23fcc636deb..d301ebe289ecb 100644 --- a/sklearn/datasets/base.py +++ b/sklearn/datasets/base.py @@ -633,6 +633,7 @@ def load_linnerud(return_X_y=False): 'targets', the two multivariate datasets, with 'data' corresponding to the exercise and 'targets' corresponding to the physiological measurements, as well as 'feature_names' and 'target_names'. + In addition, you will also have access to 'data_filename', the physical location of linnerud data csv dataset, and 'target_filename', the physical location of diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py index 4c59b33e675e6..7b6acaaa191ce 100644 --- a/sklearn/ensemble/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/tests/test_gradient_boosting.py @@ -173,6 +173,25 @@ def test_regressor_parameter_checks(): .fit, X, y) +def test_regressor_parameter_checks(): + # Check input parameter validation for GradientBoostingRegressor + assert_raise_message(ValueError, "alpha must be in (0.0, 1.0) but was 1.2", + GradientBoostingRegressor(loss='huber', alpha=1.2) + .fit, X, y) + assert_raise_message(ValueError, "alpha must be in (0.0, 1.0) but was 1.2", + GradientBoostingRegressor(loss='quantile', alpha=1.2) + .fit, X, y) + assert_raise_message(ValueError, "Invalid value for max_features: " + "'invalid'. Allowed string values are 'auto', 'sqrt'" + " or 'log2'.", + GradientBoostingRegressor(max_features='invalid').fit, + X, y) + assert_raise_message(ValueError, "n_iter_no_change should either be None" + " or an integer. 
'invalid' was passed", + GradientBoostingRegressor(n_iter_no_change='invalid') + .fit, X, y) + + def test_loss_function(): assert_raises(ValueError, GradientBoostingClassifier(loss='ls').fit, X, y) diff --git a/sklearn/metrics/base.py b/sklearn/metrics/base.py index b8bbab30930b4..79ff07c7d9537 100644 --- a/sklearn/metrics/base.py +++ b/sklearn/metrics/base.py @@ -13,11 +13,13 @@ # License: BSD 3 clause from __future__ import division +import itertools import numpy as np from ..utils import check_array, check_consistent_length from ..utils.multiclass import type_of_target +from ..preprocessing import LabelBinarizer def _average_binary_score(binary_metric, y_true, y_score, average, @@ -33,7 +35,8 @@ def _average_binary_score(binary_metric, y_true, y_score, average, Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions. - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, + default 'macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -122,3 +125,127 @@ def _average_binary_score(binary_metric, y_true, y_score, average, return np.average(score, weights=average_weight) else: return score + + +def _average_multiclass_ovo_score(binary_metric, y_true, y_score, average): + """Uses the binary metric for one-vs-one multiclass classification, + where the score is computed according to the Hand & Till (2001) algorithm. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True multiclass labels. + Assumes labels have been recoded to 0 to n_classes. + + y_score : array, shape = [n_samples, n_classes] + Target scores corresponding to probability estimates of a sample + belonging to a particular class + + average : 'macro' or 'weighted', default='macro' + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the prevalence + of the classes. + + binary_metric : callable, the binary metric function to use. + Accepts the following as input + y_true_target : array, shape = [n_samples_target] + Some sub-array of y_true for a pair of classes designated + positive and negative in the one-vs-one scheme. 
+ y_score_target : array, shape = [n_samples_target] + Scores corresponding to the probability estimates + of a sample belonging to the designated positive class label + + Returns + ------- + score : float + Average the sum of pairwise binary metric scores + """ + n_classes = len(np.unique(y_true)) + n_pairs = n_classes * (n_classes - 1) // 2 + prevalence = np.empty(n_pairs) + pair_scores = np.empty(n_pairs) + + ix = 0 + for a, b in itertools.combinations(range(n_classes), 2): + a_mask = y_true == a + ab_mask = np.logical_or(a_mask, y_true == b) + + prevalence[ix] = np.sum(ab_mask) / len(y_true) + + y_score_filtered = y_score[ab_mask] + + a_true = a_mask[ab_mask] + b_true = np.logical_not(a_true) + + a_true_score = binary_metric( + a_true, y_score_filtered[:, a]) + b_true_score = binary_metric( + b_true, y_score_filtered[:, b]) + binary_avg_score = (a_true_score + b_true_score) / 2 + pair_scores[ix] = binary_avg_score + + ix += 1 + return (np.average(pair_scores, weights=prevalence) + if average == "weighted" else np.average(pair_scores)) + + +def _average_multiclass_ovr_score(binary_metric, y_true, y_score, average): + """Uses the binary metric for one-vs-rest multi-class classification, + where the score is computed according to the Provost & Domingos (2001) + definition of the AUC in multi-class settings (when `average` parameter is + set to `weighted`). + + For each class, the ROC curve is generated and the AUC computed. + The output is the average of the individual AUCs weighted by the prevalence + of the classes in the data. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True multiclass labels. + Assumes labels have been recoded to 0 to n_classes. + + y_score : array, shape = [n_samples, n_classes] + Target scores corresponding to probability estimates of a sample + belonging to a particular class. + + average : 'macro' or 'weighted', default='macro' + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the prevalence + of the classes in the dataset. + + binary_metric : callable, the binary metric function to use. + Accepts the following as input + y_true_target : array, shape = [n_samples_target] + Some sub-array of y_true for a pair of classes designated + positive and negative in the one-vs-one scheme. 
+ y_score_target : array, shape = [n_samples_target] + Scores corresponding to the probability estimates + of a sample belonging to the designated positive class label + + Returns + ------- + score : float + Average of binary metric scores + """ + n_classes = len(np.unique(y_true)) + scores = np.zeros((n_classes,)) + + y_true_multilabel = LabelBinarizer().fit_transform(y_true) + prevalence = np.sum(y_true_multilabel, axis=0) / y_true_multilabel.shape[0] + + for c in range(n_classes): + y_true_c = y_true_multilabel.take([c], axis=1).ravel() + y_score_c = y_score.take([c], axis=1).ravel() + scores[c] = binary_metric(y_true_c, y_score_c) + + return (np.average(scores, weights=prevalence) + if average == "weighted" else np.average(scores)) diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py index 1d8d37954b99c..fe6289481a371 100644 --- a/sklearn/metrics/ranking.py +++ b/sklearn/metrics/ranking.py @@ -24,6 +24,7 @@ from scipy.sparse import csr_matrix from scipy.stats import rankdata +from ..preprocessing import LabelBinarizer from ..utils import assert_all_finite from ..utils import check_consistent_length from ..utils import column_or_1d, check_array @@ -33,7 +34,8 @@ from ..exceptions import UndefinedMetricWarning from ..preprocessing import label_binarize -from .base import _average_binary_score +from .base import _average_binary_score, _average_multiclass_ovo_score, \ + _average_multiclass_ovr_score def auc(x, y, reorder='deprecated'): @@ -157,7 +159,8 @@ def average_precision_score(y_true, y_score, average="macro", class, confidence values, or non-thresholded measure of decisions (as returned by "decision_function" on some classifiers). - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, + default 'macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -217,28 +220,39 @@ def _binary_uninterpolated_average_precision( sample_weight=sample_weight) -def roc_auc_score(y_true, y_score, average="macro", sample_weight=None): - """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) - from prediction scores. - - Note: this implementation is restricted to the binary classification task - or multilabel classification task in label indicator format. +def roc_auc_score(y_true, y_score, multiclass="ovr", average="macro", + sample_weight=None): + """Compute Area Under the Curve (AUC) from prediction scores Read more in the :ref:`User Guide `. Parameters ---------- y_true : array, shape = [n_samples] or [n_samples, n_classes] - True binary labels or binary label indicators. + True binary labels in binary label indicators. + The multiclass case expects shape = [n_samples] and labels + with values from 0 to (n_classes-1), inclusive. y_score : array, shape = [n_samples] or [n_samples, n_classes] Target scores, can either be probability estimates of the positive class, confidence values, or non-thresholded measure of decisions - (as returned by "decision_function" on some classifiers). For binary - y_true, y_score is supposed to be the score of the class with greater - label. - - average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + (as returned by "decision_function" on some classifiers). + The multiclass case expects shape = [n_samples, n_classes] + where the scores correspond to probability estimates. 
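A usage sketch of the multiclass behaviour documented here. The multiclass keyword exists only on this branch, so the snippet assumes the patched roc_auc_score; the toy labels (coded 0 to n_classes-1) and row-normalised probabilities are invented for illustration.

import numpy as np
from sklearn.metrics import roc_auc_score  # patched version from this branch

y_true = np.array([0, 1, 0, 2])                  # labels coded 0..2
y_prob = np.array([[0.7, 0.2, 0.1],
                   [0.2, 0.6, 0.2],
                   [0.6, 0.3, 0.1],
                   [0.1, 0.2, 0.7]])             # each row sums to 1.0

# One-vs-rest: unweighted (macro) and prevalence-weighted averages
print(roc_auc_score(y_true, y_prob, multiclass="ovr"))
print(roc_auc_score(y_true, y_prob, multiclass="ovr", average="weighted"))

# One-vs-one (Hand and Till): unweighted and prevalence-weighted averages
print(roc_auc_score(y_true, y_prob, multiclass="ovo"))
print(roc_auc_score(y_true, y_prob, multiclass="ovo", average="weighted"))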
+ + multiclass : string, 'ovr' or 'ovo', default 'ovr' + Note: multiclass ROC AUC currently only handles the 'macro' and + 'weighted' averages. + + ``'ovr'``: + Calculate metrics for the multiclass case using the one-vs-rest + approach. + ``'ovo'``: + Calculate metrics for the multiclass case using the one-vs-one + approach. + + average : string, {None, 'micro', 'macro', 'samples', 'weighted'}, + default 'macro' If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: @@ -295,13 +309,51 @@ def _binary_roc_auc_score(y_true, y_score, sample_weight=None): return auc(fpr, tpr) y_type = type_of_target(y_true) - if y_type == "binary": - labels = np.unique(y_true) - y_true = label_binarize(y_true, labels)[:, 0] + y_true = check_array(y_true, ensure_2d=False) + y_score = check_array(y_score, ensure_2d=False) - return _average_binary_score( - _binary_roc_auc_score, y_true, y_score, average, - sample_weight=sample_weight) + if y_type == "multiclass" or (y_type == "binary" and + y_score.ndim == 2 and + y_score.shape[1] > 2): + # validation of the input y_score + if not np.allclose(1, y_score.sum(axis=1)): + raise ValueError("Target scores should sum up to 1.0 for all" + "samples.") + # validation for multiclass parameter specifications + average_options = ("macro", "weighted") + if average not in average_options: + raise ValueError("Parameter 'average' must be one of {0} for" + " multiclass problems.".format(average_options)) + multiclass_options = ("ovo", "ovr") + if multiclass not in multiclass_options: + raise ValueError("Parameter multiclass='{0}' is not supported" + " for multiclass ROC AUC. 'multiclass' must be" + " one of {1}.".format( + multiclass, multiclass_options)) + if sample_weight is not None: + # TODO: check if only in ovo case, if yes, do not raise when ovr + raise ValueError("Parameter 'sample_weight' is not supported" + " for multiclass one-vs-one ROC AUC." 
+ " 'sample_weight' must be None in this case.") + + if multiclass == "ovo": + # Hand & Till (2001) implementation + return _average_multiclass_ovo_score( + _binary_roc_auc_score, y_true, y_score, average) + elif multiclass == "ovr" and average == "weighted": + # Provost & Domingos (2001) implementation + return _average_multiclass_ovr_score( + _binary_roc_auc_score, y_true, y_score, average) + else: + y_true = y_true.reshape((-1, 1)) + y_true_multilabel = LabelBinarizer().fit_transform(y_true) + return _average_binary_score( + _binary_roc_auc_score, y_true_multilabel, y_score, average, + sample_weight=sample_weight) + else: + return _average_binary_score( + _binary_roc_auc_score, y_true, y_score, average, + sample_weight=sample_weight) def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index ad323a5483621..4f248a67b7b47 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -23,6 +23,7 @@ from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_true from sklearn.utils.testing import ignore_warnings +from sklearn.utils.testing import _named_check from sklearn.metrics import accuracy_score from sklearn.metrics import balanced_accuracy_score @@ -1045,9 +1046,11 @@ def test_sample_weight_invariance(n_samples=50): continue metric = ALL_METRICS[name] if name in THRESHOLDED_METRICS: - yield check_sample_weight_invariance, name, metric, y_true, y_score + yield _named_check(check_sample_weight_invariance, name), name,\ + metric, y_true, y_score else: - yield check_sample_weight_invariance, name, metric, y_true, y_pred + yield _named_check(check_sample_weight_invariance, name), name,\ + metric, y_true, y_pred # multiclass random_state = check_random_state(0) @@ -1062,9 +1065,11 @@ def test_sample_weight_invariance(n_samples=50): continue metric = ALL_METRICS[name] if name in THRESHOLDED_METRICS: - yield check_sample_weight_invariance, name, metric, y_true, y_score + yield _named_check(check_sample_weight_invariance, name), name,\ + metric, y_true, y_score else: - yield check_sample_weight_invariance, name, metric, y_true, y_pred + yield _named_check(check_sample_weight_invariance, name), name,\ + metric, y_true, y_pred # multilabel indicator _, ya = make_multilabel_classification(n_features=1, n_classes=20, diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index a17935ae7de17..f66c39fbe256b 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -442,6 +442,125 @@ def test_deprecated_auc_reorder(): [1, 2], [2, 3], reorder=True) +def test_multi_ovo_auc_toydata(): + # Tests the one-vs-one multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_true = np.array([0, 1, 0, 2]) + n_labels = len(np.unique(y_true)) + y_scores = np.array( + [[0.1, 0.8, 0.1], [0.3, 0.4, 0.3], [0.35, 0.5, 0.15], [0, 0.2, 0.8]]) + + # Used to compute the expected output. + # Consider labels 0 and 1: + # positive label is 0, negative label is 1 + score_01 = roc_auc_score([1, 0, 1], [0.1, 0.3, 0.35]) + # positive label is 1, negative label is 0 + score_10 = roc_auc_score([0, 1, 0], [0.8, 0.4, 0.5]) + average_score_01 = (score_01 + score_10) / 2. 
+ + # Consider labels 0 and 2: + score_02 = roc_auc_score([1, 1, 0], [0.1, 0.35, 0]) + score_20 = roc_auc_score([0, 0, 1], [0.1, 0.15, 0.8]) + average_score_02 = (score_02 + score_20) / 2. + + # Consider labels 1 and 2: + score_12 = roc_auc_score([1, 0], [0.4, 0.2]) + score_21 = roc_auc_score([0, 1], [0.3, 0.8]) + average_score_12 = (score_12 + score_21) / 2. + + # Unweighted, one-vs-one multiclass ROC AUC algorithm + sum_avg_scores = average_score_01 + average_score_02 + average_score_12 + ovo_unweighted_coefficient = 2. / (n_labels * (n_labels - 1)) + ovo_unweighted_score = ovo_unweighted_coefficient * sum_avg_scores + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovo"), + ovo_unweighted_score) + + # Weighted, one-vs-one multiclass ROC AUC algorithm + # Each term is weighted by the prevalence for the positive label. + pair_scores = [average_score_01, average_score_02, average_score_12] + prevalence = [0.75, 0.75, 0.50] + ovo_weighted_score = np.average(pair_scores, weights=prevalence) + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovo", average="weighted"), + ovo_weighted_score) + + +def test_multi_ovr_auc_toydata(): + # Tests the unweighted, one-vs-rest multiclass ROC AUC algorithm + # on a small example, representative of an expected use case. + y_true = np.array([0, 1, 2, 2]) + y_scores = np.array( + [[1.0, 0.0, 0.0], [0.1, 0.5, 0.4], [0.1, 0.1, 0.8], [0.3, 0.3, 0.4]]) + # Compute the expected result by individually computing the 'one-vs-rest' + # ROC AUC scores for classes 0, 1, and 2. + out_0 = roc_auc_score([1, 0, 0, 0], y_scores[:, 0]) + out_1 = roc_auc_score([0, 1, 0, 0], y_scores[:, 1]) + out_2 = roc_auc_score([0, 0, 1, 1], y_scores[:, 2]) + result_unweighted = (out_0 + out_1 + out_2) / 3. + + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovr"), + result_unweighted) + + # Tests the weighted, one-vs-rest multiclass ROC AUC algorithm + # on the same input (Provost & Domingos, 2001) + result_weighted = out_0 * 0.25 + out_1 * 0.25 + out_2 * 0.5 + assert_almost_equal( + roc_auc_score(y_true, y_scores, multiclass="ovr", average="weighted"), + result_weighted) + + +def test_multi_auc_score_under_permutation(): + y_score = np.random.rand(100, 3) + y_score[:, 2] += .1 + y_score[:, 1] -= .1 + y_true = np.argmax(y_score, axis=1) + y_true[np.random.randint(len(y_score), size=20)] = np.random.randint( + 2, size=20) + for multiclass in ['ovr', 'ovo']: + for average in ['macro', 'weighted']: + same_score_under_permutation = None + for perm in [[0, 1, 2], [0, 2, 1], [1, 0, 2], + [1, 2, 0], [2, 0, 1], [2, 1, 0]]: + inv_perm = np.zeros(3, dtype=int) + inv_perm[perm] = np.arange(3) + y_score_perm = y_score[:, inv_perm] + y_true_perm = np.take(perm, y_true) + score = roc_auc_score(y_true_perm, y_score_perm, + multiclass=multiclass, average=average) + if same_score_under_permutation is None: + same_score_under_permutation = score + else: + assert_almost_equal(score, same_score_under_permutation) + + +def test_auc_score_multi_error(): + # Test that roc_auc_score function returns an error when trying + # to compute multiclass AUC for parameters where an output + # is not defined. 
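The permutation test above relies on an inverse-permutation identity that is easy to misread: relabelling classes through perm while reindexing score columns through inv_perm keeps each sample's score aligned with its new label. A small standalone check with made-up numbers:

import numpy as np

perm = np.array([2, 0, 1])            # class c is renamed to perm[c]
inv_perm = np.zeros(3, dtype=int)
inv_perm[perm] = np.arange(3)         # inv_perm[perm[c]] == c

y_true = np.array([0, 1, 2, 1])
y_score = np.array([[0.6, 0.3, 0.1],
                    [0.2, 0.5, 0.3],
                    [0.1, 0.2, 0.7],
                    [0.3, 0.4, 0.3]])

y_true_perm = np.take(perm, y_true)   # relabelled targets
y_score_perm = y_score[:, inv_perm]   # columns follow the new labels

# Column perm[c] of the permuted scores equals column c of the originals,
# so every sample still carries the same score for its own class.
for c in range(3):
    assert np.allclose(y_score_perm[:, perm[c]], y_score[:, c])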
+ rng = check_random_state(404) + y_pred = rng.rand(10) + y_true = rng.randint(0, 3, size=10) + average_error_msg = ("Parameter 'average' must be one of " + "('macro', 'weighted') for multiclass problems.") + assert_raise_message(ValueError, average_error_msg, + roc_auc_score, y_true, y_pred, average="sample") + assert_raise_message(ValueError, average_error_msg, + roc_auc_score, y_true, y_pred, average="micro") + multiclass_error_msg = ("Parameter multiclass='invalid' is not " + "supported for multiclass ROC AUC. 'multiclass' " + "must be one of ('ovo', 'ovr').") + assert_raise_message(ValueError, multiclass_error_msg, + roc_auc_score, y_true, y_pred, multiclass="invalid") + sample_weight_error_msg = ("Parameter 'sample_weight' is not supported " + "for multiclass one-vs-one ROC AUC. " + "'sample_weight' must be None in this case.") + assert_raise_message(ValueError, sample_weight_error_msg, + roc_auc_score, y_true, y_pred, + multiclass="ovo", sample_weight=[]) + + def test_auc_score_non_binary_class(): # Test that roc_auc_score function returns an error when trying # to compute AUC for non-binary class values. @@ -457,10 +576,6 @@ def test_auc_score_non_binary_class(): y_true = -np.ones(10, dtype="int") assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) - # y_true contains three different class values - y_true = rng.randint(0, 3, size=10) - assert_raise_message(ValueError, "multiclass format is not supported", - roc_auc_score, y_true, y_pred) clean_warning_registry() with warnings.catch_warnings(record=True): @@ -477,11 +592,6 @@ def test_auc_score_non_binary_class(): assert_raise_message(ValueError, "ROC AUC score is not defined", roc_auc_score, y_true, y_pred) - # y_true contains three different class values - y_true = rng.randint(0, 3, size=10) - assert_raise_message(ValueError, "multiclass format is not supported", - roc_auc_score, y_true, y_pred) - def test_binary_clf_curve(): rng = check_random_state(404) @@ -491,6 +601,7 @@ def test_binary_clf_curve(): assert_raise_message(ValueError, msg, precision_recall_curve, y_true, y_pred) + def test_precision_recall_curve(): y_true, _, probas_pred = make_prediction(binary=True) _test_precision_recall_curve(y_true, probas_pred) @@ -675,7 +786,6 @@ def test_score_scale_invariance(): # issue #3864 (and others), where overly aggressive rounding was causing # problems for users with very small y_score values y_true, _, probas_pred = make_prediction(binary=True) - roc_auc = roc_auc_score(y_true, probas_pred) roc_auc_scaled_up = roc_auc_score(y_true, 100 * probas_pred) roc_auc_scaled_down = roc_auc_score(y_true, 1e-6 * probas_pred) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 9ab6b9df88ab5..14644ccf02207 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -704,6 +704,7 @@ def _store(key_name, array, weights=None, splits=False, rank=False): # NOTE test_sample counts (weights) remain the same for all candidates test_sample_counts = np.array(test_sample_counts[:n_splits], dtype=np.int) + iid = self.iid if self.iid == 'warn': if len(np.unique(test_sample_counts)) > 1: diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index a9bedd53e2ef3..a0582a9c7132c 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -2062,6 +2062,7 @@ def train_test_split(*arrays, **options): # Tell nose that train_test_split is not a test train_test_split.__test__ = False + def 
_build_repr(self): # XXX This is copied from BaseEstimator's get_params cls = self.__class__ diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 74ab4503eb34a..93f243caad643 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2170,11 +2170,17 @@ class QuantileTransformer(BaseEstimator, TransformerMixin): See also -------- quantile_transform : Equivalent function without the estimator API. +<<<<<<< HEAD + StandardScaler : perform standardization that is faster, but less robust + to outliers. + RobustScaler : perform robust standardization that removes the influence +======= PowerTransformer : Perform mapping to a normal distribution using a power transform. StandardScaler : Perform standardization that is faster, but less robust to outliers. RobustScaler : Perform robust standardization that removes the influence +>>>>>>> upstream/master of outliers but does not put outliers and inliers on the same scale. Notes @@ -2558,11 +2564,17 @@ def quantile_transform(X, axis=0, n_quantiles=1000, QuantileTransformer : Performs quantile-based scaling using the ``Transformer`` API (e.g. as part of a preprocessing :class:`sklearn.pipeline.Pipeline`). +<<<<<<< HEAD + scale : perform standardization that is faster, but less robust + to outliers. + robust_scale : perform robust standardization that removes the influence +======= power_transform : Maps data to a normal distribution using a power transformation. scale : Performs standardization that is faster, but less robust to outliers. robust_scale : Performs robust standardization that removes the influence +>>>>>>> upstream/master of outliers but does not put outliers and inliers on the same scale. Notes diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 398c12cbddb42..101b9deb39363 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -34,7 +34,6 @@ from sklearn.utils.testing import assert_dict_equal from sklearn.discriminant_analysis import LinearDiscriminantAnalysis - from sklearn.base import (clone, TransformerMixin, ClusterMixin, BaseEstimator, is_classifier, is_regressor) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 3c81a2f86d35b..1e607deb0e51d 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -143,7 +143,6 @@ def sparse_min_max(X, axis): else: from scipy.sparse.linalg import lsqr as sparse_lsqr # noqa - try: # SciPy >= 0.19 from scipy.special import comb, logsumexp except ImportError: diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 7d17680c9cee9..43046129a042b 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -827,6 +827,7 @@ def check_docstring_parameters(func, doc=None, ignore=None, class_name=None): incorrect : list A list of string describing the incorrect results. """ + from numpydoc import docscrape incorrect = [] ignore = [] if ignore is None else ignore
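To close the loop on the one-vs-one path added in sklearn/metrics/base.py and exercised in test_ranking.py, here is a by-hand reconstruction of the unweighted Hand and Till average using only the released binary roc_auc_score. The toy arrays match the earlier usage sketch, so the result can be compared with roc_auc_score(y_true, y_prob, multiclass="ovo") on this branch.

from itertools import combinations

import numpy as np
from sklearn.metrics import roc_auc_score  # used only on binary sub-problems

y_true = np.array([0, 1, 0, 2])
y_prob = np.array([[0.7, 0.2, 0.1],
                   [0.2, 0.6, 0.2],
                   [0.6, 0.3, 0.1],
                   [0.1, 0.2, 0.7]])

pair_scores = []
for a, b in combinations(np.unique(y_true), 2):
    mask = np.isin(y_true, [a, b])                 # keep only classes a and b
    y_ab, p_ab = y_true[mask], y_prob[mask]
    auc_a = roc_auc_score(y_ab == a, p_ab[:, a])   # a as the positive class
    auc_b = roc_auc_score(y_ab == b, p_ab[:, b])   # b as the positive class
    pair_scores.append((auc_a + auc_b) / 2.0)      # AUC(a, b) averaged both ways

# Uniform average over the c(c-1)/2 unordered pairs (Hand and Till, 2001)
print(np.mean(pair_scores))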