diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst index a28652879cba9..b3a67af026a33 100644 --- a/doc/modules/multiclass.rst +++ b/doc/modules/multiclass.rst @@ -33,7 +33,8 @@ by decomposing such problems into binary classification problems. several joint classification tasks. This is a generalization of the multi-label classification task, where the set of classification problem is restricted to binary classification, and of the multi-class - classification task. *The output format is a 2d numpy array.* + classification task. *The output format is a 2d numpy array or sparse + matrix.* The set of labels can be different for each output variable. For instance a sample could be assigned "pear" for an output variable that diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 0f88a7fae08e2..ccdf733b6b0b2 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -32,14 +32,19 @@ # # License: BSD 3 clause +import array import numpy as np import warnings +import scipy.sparse as sp from .base import BaseEstimator, ClassifierMixin, clone, is_classifier from .base import MetaEstimatorMixin from .preprocessing import LabelBinarizer from .metrics.pairwise import euclidean_distances from .utils import check_random_state +from .utils.multiclass import type_of_target +from .utils.multiclass import unique_labels +from .utils.validation import _num_samples from .externals.joblib import Parallel from .externals.joblib import delayed @@ -81,24 +86,96 @@ def _check_estimator(estimator): def fit_ovr(estimator, X, y, n_jobs=1): - """Fit a one-vs-the-rest strategy.""" - _check_estimator(estimator) + """Fit a list of estimators using a one-vs-the-rest strategy. - lb = LabelBinarizer() - Y = lb.fit_transform(y) + Parameters + ---------- + estimator : estimator object + An estimator object implementing `fit` and one of `decision_function` + or `predict_proba`. 
- estimators = Parallel(n_jobs=n_jobs)( - delayed(_fit_binary)(estimator, X, Y[:, i], classes=["not %s" % i, i]) - for i in range(Y.shape[1])) + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + Data. + + y : {array-like, sparse matrix}, shape = [n_samples] or + [n_samples, n_classes] Multi-class targets. An indicator matrix + turns on multilabel classification. + + Returns + ------- + estimators, lb : fitted per-class estimators and the fitted LabelBinarizer + """ + _check_estimator(estimator) + # A sparse LabelBinarizer, with sparse_output=True, has been shown to + # outperform or match a dense label binarizer in all cases and has also + # resulted in less or equal memory consumption in the fit_ovr function + # overall. + lb = LabelBinarizer(sparse_output=True) + Y = lb.fit_transform(y) + Y = Y.tocsc() + columns = (col.toarray().ravel() for col in Y.T) + # In cases where individual estimators are very fast to train, setting + # n_jobs > 1 can result in slower performance due to the overhead + # of spawning threads. + estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) + (estimator, + X, + column, + classes=["not %s" % i, + lb.classes_[i]]) + for i, column in enumerate(columns)) return estimators, lb def predict_ovr(estimators, label_binarizer, X): - """Make predictions using the one-vs-the-rest strategy.""" - Y = np.array([_predict_binary(e, X) for e in estimators]) + """Predict multi-class targets using the one vs rest strategy. + + Parameters + ---------- + estimators : list of `n_classes` estimators, Estimators used for + predictions. The list must be homogeneous with respect to the type of + estimators. fit_ovr supplies this list as part of its output. + + label_binarizer : LabelBinarizer object, Object used to transform + multiclass labels to binary labels and vice-versa. fit_ovr supplies + this object as part of its output. + + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + Data. 
+ + Returns + ------- + y : {array-like, sparse matrix}, shape = [n_samples] or + [n_samples, n_classes]. Predicted multi-class targets. + """ + e_types = set([type(e) for e in estimators if not + isinstance(e, _ConstantPredictor)]) + if len(e_types) > 1: + raise ValueError("List of estimators must contain estimators of the" + " same type but contains types {0}".format(e_types)) e = estimators[0] thresh = 0 if hasattr(e, "decision_function") and is_classifier(e) else .5 - return label_binarizer.inverse_transform(Y.T, threshold=thresh) + + if label_binarizer.y_type_ == "multiclass": + maxima = np.empty(X.shape[0], dtype=float) + maxima.fill(-np.inf) + argmaxima = np.zeros(X.shape[0], dtype=int) + for i, e in enumerate(estimators): + pred = _predict_binary(e, X) + np.maximum(maxima, pred, out=maxima) + argmaxima[maxima == pred] = i + return label_binarizer.classes_[np.array(argmaxima.T)] + else: + n_samples = _num_samples(X) + indices = array.array('i') + indptr = array.array('i', [0]) + for e in estimators: + indices.extend(np.where(_predict_binary(e, X) > thresh)[0]) + indptr.append(len(indices)) + data = np.ones(len(indices), dtype=int) + indicator = sp.csc_matrix((data, indices, indptr), + shape=(n_samples, len(estimators))) + return label_binarizer.inverse_transform(indicator) def predict_proba_ovr(estimators, X, is_multilabel): @@ -190,9 +267,9 @@ def fit(self, X, y): X : {array-like, sparse matrix}, shape = [n_samples, n_features] Data. - y : array-like, shape = [n_samples] or [n_samples, n_classes] - Multi-class targets. An indicator matrix turns on multilabel - classification. + y : {array-like, sparse matrix}, shape = [n_samples] or + [n_samples, n_classes] Multi-class targets. An indicator matrix + turns on multilabel classification. Returns ------- @@ -216,8 +293,8 @@ def predict(self, X): Returns ------- - y : array-like, shape = [n_samples] - Predicted multi-class targets. 
+ y : {array-like, sparse matrix}, shape = [n_samples] or + [n_samples, n_classes]. Predicted multi-class targets. """ self._check_is_fitted() @@ -242,7 +319,7 @@ def predict_proba(self, X): Returns ------- - T : array-like, shape = [n_samples, n_classes] + T : {array-like, sparse matrix}, shape = [n_samples, n_classes] Returns the probability of the sample for each class in the model, where classes are ordered as they are in `self.classes_`. """ @@ -271,7 +348,7 @@ def decision_function(self, X): @property def multilabel_(self): """Whether this is a multilabel classifier""" - return self.label_binarizer_.multilabel_ + return self.label_binarizer_.y_type_.startswith('multilabel') def score(self, X, y): if self.multilabel_: diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 4c8102160a98b..5f6d7a0badd2e 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -201,6 +201,13 @@ class LabelBinarizer(BaseEstimator, TransformerMixin): `classes_` : array of shape [n_class] Holds the label for each class. + `y_type_` : str, + Represents the type of the target data as evaluated by + utils.multiclass.type_of_target. Possible types are 'continuous', + 'continuous-multioutput', 'binary', 'multiclass', + 'multiclass-multioutput', 'multilabel-sequences', + 'multilabel-indicator', and 'unknown'. + `multilabel_` : boolean True if the transformer was fitted on a multilabel rather than a multiclass set of labels. The multilabel_ attribute is deprecated @@ -301,6 +308,10 @@ def fit(self, y): self : returns an instance of self. 
""" self.y_type_ = type_of_target(y) + if 'multioutput' in self.y_type_: + raise ValueError("Multioutput target data is not supported with " + "label binarization") + self.sparse_input_ = sp.issparse(y) self.classes_ = unique_labels(y) return self @@ -462,6 +473,9 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, pos_label = -neg_label y_type = type_of_target(y) + if 'multioutput' in y_type: + raise ValueError("Multioutput target data is not supported with label " + "binarization") n_samples = y.shape[0] if sp.issparse(y) else len(y) n_classes = len(classes) @@ -517,6 +531,8 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, if pos_switch: Y[Y == pos_label] = 0 + else: + Y.data = astype(Y.data, int, copy=False) # preserve label ordering if np.any(classes != sorted_class): @@ -524,7 +540,10 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, Y = Y[:, indices] if y_type == "binary": - Y = Y[:, -1].reshape((-1, 1)) + if sparse_output: + Y = Y.getcol(-1) + else: + Y = Y[:, -1].reshape((-1, 1)) return Y @@ -600,6 +619,8 @@ def _inverse_binarize_thresholding(y, output_type, classes, threshold): # Inverse transform data if output_type == "binary": + if sp.issparse(y): + y = y.toarray() if y.ndim == 2 and y.shape[1] == 2: return classes[y[:, 1]] else: diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 2cc786605f380..826eaec9fc0a9 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -194,6 +194,11 @@ def test_label_binarizer_errors(): y=np.array([[1, 2, 3], [2, 1, 3]]), output_type="binary", classes=[1, 2, 3], threshold=0) + # Fail on multioutput data + assert_raises(ValueError, LabelBinarizer().fit, np.array([[1, 3], [2, 1]])) + assert_raises(ValueError, label_binarize, np.array([[1, 3], [2, 1]]), + [1, 2, 3]) + def test_label_encoder(): """Test LabelEncoder's transform and inverse_transform methods""" @@ -467,6 +472,15 @@ def 
test_label_binarize_binary(): yield check_binarized_results, y, classes, pos_label, neg_label, expected + # Binary case where sparse_output = True will not result in a ValueError + y = [0, 1, 0] + classes = [0, 1] + pos_label = 3 + neg_label = 0 + expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1)) + + yield check_binarized_results, y, classes, pos_label, neg_label, expected + def test_label_binarize_multiclass(): y = [0, 1, 2] diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 0b23951bc8d8f..4fef34a901bd8 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -1,5 +1,6 @@ import numpy as np import warnings +import scipy.sparse as sp from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_equal @@ -13,10 +14,14 @@ from sklearn.multiclass import OneVsRestClassifier from sklearn.multiclass import OneVsOneClassifier from sklearn.multiclass import OutputCodeClassifier +from sklearn.multiclass import predict_ovr +from sklearn.multiclass import fit_ovr from sklearn.metrics import precision_score from sklearn.metrics import recall_score +from sklearn.preprocessing import LabelBinarizer + from sklearn.svm import LinearSVC from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import (LinearRegression, Lasso, ElasticNet, Ridge, @@ -40,6 +45,17 @@ def test_ovr_exceptions(): ovr = OneVsRestClassifier(LinearSVC(random_state=0)) assert_raises(ValueError, ovr.predict, []) + assert_raises(ValueError, predict_ovr, [LinearSVC(), MultinomialNB()], + LabelBinarizer(), []) + + # Fail on multioutput data + assert_raises(ValueError, fit_ovr, MultinomialNB(), + np.array([[1, 0], [0, 1]]), + np.array([[1, 2], [3, 1]])) + assert_raises(ValueError, fit_ovr, MultinomialNB(), + np.array([[1, 0], [0, 1]]), + np.array([[1.5, 2.4], [3.1, 0.8]])) + def test_ovr_fit_predict(): # A classifier which implements decision_function. 
@@ -57,9 +73,49 @@ def test_ovr_fit_predict(): assert_greater(np.mean(iris.target == pred), 0.65) +def test_ovr_fit_predict_sparse(): + for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix, + sp.lil_matrix]: + base_clf = MultinomialNB(alpha=1) + + X, Y = datasets.make_multilabel_classification(n_samples=100, + n_features=20, + n_classes=5, + n_labels=3, + length=50, + allow_unlabeled=True, + return_indicator=True, + random_state=0) + + X_train, Y_train = X[:80], Y[:80] + X_test, Y_test = X[80:], Y[80:] + + clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) + Y_pred = clf.predict(X_test) + + clf_sprs = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train)) + Y_pred_sprs = clf_sprs.predict(X_test) + + assert_true(clf.multilabel_) + assert_true(sp.issparse(Y_pred_sprs)) + assert_array_equal(Y_pred_sprs.toarray(), Y_pred) + + # Test predict_proba + Y_proba = clf_sprs.predict_proba(X_test) + + # predict assigns a label if the probability that the + # sample has the label is greater than 0.5. + pred = Y_proba > .5 + assert_array_equal(pred, Y_pred_sprs.toarray()) + + # Test decision_function + clf_sprs = OneVsRestClassifier(svm.SVC()).fit(X_train, sparse(Y_train)) + dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int) + assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray()) + + def test_ovr_always_present(): - """Test that ovr works with classes that are always present or absent - """ + """Test that ovr works with classes that are always present or absent.""" # Note: tests is the case where _ConstantPredictor is utilised X = np.ones((10, 2)) X[:5, :] = 0 @@ -87,12 +143,62 @@ def test_ovr_always_present(): assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0])) +def test_ovr_multiclass(): + # Toy dataset where features correspond directly to labels. 
+ X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]]) + y = ["eggs", "spam", "ham", "eggs", "ham"] + Y = np.array([[0, 0, 1], + [0, 1, 0], + [1, 0, 0], + [0, 0, 1], + [1, 0, 0]]) + + classes = set("ham eggs spam".split()) + + for base_clf in (MultinomialNB(), LinearSVC(random_state=0), + LinearRegression(), Ridge(), + ElasticNet()): + + clf = OneVsRestClassifier(base_clf).fit(X, y) + assert_equal(set(clf.classes_), classes) + y_pred = clf.predict(np.array([[0, 0, 4]]))[0] + assert_equal(set(y_pred), set("eggs")) + + # test input as label indicator matrix + clf = OneVsRestClassifier(base_clf).fit(X, Y) + y_pred = clf.predict([[0, 0, 4]])[0] + assert_array_equal(y_pred, [0, 0, 1]) + + +def test_ovr_binary(): + # Toy dataset where features correspond directly to labels. + X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]]) + y = ["eggs", "spam", "spam", "eggs", "spam"] + Y = np.array([[0, 1, 1, 0, 1]]).T + + classes = set("eggs spam".split()) + + for base_clf in (MultinomialNB(), LinearSVC(random_state=0), + LinearRegression(), Ridge(), + ElasticNet()): + + clf = OneVsRestClassifier(base_clf).fit(X, y) + assert_equal(set(clf.classes_), classes) + y_pred = clf.predict(np.array([[0, 0, 4]]))[0] + assert_equal(set(y_pred), set("eggs")) + + # test input as label indicator matrix + clf = OneVsRestClassifier(base_clf).fit(X, Y) + y_pred = clf.predict([[3, 0, 0]])[0] + assert_equal(y_pred, 1) + + def test_ovr_multilabel(): # Toy dataset where features correspond directly to labels. 
X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]]) y = [["spam", "eggs"], ["spam"], ["ham", "eggs", "spam"], ["ham", "eggs"], ["ham"]] - #y = [[1, 2], [1], [0, 1, 2], [0, 2], [0]] + # y = [[1, 2], [1], [0, 1, 2], [0, 2], [0]] Y = np.array([[0, 1, 1], [0, 1, 0], [1, 1, 1], @@ -142,6 +248,7 @@ def test_ovr_multilabel_dataset(): X_test, Y_test = X[80:], Y[80:] clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) Y_pred = clf.predict(X_test) + assert_true(clf.multilabel_) assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"), prec, @@ -379,3 +486,8 @@ def test_ecoc_gridsearch(): cv.fit(iris.data, iris.target) best_C = cv.best_estimator_.estimators_[0].C assert_true(best_C in Cs) + + +if __name__ == "__main__": + import nose + nose.runmodule() diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index f3ee830c64f10..195836ccced2d 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -155,11 +155,10 @@ def assert_warns(warning_class, func, *args, **kw): raise AssertionError("No warning raised when calling %s" % func.__name__) - if not w[0].category is warning_class: - raise AssertionError("First warning for %s is not a " - "%s( is %s)" - % (func.__name__, warning_class, w[0])) - + found = any(warning.category is warning_class for warning in w) + if not found: + raise AssertionError("%s did not give warning: %s( is %s)" + % (func.__name__, warning_class, w)) return result @@ -579,6 +578,7 @@ def if_not_mac_os(versions=('10.7', '10.8', '10.9'), """ mac_version, _, _ = platform.mac_ver() skip = '.'.join(mac_version.split('.')[:2]) in versions + def decorator(func): if skip: @wraps(func)