From a4fe5b008142087b8acf7c3b6e8d64f9e7cc019b Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Thu, 12 Jun 2014 21:27:45 -0400 Subject: [PATCH 01/54] Modified sparse OvR to handle sparse target data --- sklearn/multiclass.py | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 0f88a7fae08e2..685e8e1dc86b5 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -34,6 +34,7 @@ import numpy as np import warnings +import scipy.sparse as sp from .base import BaseEstimator, ClassifierMixin, clone, is_classifier from .base import MetaEstimatorMixin @@ -84,21 +85,38 @@ def fit_ovr(estimator, X, y, n_jobs=1): """Fit a one-vs-the-rest strategy.""" _check_estimator(estimator) - lb = LabelBinarizer() - Y = lb.fit_transform(y) + lb = LabelBinarizer(sparse_output=True) + + ## debug + print(y) + ## + + Y = sp.csc_matrix(lb.fit_transform(y)) + + ## Debug + print(Y.toarray()) + ## estimators = Parallel(n_jobs=n_jobs)( - delayed(_fit_binary)(estimator, X, Y[:, i], classes=["not %s" % i, i]) - for i in range(Y.shape[1])) + delayed(_fit_binary)(estimator, X, Y.getcol(i).toarray(), + classes=["not %s" % i, i]) for i in range(Y.shape[1])) return estimators, lb def predict_ovr(estimators, label_binarizer, X): """Make predictions using the one-vs-the-rest strategy.""" - Y = np.array([_predict_binary(e, X) for e in estimators]) + e = estimators[0] thresh = 0 if hasattr(e, "decision_function") and is_classifier(e) else .5 - return label_binarizer.inverse_transform(Y.T, threshold=thresh) + + Y = sp.coo_matrix(np.array(_predict_binary(e, X) > thresh, dtype=np.int)) + + for e in estimators[1:]: + r = sp.coo_matrix(np.array(_predict_binary(e, X) > thresh, + dtype=np.int)) + Y = sp.vstack([Y, r]) + + return label_binarizer.inverse_transform(Y.T) def predict_proba_ovr(estimators, X, is_multilabel): From 8dbae826d86d1933048d04e7c68f9b357e34bcaf Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi 
Date: Thu, 12 Jun 2014 21:47:08 -0400 Subject: [PATCH 02/54] Progress comment --- sklearn/multiclass.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 685e8e1dc86b5..7ecc51a2fb7f0 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -109,6 +109,9 @@ def predict_ovr(estimators, label_binarizer, X): e = estimators[0] thresh = 0 if hasattr(e, "decision_function") and is_classifier(e) else .5 + # XXX: this needs to handle multiclass correctly, a copy of + # inverse_transform is needed where only the thresholding behavior is used + Y = sp.coo_matrix(np.array(_predict_binary(e, X) > thresh, dtype=np.int)) for e in estimators[1:]: From 4662b9552927c4d3f3560e408ca7971012b094d6 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Fri, 13 Jun 2014 09:21:06 -0400 Subject: [PATCH 03/54] Introduce multiclass behavior into predict_ovr, minimal testing in test_multiclass --- sklearn/multiclass.py | 30 +++++++++++++++++++----------- sklearn/tests/test_multiclass.py | 20 ++++++++++++++++++++ 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 7ecc51a2fb7f0..862607af12354 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -88,13 +88,13 @@ def fit_ovr(estimator, X, y, n_jobs=1): lb = LabelBinarizer(sparse_output=True) ## debug - print(y) + #print(y) ## Y = sp.csc_matrix(lb.fit_transform(y)) ## Debug - print(Y.toarray()) + #print(Y.toarray()) ## estimators = Parallel(n_jobs=n_jobs)( @@ -109,17 +109,25 @@ def predict_ovr(estimators, label_binarizer, X): e = estimators[0] thresh = 0 if hasattr(e, "decision_function") and is_classifier(e) else .5 - # XXX: this needs to handle multiclass correctly, a copy of - # inverse_transform is needed where only the thresholding behavior is used + if label_binarizer.y_type_ == "multiclass": + Y = np.array([]) + for x in X: + x_scores = np.array([]) + for e in estimators: + x_scores = np.append(x_scores, 
_predict_binary(e, x)) + c = label_binarizer.classes_[x_scores.argmax()] + Y = np.append(Y,c) + return Y - Y = sp.coo_matrix(np.array(_predict_binary(e, X) > thresh, dtype=np.int)) - - for e in estimators[1:]: - r = sp.coo_matrix(np.array(_predict_binary(e, X) > thresh, + else: + Y = sp.coo_matrix(np.array(_predict_binary(e, X) > thresh, dtype=np.int)) - Y = sp.vstack([Y, r]) - - return label_binarizer.inverse_transform(Y.T) + for e in estimators[1:]: + r = sp.coo_matrix(np.array(_predict_binary(e, X) > thresh, + dtype=np.int)) + Y = sp.vstack([Y, r]) + print(Y) + return label_binarizer.inverse_transform(Y.T, threshold=0.5) def predict_proba_ovr(estimators, X, is_multilabel): diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 0b23951bc8d8f..498750f19ef0a 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -1,5 +1,6 @@ import numpy as np import warnings +import scipy.sparse as sp from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_equal @@ -45,10 +46,16 @@ def test_ovr_fit_predict(): # A classifier which implements decision_function. ovr = OneVsRestClassifier(LinearSVC(random_state=0)) pred = ovr.fit(iris.data, iris.target).predict(iris.data) + print(iris.target) + + print(pred) assert_equal(len(ovr.estimators_), n_classes) clf = LinearSVC(random_state=0) pred2 = clf.fit(iris.data, iris.target).predict(iris.data) + print(pred2) + print(np.mean(iris.target == pred)) + print(np.mean(iris.target == pred2)) assert_equal(np.mean(iris.target == pred), np.mean(iris.target == pred2)) # A classifier which implements predict_proba. 
@@ -142,6 +149,15 @@ def test_ovr_multilabel_dataset(): X_test, Y_test = X[80:], Y[80:] clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) Y_pred = clf.predict(X_test) + + ## XXX + print(Y) + clf = OneVsRestClassifier(base_clf).fit(X_train, sp.coo_matrix(Y_train)) + Y_pred = clf.predict(X_test) + assert_true(sp.issparse(Y_pred)) + Y_pred = Y_pred.toarray() + ## + assert_true(clf.multilabel_) assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"), prec, @@ -379,3 +395,7 @@ def test_ecoc_gridsearch(): cv.fit(iris.data, iris.target) best_C = cv.best_estimator_.estimators_[0].C assert_true(best_C in Cs) + +if __name__ == "__main__": + import nose + nose.runmodule() \ No newline at end of file From 0771ba06caf3157d48c1b337a710a70ac7e19772 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Fri, 13 Jun 2014 16:29:00 -0400 Subject: [PATCH 04/54] Initialized sparse output in fit_ovr LabelBinarizer to be sparse if Y is sparse, removed debug statements --- sklearn/multiclass.py | 25 +++++++++++-------------- sklearn/tests/test_multiclass.py | 14 -------------- 2 files changed, 11 insertions(+), 28 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 862607af12354..bd55e7d99cb86 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -85,21 +85,19 @@ def fit_ovr(estimator, X, y, n_jobs=1): """Fit a one-vs-the-rest strategy.""" _check_estimator(estimator) - lb = LabelBinarizer(sparse_output=True) - - ## debug - #print(y) - ## + lb = LabelBinarizer(sparse_output=sp.issparse(y)) - Y = sp.csc_matrix(lb.fit_transform(y)) + Y = lb.fit_transform(y) - ## Debug - #print(Y.toarray()) - ## + if sp.issparse(Y): + estimators = Parallel(n_jobs=n_jobs)( + delayed(_fit_binary)(estimator, X, Y.getcol(i).toarray(), + classes=["not %s" % i, i]) for i in range(Y.shape[1])) + else: + estimators = Parallel(n_jobs=n_jobs)( + delayed(_fit_binary)(estimator, X, Y[:, i], + classes=["not %s" % i, i]) for i in range(Y.shape[1])) - estimators = 
Parallel(n_jobs=n_jobs)( - delayed(_fit_binary)(estimator, X, Y.getcol(i).toarray(), - classes=["not %s" % i, i]) for i in range(Y.shape[1])) return estimators, lb @@ -117,7 +115,7 @@ def predict_ovr(estimators, label_binarizer, X): x_scores = np.append(x_scores, _predict_binary(e, x)) c = label_binarizer.classes_[x_scores.argmax()] Y = np.append(Y,c) - return Y + return Y.T else: Y = sp.coo_matrix(np.array(_predict_binary(e, X) > thresh, @@ -126,7 +124,6 @@ def predict_ovr(estimators, label_binarizer, X): r = sp.coo_matrix(np.array(_predict_binary(e, X) > thresh, dtype=np.int)) Y = sp.vstack([Y, r]) - print(Y) return label_binarizer.inverse_transform(Y.T, threshold=0.5) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 498750f19ef0a..6234fd6846c61 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -46,16 +46,10 @@ def test_ovr_fit_predict(): # A classifier which implements decision_function. ovr = OneVsRestClassifier(LinearSVC(random_state=0)) pred = ovr.fit(iris.data, iris.target).predict(iris.data) - print(iris.target) - - print(pred) assert_equal(len(ovr.estimators_), n_classes) clf = LinearSVC(random_state=0) pred2 = clf.fit(iris.data, iris.target).predict(iris.data) - print(pred2) - print(np.mean(iris.target == pred)) - print(np.mean(iris.target == pred2)) assert_equal(np.mean(iris.target == pred), np.mean(iris.target == pred2)) # A classifier which implements predict_proba. 
@@ -150,14 +144,6 @@ def test_ovr_multilabel_dataset(): clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) Y_pred = clf.predict(X_test) - ## XXX - print(Y) - clf = OneVsRestClassifier(base_clf).fit(X_train, sp.coo_matrix(Y_train)) - Y_pred = clf.predict(X_test) - assert_true(sp.issparse(Y_pred)) - Y_pred = Y_pred.toarray() - ## - assert_true(clf.multilabel_) assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"), prec, From 2a5ebfb3d8411d1cd17f906a176033f1d884e295 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Fri, 13 Jun 2014 17:45:42 -0400 Subject: [PATCH 05/54] Enforced correct dtype in predict_ovr --- sklearn/multiclass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index bd55e7d99cb86..90613ee175e76 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -115,7 +115,7 @@ def predict_ovr(estimators, label_binarizer, X): x_scores = np.append(x_scores, _predict_binary(e, x)) c = label_binarizer.classes_[x_scores.argmax()] Y = np.append(Y,c) - return Y.T + return np.array(Y.T, dtype=label_binarizer.classes_.dtype) else: Y = sp.coo_matrix(np.array(_predict_binary(e, X) > thresh, From 1275ce5a18bfa1e141bfbca4ae91d196561e6669 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 17 Jun 2014 22:28:44 -0400 Subject: [PATCH 06/54] Included first test for sparse ovr --- sklearn/tests/test_multiclass.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 6234fd6846c61..8614fb9f33b60 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -57,6 +57,27 @@ def test_ovr_fit_predict(): pred = ovr.fit(iris.data, iris.target).predict(iris.data) assert_greater(np.mean(iris.target == pred), 0.65) +def test_ovr_fit_predict_sparse(): + for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix, + sp.lil_matrix]: + # A classifier which 
implements decision_function. + ovr = OneVsRestClassifier(LinearSVC(random_state=0)) + pred = ovr.fit(iris.data, + sparse(iris.target)).predict(sparse(iris.data)) + assert_equal(len(ovr.estimators_), n_classes) + assert_true(sp.issparse(pred)) + + clf = LinearSVC(random_state=0) + pred2 = clf.fit(iris.data, iris.target).predict(iris.data) + assert_equal(np.mean(iris.target == pred.toarray()), + np.mean(iris.target == pred2)) + + # A classifier which implements predict_proba. + ovr = OneVsRestClassifier(MultinomialNB()) + pred = ovr.fit(iris.data, iris.target).predict(iris.data) + assert_greater(np.mean(iris.target == pred), 0.65) + + def test_ovr_always_present(): """Test that ovr works with classes that are always present or absent From c668fc77a2b947a73c2340421392eda2c865499e Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 17 Jun 2014 22:30:07 -0400 Subject: [PATCH 07/54] Included first test for sparse ovr --- sklearn/tests/test_multiclass.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 8614fb9f33b60..c584df1535623 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -57,19 +57,20 @@ def test_ovr_fit_predict(): pred = ovr.fit(iris.data, iris.target).predict(iris.data) assert_greater(np.mean(iris.target == pred), 0.65) + def test_ovr_fit_predict_sparse(): for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix, sp.lil_matrix]: # A classifier which implements decision_function. 
ovr = OneVsRestClassifier(LinearSVC(random_state=0)) - pred = ovr.fit(iris.data, + pred = ovr.fit(iris.data, sparse(iris.target)).predict(sparse(iris.data)) assert_equal(len(ovr.estimators_), n_classes) assert_true(sp.issparse(pred)) clf = LinearSVC(random_state=0) pred2 = clf.fit(iris.data, iris.target).predict(iris.data) - assert_equal(np.mean(iris.target == pred.toarray()), + assert_equal(np.mean(iris.target == pred.toarray()), np.mean(iris.target == pred2)) # A classifier which implements predict_proba. @@ -78,7 +79,6 @@ def test_ovr_fit_predict_sparse(): assert_greater(np.mean(iris.target == pred), 0.65) - def test_ovr_always_present(): """Test that ovr works with classes that are always present or absent """ @@ -405,4 +405,4 @@ def test_ecoc_gridsearch(): if __name__ == "__main__": import nose - nose.runmodule() \ No newline at end of file + nose.runmodule() From 94d7d0400b00e221d3e1ca38e9f84a9d0d5a051f Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Wed, 18 Jun 2014 22:33:10 -0400 Subject: [PATCH 08/54] Revised sparse ovr predict test --- sklearn/tests/test_multiclass.py | 38 ++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index c584df1535623..c2a70aaeb3049 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -61,22 +61,28 @@ def test_ovr_fit_predict(): def test_ovr_fit_predict_sparse(): for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix, sp.lil_matrix]: - # A classifier which implements decision_function. 
- ovr = OneVsRestClassifier(LinearSVC(random_state=0)) - pred = ovr.fit(iris.data, - sparse(iris.target)).predict(sparse(iris.data)) - assert_equal(len(ovr.estimators_), n_classes) - assert_true(sp.issparse(pred)) - - clf = LinearSVC(random_state=0) - pred2 = clf.fit(iris.data, iris.target).predict(iris.data) - assert_equal(np.mean(iris.target == pred.toarray()), - np.mean(iris.target == pred2)) - - # A classifier which implements predict_proba. - ovr = OneVsRestClassifier(MultinomialNB()) - pred = ovr.fit(iris.data, iris.target).predict(iris.data) - assert_greater(np.mean(iris.target == pred), 0.65) + base_clf = MultinomialNB(alpha=1) + for au, prec, recall in zip((True, False), (0.65, 0.74), (0.72, 0.84)): + make_mlb = datasets.make_multilabel_classification + X, Y = make_mlb(n_samples=100, + n_features=20, + n_classes=5, + n_labels=2, + length=50, + allow_unlabeled=au, + return_indicator=True, + random_state=0) + + X_train, Y_train = X[:80], Y[:80] + X_test, Y_test = X[80:], Y[80:] + clf = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train)) + Y_pred = clf.predict(X_test) + + assert_true(clf.multilabel_) + assert_almost_equal(precision_score(Y_test, Y_pred.toarray(), + average="micro"), prec, decimal=2) + assert_almost_equal(recall_score(Y_test, Y_pred.toarray(), + average="micro"), recall, decimal=2) def test_ovr_always_present(): From 63aab298afe8201495392676e6ce06375f463e33 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Thu, 19 Jun 2014 11:29:54 -0400 Subject: [PATCH 09/54] Implemented construction of csc_matrix by column indices in predict_ovr --- sklearn/multiclass.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 90613ee175e76..8e0a8c65d483d 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -32,6 +32,7 @@ # # License: BSD 3 clause +import array import numpy as np import warnings import scipy.sparse as sp @@ -91,11 +92,11 @@ def 
fit_ovr(estimator, X, y, n_jobs=1): if sp.issparse(Y): estimators = Parallel(n_jobs=n_jobs)( - delayed(_fit_binary)(estimator, X, Y.getcol(i).toarray(), + delayed(_fit_binary)(estimator, X, Y.getcol(i).toarray(), classes=["not %s" % i, i]) for i in range(Y.shape[1])) else: estimators = Parallel(n_jobs=n_jobs)( - delayed(_fit_binary)(estimator, X, Y[:, i], + delayed(_fit_binary)(estimator, X, Y[:, i], classes=["not %s" % i, i]) for i in range(Y.shape[1])) return estimators, lb @@ -114,17 +115,19 @@ def predict_ovr(estimators, label_binarizer, X): for e in estimators: x_scores = np.append(x_scores, _predict_binary(e, x)) c = label_binarizer.classes_[x_scores.argmax()] - Y = np.append(Y,c) + Y = np.append(Y, c) return np.array(Y.T, dtype=label_binarizer.classes_.dtype) else: - Y = sp.coo_matrix(np.array(_predict_binary(e, X) > thresh, - dtype=np.int)) - for e in estimators[1:]: - r = sp.coo_matrix(np.array(_predict_binary(e, X) > thresh, - dtype=np.int)) - Y = sp.vstack([Y, r]) - return label_binarizer.inverse_transform(Y.T, threshold=0.5) + indices = array.array('i') + indptr = array.array('i', [0]) + for e in estimators: + indices.extend(np.where(_predict_binary(e, X) > thresh)[0]) + indptr.append(len(indices)) + data = np.ones(len(indices), dtype=int) + indicator = sp.csc_matrix((data, indices, indptr), + shape=(len(X), len(estimators))) + return label_binarizer.inverse_transform(indicator) def predict_proba_ovr(estimators, X, is_multilabel): From 7c51b1a3fff3202996a5484f402bf109e10a6e4e Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 24 Jun 2014 14:44:34 -0400 Subject: [PATCH 10/54] Revised formating and indentations --- sklearn/multiclass.py | 19 ++++++++++++------- sklearn/tests/test_multiclass.py | 3 ++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 8e0a8c65d483d..b2259a78ff5ee 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -91,14 +91,19 @@ def fit_ovr(estimator, X, 
y, n_jobs=1): Y = lb.fit_transform(y) if sp.issparse(Y): - estimators = Parallel(n_jobs=n_jobs)( - delayed(_fit_binary)(estimator, X, Y.getcol(i).toarray(), - classes=["not %s" % i, i]) for i in range(Y.shape[1])) + estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) + (estimator, + X, + Y.getcol(i).toarray(), + classes=["not %s" % i, i]) + for i in range(Y.shape[1])) else: - estimators = Parallel(n_jobs=n_jobs)( - delayed(_fit_binary)(estimator, X, Y[:, i], - classes=["not %s" % i, i]) for i in range(Y.shape[1])) - + estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) + (estimator, + X, + Y[:, i], + classes=["not %s" % i, i]) + for i in range(Y.shape[1])) return estimators, lb diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index c2a70aaeb3049..cd526174bb936 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -120,7 +120,7 @@ def test_ovr_multilabel(): X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]]) y = [["spam", "eggs"], ["spam"], ["ham", "eggs", "spam"], ["ham", "eggs"], ["ham"]] - #y = [[1, 2], [1], [0, 1, 2], [0, 2], [0]] + # y = [[1, 2], [1], [0, 1, 2], [0, 2], [0]] Y = np.array([[0, 1, 1], [0, 1, 0], [1, 1, 1], @@ -409,6 +409,7 @@ def test_ecoc_gridsearch(): best_C = cv.best_estimator_.estimators_[0].C assert_true(best_C in Cs) + if __name__ == "__main__": import nose nose.runmodule() From bba115adf66f19015c52a74df56ba97685ab5c06 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 24 Jun 2014 16:18:31 -0400 Subject: [PATCH 11/54] Revised test_ovr_fit_predict_sparse to ensure identical results from sparsely and densely trained classifiers --- sklearn/tests/test_multiclass.py | 44 +++++++++++++++++--------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index cd526174bb936..1253cef519e29 100644 --- a/sklearn/tests/test_multiclass.py +++ 
b/sklearn/tests/test_multiclass.py @@ -62,27 +62,29 @@ def test_ovr_fit_predict_sparse(): for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix, sp.lil_matrix]: base_clf = MultinomialNB(alpha=1) - for au, prec, recall in zip((True, False), (0.65, 0.74), (0.72, 0.84)): - make_mlb = datasets.make_multilabel_classification - X, Y = make_mlb(n_samples=100, - n_features=20, - n_classes=5, - n_labels=2, - length=50, - allow_unlabeled=au, - return_indicator=True, - random_state=0) - - X_train, Y_train = X[:80], Y[:80] - X_test, Y_test = X[80:], Y[80:] - clf = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train)) - Y_pred = clf.predict(X_test) - - assert_true(clf.multilabel_) - assert_almost_equal(precision_score(Y_test, Y_pred.toarray(), - average="micro"), prec, decimal=2) - assert_almost_equal(recall_score(Y_test, Y_pred.toarray(), - average="micro"), recall, decimal=2) + + make_mlb = datasets.make_multilabel_classification + X, Y = make_mlb(n_samples=100, + n_features=20, + n_classes=5, + n_labels=3, + length=50, + allow_unlabeled=True, + return_indicator=True, + random_state=0) + + X_train, Y_train = X[:80], Y[:80] + X_test, Y_test = X[80:], Y[80:] + + clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) + Y_pred = clf.predict(X_test) + + clf_sprs = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train)) + Y_pred_sprs = clf_sprs.predict(X_test) + + assert_true(clf.multilabel_) + assert_true(sp.issparse(Y_pred_sprs)) + assert_array_equal(Y_pred_sprs.toarray(), Y_pred) def test_ovr_always_present(): From 53431387fc6edaa8ddd2e445dab0cc5798f9c9c2 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 24 Jun 2014 19:24:23 -0400 Subject: [PATCH 12/54] Revised predict_ovr to loop over estimators in the multiclass case --- sklearn/multiclass.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index b2259a78ff5ee..8617b93c7332d 100644 --- 
a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -114,15 +114,14 @@ def predict_ovr(estimators, label_binarizer, X): thresh = 0 if hasattr(e, "decision_function") and is_classifier(e) else .5 if label_binarizer.y_type_ == "multiclass": - Y = np.array([]) - for x in X: - x_scores = np.array([]) - for e in estimators: - x_scores = np.append(x_scores, _predict_binary(e, x)) - c = label_binarizer.classes_[x_scores.argmax()] - Y = np.append(Y, c) - return np.array(Y.T, dtype=label_binarizer.classes_.dtype) - + maxima = np.empty(X.shape[0], dtype=float) + maxima.fill(-np.inf) + argmaxima = np.zeros(X.shape[0], dtype=int) + for i, e in enumerate(estimators): + pred = _predict_binary(e, X) + np.maximum(maxima, pred, out=maxima) + argmaxima[maxima == pred] = i + return np.array(argmaxima.T, dtype=label_binarizer.classes_.dtype) else: indices = array.array('i') indptr = array.array('i', [0]) From b3860cd95ef1c677ed519675eba868fd8a8d88e4 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 24 Jun 2014 19:30:35 -0400 Subject: [PATCH 13/54] Removed blank line --- sklearn/multiclass.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 8617b93c7332d..0acebca84ca5c 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -109,7 +109,6 @@ def fit_ovr(estimator, X, y, n_jobs=1): def predict_ovr(estimators, label_binarizer, X): """Make predictions using the one-vs-the-rest strategy.""" - e = estimators[0] thresh = 0 if hasattr(e, "decision_function") and is_classifier(e) else .5 From b4920309d12d749f7d343d132ed4973dc2ccb734 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Thu, 26 Jun 2014 17:11:11 -0400 Subject: [PATCH 14/54] Defaulted label binarizer to set sparse_output=True when training ovr classifiers, edited Label binarizer to allow for sparse binary column output --- sklearn/multiclass.py | 4 ++-- sklearn/preprocessing/label.py | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git 
a/sklearn/multiclass.py b/sklearn/multiclass.py index 0acebca84ca5c..46c0de7337aea 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -86,7 +86,7 @@ def fit_ovr(estimator, X, y, n_jobs=1): """Fit a one-vs-the-rest strategy.""" _check_estimator(estimator) - lb = LabelBinarizer(sparse_output=sp.issparse(y)) + lb = LabelBinarizer(sparse_output=True) Y = lb.fit_transform(y) @@ -94,7 +94,7 @@ def fit_ovr(estimator, X, y, n_jobs=1): estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) (estimator, X, - Y.getcol(i).toarray(), + Y.getcol(i).toarray().ravel(), classes=["not %s" % i, i]) for i in range(Y.shape[1])) else: diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 4c8102160a98b..d91db1038d637 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -517,6 +517,8 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, if pos_switch: Y[Y == pos_label] = 0 + else: + Y.data = astype(Y.data, int, copy=False) # preserve label ordering if np.any(classes != sorted_class): @@ -524,7 +526,10 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, Y = Y[:, indices] if y_type == "binary": - Y = Y[:, -1].reshape((-1, 1)) + if sparse_output: + Y = Y.getcol(-1) + else: + Y = Y[:, -1].reshape((-1, 1)) return Y @@ -600,6 +605,8 @@ def _inverse_binarize_thresholding(y, output_type, classes, threshold): # Inverse transform data if output_type == "binary": + if sp.issparse(y): + y = y.toarray() if y.ndim == 2 and y.shape[1] == 2: return classes[y[:, 1]] else: From 9a3c831be6533a846279eb8eeeaf6242fcd7f8dd Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Fri, 27 Jun 2014 11:50:45 -0400 Subject: [PATCH 15/54] Revised predict_ovr to work with non integer labels --- sklearn/multiclass.py | 6 ++++-- sklearn/tests/test_multiclass.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 46c0de7337aea..715aa86e6e30e 
100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -86,7 +86,9 @@ def fit_ovr(estimator, X, y, n_jobs=1): """Fit a one-vs-the-rest strategy.""" _check_estimator(estimator) - lb = LabelBinarizer(sparse_output=True) + # XXX benchmark one line vs the other + lb = LabelBinarizer(sparse_output=sp.issparse(y)) + # lb = LabelBinarizer(sparse_output=sp.issparse(y)) Y = lb.fit_transform(y) @@ -120,7 +122,7 @@ def predict_ovr(estimators, label_binarizer, X): pred = _predict_binary(e, X) np.maximum(maxima, pred, out=maxima) argmaxima[maxima == pred] = i - return np.array(argmaxima.T, dtype=label_binarizer.classes_.dtype) + return label_binarizer.classes_[np.array(argmaxima.T)] else: indices = array.array('i') indptr = array.array('i', [0]) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 1253cef519e29..ecbd1aa2d3c5b 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -117,6 +117,34 @@ def test_ovr_always_present(): assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0])) +def test_ovr_multiclass(): + # Toy dataset where features correspond directly to labels. 
+ X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]]) + y = ["eggs", "spam", "ham", "eggs", "ham"] + # y = [[1, 2], [1], [0, 1, 2], [0, 2], [0]] + Y = np.array([[0, 0, 1], + [0, 1, 0], + [1, 0, 0], + [0, 0, 1], + [1, 0, 0]]) + + classes = set("ham eggs spam".split()) + + for base_clf in (MultinomialNB(), LinearSVC(random_state=0), + LinearRegression(), Ridge(), + ElasticNet(), Lasso(alpha=0.5)): + + clf = OneVsRestClassifier(base_clf).fit(X, y) + assert_equal(set(clf.classes_), classes) + y_pred = clf.predict(np.array([[0, 0, 4]]))[0] + assert_equal(set(y_pred), set("eggs")) + + # test input as label indicator matrix + clf = OneVsRestClassifier(base_clf).fit(X, Y) + y_pred = clf.predict([[0, 0, 4]])[0] + assert_array_equal(y_pred, [0, 0, 1]) + + def test_ovr_multilabel(): # Toy dataset where features correspond directly to labels. X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]]) From 312e108c47c4c131174b7fa885cf81f3621a39f0 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Fri, 27 Jun 2014 11:55:15 -0400 Subject: [PATCH 16/54] Fixed type, sp.issparse(y) -> True --- sklearn/multiclass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 715aa86e6e30e..b28564f4a1ac7 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -87,7 +87,7 @@ def fit_ovr(estimator, X, y, n_jobs=1): _check_estimator(estimator) # XXX benchmark one line vs the other - lb = LabelBinarizer(sparse_output=sp.issparse(y)) + lb = LabelBinarizer(sparse_output=True) # lb = LabelBinarizer(sparse_output=sp.issparse(y)) Y = lb.fit_transform(y) From ee4a715b7a4bf6fb0284d07ba298471d597c4f09 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Fri, 27 Jun 2014 12:29:03 -0400 Subject: [PATCH 17/54] Attempt to avoid a sparse efficiency warning by not converting csc matrices in label_binarize --- sklearn/preprocessing/label.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff 
--git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index d91db1038d637..db38998e60daf 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -492,7 +492,14 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, shape=(n_samples, n_classes)) elif y_type == "multilabel-indicator": - Y = sp.csr_matrix(y) + # XXX simplify this conditional path + if sp.issparse(y): + if y.format not in ('csr', 'csc'): + Y = sp.csr_matrix(y) + else: + Y = y + else: + Y = sp.csr_matrix(y) if pos_label != 1: data = np.empty_like(Y.data) data.fill(pos_label) From b2d0f1e83d1d4ffaf80592b3b57e568248c07218 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Fri, 27 Jun 2014 14:07:32 -0400 Subject: [PATCH 18/54] Revised csc sparse matrix case in label_binarize in attempt to avoid reformatting --- sklearn/preprocessing/label.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index db38998e60daf..0d40af55498f0 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -492,12 +492,8 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, shape=(n_samples, n_classes)) elif y_type == "multilabel-indicator": - # XXX simplify this conditional path - if sp.issparse(y): - if y.format not in ('csr', 'csc'): - Y = sp.csr_matrix(y) - else: - Y = y + if sp.issparse(y) and y.format == 'csc': + Y = sp.csc_matrix(y) else: Y = sp.csr_matrix(y) if pos_label != 1: From 8e2f9a2e56814b6bd5a1cb9993ef97a1e951118e Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Fri, 27 Jun 2014 20:40:12 -0400 Subject: [PATCH 19/54] Cast sparse array to csc in fit_ovr and wrote a get_col_ helper, removed unnecessary csc_cast from label_binarizer --- sklearn/multiclass.py | 10 +++++++++- sklearn/preprocessing/label.py | 5 +---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 
b28564f4a1ac7..0cd3089042332 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -93,10 +93,11 @@ def fit_ovr(estimator, X, y, n_jobs=1): Y = lb.fit_transform(y) if sp.issparse(Y): + Y = Y.tocsc() estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) (estimator, X, - Y.getcol(i).toarray().ravel(), + get_col_(Y, i), classes=["not %s" % i, i]) for i in range(Y.shape[1])) else: @@ -109,6 +110,13 @@ def fit_ovr(estimator, X, y, n_jobs=1): return estimators, lb +def get_col_(Y, i): + """Y is CSC matrix, i is the column index. Returns the dense column.""" + c = np.zeros(Y.shape[0], dtype=int) + c[Y.indices[Y.indptr[i]:Y.indptr[i+1]]] = Y.data[Y.indptr[i]:Y.indptr[i+1]] + return c + + def predict_ovr(estimators, label_binarizer, X): """Make predictions using the one-vs-the-rest strategy.""" e = estimators[0] diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 0d40af55498f0..d91db1038d637 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -492,10 +492,7 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, shape=(n_samples, n_classes)) elif y_type == "multilabel-indicator": - if sp.issparse(y) and y.format == 'csc': - Y = sp.csc_matrix(y) - else: - Y = sp.csr_matrix(y) + Y = sp.csr_matrix(y) if pos_label != 1: data = np.empty_like(Y.data) data.fill(pos_label) From 4f82b6691a180787618de8da8ec3ab13e6048614 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Mon, 30 Jun 2014 16:17:02 -0400 Subject: [PATCH 20/54] Implemented tests for predict_proba and decision_function with a classifier fit on sparse target data, also made some formatting revisions in multiclass.py --- sklearn/multiclass.py | 25 +++++++++++-------------- sklearn/tests/test_multiclass.py | 13 +++++++++++++ 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 0cd3089042332..de2e6f58e4f31 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -94,23 
+94,20 @@ def fit_ovr(estimator, X, y, n_jobs=1): if sp.issparse(Y): Y = Y.tocsc() - estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) - (estimator, - X, - get_col_(Y, i), - classes=["not %s" % i, i]) - for i in range(Y.shape[1])) + columns = [_get_col(Y, i) for i in range(Y.shape[1])] else: - estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) - (estimator, - X, - Y[:, i], - classes=["not %s" % i, i]) - for i in range(Y.shape[1])) + columns = Y.T + estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) + (estimator, + X, + column, + classes=["not %s" % i, i]) + for i, column in enumerate(columns)) + return estimators, lb -def get_col_(Y, i): +def _get_col(Y, i): """Y is CSC matrix, i is the column index. Returns the dense column.""" c = np.zeros(Y.shape[0], dtype=int) c[Y.indices[Y.indptr[i]:Y.indptr[i+1]]] = Y.data[Y.indptr[i]:Y.indptr[i+1]] @@ -313,7 +310,7 @@ def decision_function(self, X): @property def multilabel_(self): """Whether this is a multilabel classifier""" - return self.label_binarizer_.multilabel_ + return self.label_binarizer_.y_type_.startswith('multilabel') def score(self, X, y): if self.multilabel_: diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index ecbd1aa2d3c5b..1abec23b0d0db 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -86,6 +86,19 @@ def test_ovr_fit_predict_sparse(): assert_true(sp.issparse(Y_pred_sprs)) assert_array_equal(Y_pred_sprs.toarray(), Y_pred) + # Test predict_proba + Y_proba = clf_sprs.predict_proba(X_test) + + # predict assigns a label if the probability that the + # sample has the label is greater than 0.5. 
+ pred = Y_proba > .5 + assert_array_equal(pred, Y_pred_sprs.toarray()) + + # Test decision_function + clf_sprs = OneVsRestClassifier(svm.SVC()).fit(X_train, sparse(Y_train)) + dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int) + assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray()) + def test_ovr_always_present(): """Test that ovr works with classes that are always present or absent From 0ae6dec2a6322c02e7e6df05d7407eb1d20eb45d Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Mon, 30 Jun 2014 16:32:43 -0400 Subject: [PATCH 21/54] Restarting OrthogonalMatchingPursuitCV failure --- sklearn/multiclass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index de2e6f58e4f31..3856f890a73f6 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -83,7 +83,7 @@ def _check_estimator(estimator): def fit_ovr(estimator, X, y, n_jobs=1): - """Fit a one-vs-the-rest strategy.""" + """Fit a one-vs-the-rest strategy""" _check_estimator(estimator) # XXX benchmark one line vs the other From 6e5c3ae6f312f671d6e58ae3994d30f0674d3801 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 1 Jul 2014 18:46:37 -0400 Subject: [PATCH 22/54] Measured len of X in predict_ovr to allow for sparse data --- sklearn/multiclass.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 3856f890a73f6..e6b8916e30e89 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -129,6 +129,7 @@ def predict_ovr(estimators, label_binarizer, X): argmaxima[maxima == pred] = i return label_binarizer.classes_[np.array(argmaxima.T)] else: + len_X = X.shape[0] if sp.issparse(X) else len(X) indices = array.array('i') indptr = array.array('i', [0]) for e in estimators: @@ -136,7 +137,7 @@ def predict_ovr(estimators, label_binarizer, X): indptr.append(len(indices)) data = np.ones(len(indices), dtype=int) indicator = sp.csc_matrix((data, indices, 
indptr), - shape=(len(X), len(estimators))) + shape=(len_X, len(estimators))) return label_binarizer.inverse_transform(indicator) From 6b9b53e2cffa28844ca604623fac18168eb3a457 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 1 Jul 2014 19:43:17 -0400 Subject: [PATCH 23/54] Revised column stacking in fit_ovr to be a generator expression --- sklearn/multiclass.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index e6b8916e30e89..026728933a3a3 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -86,15 +86,13 @@ def fit_ovr(estimator, X, y, n_jobs=1): """Fit a one-vs-the-rest strategy""" _check_estimator(estimator) - # XXX benchmark one line vs the other lb = LabelBinarizer(sparse_output=True) - # lb = LabelBinarizer(sparse_output=sp.issparse(y)) Y = lb.fit_transform(y) if sp.issparse(Y): Y = Y.tocsc() - columns = [_get_col(Y, i) for i in range(Y.shape[1])] + columns = (_get_col(Y, i) for i in range(Y.shape[1])) else: columns = Y.T estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) From b35671a2756250b39ba70e392a385b37e9fe9092 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 1 Jul 2014 20:16:06 -0400 Subject: [PATCH 24/54] Restartinig travis MD5 sums mismatch --- sklearn/multiclass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 026728933a3a3..12e3ebe8514e0 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -83,7 +83,7 @@ def _check_estimator(estimator): def fit_ovr(estimator, X, y, n_jobs=1): - """Fit a one-vs-the-rest strategy""" + """Fit a one-vs-the-rest strategy.""" _check_estimator(estimator) lb = LabelBinarizer(sparse_output=True) From c766dd6d0151f2ec0376d01775c62f16125d9c8b Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Wed, 2 Jul 2014 18:01:16 -0400 Subject: [PATCH 25/54] Tested label binarizer with a sparse_output=True binary case --- 
sklearn/preprocessing/tests/test_label.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 2cc786605f380..273c5997f91c8 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -467,6 +467,15 @@ def test_label_binarize_binary(): yield check_binarized_results, y, classes, pos_label, neg_label, expected + # Binary case where sparse_output = True will no result in a Value Error + y = [0, 1, 0] + classes = [0, 1] + pos_label = 3 + neg_label = 0 + expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1)) + + yield check_binarized_results, y, classes, pos_label, neg_label, expected + def test_label_binarize_multiclass(): y = [0, 1, 2] From 55cea431f3716ce729b92ba291f2b842c350cce1 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Sat, 5 Jul 2014 13:08:18 -0400 Subject: [PATCH 26/54] len_X -> n_samples --- sklearn/multiclass.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 12e3ebe8514e0..0fdb6977d5d50 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -127,7 +127,7 @@ def predict_ovr(estimators, label_binarizer, X): argmaxima[maxima == pred] = i return label_binarizer.classes_[np.array(argmaxima.T)] else: - len_X = X.shape[0] if sp.issparse(X) else len(X) + n_samples = X.shape[0] if sp.issparse(X) else len(X) indices = array.array('i') indptr = array.array('i', [0]) for e in estimators: @@ -135,7 +135,7 @@ def predict_ovr(estimators, label_binarizer, X): indptr.append(len(indices)) data = np.ones(len(indices), dtype=int) indicator = sp.csc_matrix((data, indices, indptr), - shape=(len_X, len(estimators))) + shape=(n_samples, len(estimators))) return label_binarizer.inverse_transform(indicator) From 986b43be57096561591da8ed759655e929ca16a0 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Mon, 7 Jul 2014 16:09:33 -0400 
Subject: [PATCH 27/54] swithced to a dense label binarizer in the three class multiclass case --- sklearn/multiclass.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 0fdb6977d5d50..a8ee8bab2b416 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -42,6 +42,8 @@ from .preprocessing import LabelBinarizer from .metrics.pairwise import euclidean_distances from .utils import check_random_state +from .utils.multiclass import type_of_target +from .utils.multiclass import unique_labels from .externals.joblib import Parallel from .externals.joblib import delayed @@ -86,7 +88,11 @@ def fit_ovr(estimator, X, y, n_jobs=1): """Fit a one-vs-the-rest strategy.""" _check_estimator(estimator) - lb = LabelBinarizer(sparse_output=True) + if (type_of_target(y).startswith("multiclass") and + len(unique_labels(y)) >=3): + lb = LabelBinarizer(sparse_output=True) + else: + lb = LabelBinarizer(sparse_output=True) Y = lb.fit_transform(y) From 2454009c5d565e0b8bab358ddea22a51f1b7ae64 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Mon, 7 Jul 2014 16:13:50 -0400 Subject: [PATCH 28/54] Corrected True-> False back from trials --- sklearn/multiclass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index a8ee8bab2b416..be20d3fe7191f 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -90,7 +90,7 @@ def fit_ovr(estimator, X, y, n_jobs=1): if (type_of_target(y).startswith("multiclass") and len(unique_labels(y)) >=3): - lb = LabelBinarizer(sparse_output=True) + lb = LabelBinarizer(sparse_output=False) else: lb = LabelBinarizer(sparse_output=True) From 86fa71909c298960ded1608b1a5b8005508afff3 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Mon, 7 Jul 2014 16:17:21 -0400 Subject: [PATCH 29/54] Corrected multiclass conditional --- sklearn/multiclass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/sklearn/multiclass.py b/sklearn/multiclass.py index be20d3fe7191f..928879ab23b7c 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -89,7 +89,7 @@ def fit_ovr(estimator, X, y, n_jobs=1): _check_estimator(estimator) if (type_of_target(y).startswith("multiclass") and - len(unique_labels(y)) >=3): + len(unique_labels(y)) == 3): lb = LabelBinarizer(sparse_output=False) else: lb = LabelBinarizer(sparse_output=True) From 350ccc3cb6c94213f0cf858987c1883f30a3deb2 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Mon, 7 Jul 2014 17:19:30 -0400 Subject: [PATCH 30/54] Documentation on working of _get_col --- sklearn/multiclass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 928879ab23b7c..d0b48feecc48e 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -112,7 +112,7 @@ def fit_ovr(estimator, X, y, n_jobs=1): def _get_col(Y, i): - """Y is CSC matrix, i is the column index. Returns the dense column.""" + """Y is CSC matrix, i is the column. 
Returns a dense binary column.""" c = np.zeros(Y.shape[0], dtype=int) c[Y.indices[Y.indptr[i]:Y.indptr[i+1]]] = Y.data[Y.indptr[i]:Y.indptr[i+1]] return c From 9a7635fcadbab13419fb252405dbbe04507fcb0b Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 8 Jul 2014 09:25:45 -0400 Subject: [PATCH 31/54] _get_col => getcol, formating revisions in fit_ovr --- sklearn/multiclass.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index d0b48feecc48e..0132509c10d7e 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -87,18 +87,14 @@ def _check_estimator(estimator): def fit_ovr(estimator, X, y, n_jobs=1): """Fit a one-vs-the-rest strategy.""" _check_estimator(estimator) - - if (type_of_target(y).startswith("multiclass") and - len(unique_labels(y)) == 3): - lb = LabelBinarizer(sparse_output=False) - else: - lb = LabelBinarizer(sparse_output=True) - + sparse_putput = (type_of_target(y).startswith("multiclass") and + len(unique_labels(y)) == 3) + lb = LabelBinarizer(sparse_output=sparse_putput) Y = lb.fit_transform(y) if sp.issparse(Y): Y = Y.tocsc() - columns = (_get_col(Y, i) for i in range(Y.shape[1])) + columns = (Y.getcol(i).toarray().ravel() for i in range(Y.shape[1])) else: columns = Y.T estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) @@ -111,13 +107,6 @@ def fit_ovr(estimator, X, y, n_jobs=1): return estimators, lb -def _get_col(Y, i): - """Y is CSC matrix, i is the column. 
Returns a dense binary column.""" - c = np.zeros(Y.shape[0], dtype=int) - c[Y.indices[Y.indptr[i]:Y.indptr[i+1]]] = Y.data[Y.indptr[i]:Y.indptr[i+1]] - return c - - def predict_ovr(estimators, label_binarizer, X): """Make predictions using the one-vs-the-rest strategy.""" e = estimators[0] From 6805704623b0c7c993665ea5f47aa525c3b43087 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 8 Jul 2014 11:05:10 -0400 Subject: [PATCH 32/54] Removed special handling for multiclass case in fit_ovr --- sklearn/multiclass.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 0132509c10d7e..2ba3b6124cf56 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -87,9 +87,7 @@ def _check_estimator(estimator): def fit_ovr(estimator, X, y, n_jobs=1): """Fit a one-vs-the-rest strategy.""" _check_estimator(estimator) - sparse_putput = (type_of_target(y).startswith("multiclass") and - len(unique_labels(y)) == 3) - lb = LabelBinarizer(sparse_output=sparse_putput) + lb = LabelBinarizer(sparse_output=True) Y = lb.fit_transform(y) if sp.issparse(Y): @@ -103,10 +101,16 @@ def fit_ovr(estimator, X, y, n_jobs=1): column, classes=["not %s" % i, i]) for i, column in enumerate(columns)) - return estimators, lb +def _get_col(Y, i): + """Y is CSC matrix, i is the column. 
Returns a dense binary column.""" + c = np.zeros(Y.shape[0], dtype=int) + c[Y.indices[Y.indptr[i]:Y.indptr[i+1]]] = Y.data[Y.indptr[i]:Y.indptr[i+1]] + return c + + def predict_ovr(estimators, label_binarizer, X): """Make predictions using the one-vs-the-rest strategy.""" e = estimators[0] From 63c3b58a66c234bbe5a68b33c8bc3c9d4d445c09 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Wed, 9 Jul 2014 21:26:19 -0400 Subject: [PATCH 33/54] Supressed SparseEfficienyWarning by writing ignore_warning_class helper --- sklearn/tests/test_multiclass.py | 7 +++++-- sklearn/utils/testing.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 1abec23b0d0db..346f0963c0f3a 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -9,6 +9,7 @@ from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_warns +from sklearn.utils.testing import ignore_warning_class from sklearn.utils.testing import assert_greater from sklearn.multiclass import OneVsRestClassifier @@ -113,7 +114,8 @@ def test_ovr_always_present(): [[int(i >= 5), 2, 3] for i in range(10)] ovr = OneVsRestClassifier(LogisticRegression()) - assert_warns(UserWarning, ovr.fit, X, y) + assert_warns(UserWarning, ignore_warning_class, sp.SparseEfficiencyWarning, + ovr.fit, X, y) y_pred = ovr.predict(X) assert_array_equal(np.array(y_pred), np.array(y)) y_pred = ovr.decision_function(X) @@ -125,7 +127,8 @@ def test_ovr_always_present(): y = np.zeros((10, 2)) y[5:, 0] = 1 # variable label ovr = OneVsRestClassifier(LogisticRegression()) - assert_warns(UserWarning, ovr.fit, X, y) + assert_warns(UserWarning, ignore_warning_class, sp.SparseEfficiencyWarning, + ovr.fit, X, y) y_pred = ovr.predict_proba(X) assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0])) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py 
index f3ee830c64f10..fe3e88451f0b3 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -238,6 +238,15 @@ def assert_no_warnings(func, *args, **kw): return result +def ignore_warning_class(warning_class, func, *args, **kw): + """Returns the result of func and ignores warnings of type warning_class""" + clean_warning_registry() + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=warning_class) + result = func(*args, **kw) + return result + + def ignore_warnings(obj=None): """ Context manager and decorator to ignore warnings @@ -579,6 +588,7 @@ def if_not_mac_os(versions=('10.7', '10.8', '10.9'), """ mac_version, _, _ = platform.mac_ver() skip = '.'.join(mac_version.split('.')[:2]) in versions + def decorator(func): if skip: @wraps(func) From 1dd6e953a1fbf2aa809e72ab51a82037b9e08025 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Thu, 10 Jul 2014 15:00:13 -0400 Subject: [PATCH 34/54] Revised ignore_warning_class to wrapp the function in a way to allow for arguments to be handled externaly --- sklearn/tests/test_multiclass.py | 8 ++++---- sklearn/utils/testing.py | 19 ++++++++++++------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 346f0963c0f3a..e2f5162ac8f6d 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -114,8 +114,8 @@ def test_ovr_always_present(): [[int(i >= 5), 2, 3] for i in range(10)] ovr = OneVsRestClassifier(LogisticRegression()) - assert_warns(UserWarning, ignore_warning_class, sp.SparseEfficiencyWarning, - ovr.fit, X, y) + assert_warns(UserWarning, ignore_warning_class(sp.SparseEfficiencyWarning, + ovr.fit), X, y) y_pred = ovr.predict(X) assert_array_equal(np.array(y_pred), np.array(y)) y_pred = ovr.decision_function(X) @@ -127,8 +127,8 @@ def test_ovr_always_present(): y = np.zeros((10, 2)) y[5:, 0] = 1 # variable label ovr = OneVsRestClassifier(LogisticRegression()) - 
assert_warns(UserWarning, ignore_warning_class, sp.SparseEfficiencyWarning, - ovr.fit, X, y) + assert_warns(UserWarning, ignore_warning_class(sp.SparseEfficiencyWarning, + ovr.fit), X, y) y_pred = ovr.predict_proba(X) assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0])) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index fe3e88451f0b3..8928a288f2ae1 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -238,13 +238,18 @@ def assert_no_warnings(func, *args, **kw): return result -def ignore_warning_class(warning_class, func, *args, **kw): - """Returns the result of func and ignores warnings of type warning_class""" - clean_warning_registry() - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=warning_class) - result = func(*args, **kw) - return result +def ignore_warning_class(warning_class, fn): + """Decorator to catch and hide warnings without visual nesting""" + @wraps(fn) + def wrapper(*args, **kwargs): + # very important to avoid uncontrolled state propagation + clean_warning_registry() + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=warning_class) + return fn(*args, **kwargs) + w[:] = [] + + return wrapper def ignore_warnings(obj=None): From ebdae526e03295063004e54f94ee644fb021bc02 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Thu, 10 Jul 2014 15:17:09 -0400 Subject: [PATCH 35/54] ignore_warning_class Documentation /travis rebuild --- sklearn/utils/testing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 8928a288f2ae1..3d3284e59e2c0 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -239,7 +239,7 @@ def assert_no_warnings(func, *args, **kw): def ignore_warning_class(warning_class, fn): - """Decorator to catch and hide warnings without visual nesting""" + """Wraps function fn and ignore any warnings of type warning_class.""" @wraps(fn) def wrapper(*args, 
**kwargs): # very important to avoid uncontrolled state propagation From 6a50b8236bb518d3db588d6685c2b9ca695891eb Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Fri, 11 Jul 2014 17:36:58 -0400 Subject: [PATCH 36/54] Implemeneted search of all warnings raised in assert_warns --- sklearn/tests/test_multiclass.py | 7 ++----- sklearn/utils/testing.py | 22 +++++----------------- 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index e2f5162ac8f6d..099106f8c3882 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -9,7 +9,6 @@ from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_warns -from sklearn.utils.testing import ignore_warning_class from sklearn.utils.testing import assert_greater from sklearn.multiclass import OneVsRestClassifier @@ -114,8 +113,7 @@ def test_ovr_always_present(): [[int(i >= 5), 2, 3] for i in range(10)] ovr = OneVsRestClassifier(LogisticRegression()) - assert_warns(UserWarning, ignore_warning_class(sp.SparseEfficiencyWarning, - ovr.fit), X, y) + assert_warns(UserWarning, ovr.fit, X, y) y_pred = ovr.predict(X) assert_array_equal(np.array(y_pred), np.array(y)) y_pred = ovr.decision_function(X) @@ -127,8 +125,7 @@ def test_ovr_always_present(): y = np.zeros((10, 2)) y[5:, 0] = 1 # variable label ovr = OneVsRestClassifier(LogisticRegression()) - assert_warns(UserWarning, ignore_warning_class(sp.SparseEfficiencyWarning, - ovr.fit), X, y) + assert_warns(UserWarning, X, y) y_pred = ovr.predict_proba(X) assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0])) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 3d3284e59e2c0..443fcbda67d2c 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -154,10 +154,12 @@ def assert_warns(warning_class, func, *args, **kw): if not len(w) > 0: raise AssertionError("No warning 
raised when calling %s" % func.__name__) + found = False + for W in w: + found = found or W.category is warning_class - if not w[0].category is warning_class: - raise AssertionError("First warning for %s is not a " - "%s( is %s)" + if not found: + raise AssertionError("%s did not give warning: %s( is %s)" % (func.__name__, warning_class, w[0])) return result @@ -238,20 +240,6 @@ def assert_no_warnings(func, *args, **kw): return result -def ignore_warning_class(warning_class, fn): - """Wraps function fn and ignore any warnings of type warning_class.""" - @wraps(fn) - def wrapper(*args, **kwargs): - # very important to avoid uncontrolled state propagation - clean_warning_registry() - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=warning_class) - return fn(*args, **kwargs) - w[:] = [] - - return wrapper - - def ignore_warnings(obj=None): """ Context manager and decorator to ignore warnings From 99dfe1bc29bd659e57838833e62d2a126b430b47 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Fri, 11 Jul 2014 17:46:42 -0400 Subject: [PATCH 37/54] Fixed assert_warns call --- sklearn/tests/test_multiclass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 099106f8c3882..1abec23b0d0db 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -125,7 +125,7 @@ def test_ovr_always_present(): y = np.zeros((10, 2)) y[5:, 0] = 1 # variable label ovr = OneVsRestClassifier(LogisticRegression()) - assert_warns(UserWarning, X, y) + assert_warns(UserWarning, ovr.fit, X, y) y_pred = ovr.predict_proba(X) assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0])) From ec0558c69b69e04b4636c680a482376b67e4c949 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Fri, 11 Jul 2014 17:59:11 -0400 Subject: [PATCH 38/54] Restart OrthogonalMatchingPursuitCV failure --- sklearn/tests/test_multiclass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 1abec23b0d0db..4fe67334cd0d8 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -101,7 +101,7 @@ def test_ovr_fit_predict_sparse(): def test_ovr_always_present(): - """Test that ovr works with classes that are always present or absent + """Test that ovr works with classes that are always present or absent. """ # Note: tests is the case where _ConstantPredictor is utilised X = np.ones((10, 2)) From 01a4cb7e9faf3f32216bc25d6bbfe213953d21f3 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Fri, 11 Jul 2014 18:02:02 -0400 Subject: [PATCH 39/54] Cleaned unsused additions from multiclass.py --- sklearn/multiclass.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 2ba3b6124cf56..b6126dea7af42 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -89,12 +89,8 @@ def fit_ovr(estimator, X, y, n_jobs=1): _check_estimator(estimator) lb = LabelBinarizer(sparse_output=True) Y = lb.fit_transform(y) - - if sp.issparse(Y): - Y = Y.tocsc() - columns = (Y.getcol(i).toarray().ravel() for i in range(Y.shape[1])) - else: - columns = Y.T + Y = Y.tocsc() + columns = (Y.getcol(i).toarray().ravel() for i in range(Y.shape[1])) estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) (estimator, X, @@ -104,13 +100,6 @@ def fit_ovr(estimator, X, y, n_jobs=1): return estimators, lb -def _get_col(Y, i): - """Y is CSC matrix, i is the column. 
Returns a dense binary column.""" - c = np.zeros(Y.shape[0], dtype=int) - c[Y.indices[Y.indptr[i]:Y.indptr[i+1]]] = Y.data[Y.indptr[i]:Y.indptr[i+1]] - return c - - def predict_ovr(estimators, label_binarizer, X): """Make predictions using the one-vs-the-rest strategy.""" e = estimators[0] From 91d93545b98e55af021b2cb02c721d33f34abf7c Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Mon, 14 Jul 2014 09:41:23 -0400 Subject: [PATCH 40/54] Updated documentation in multiclass.py --- sklearn/multiclass.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index b6126dea7af42..ff4210e1fe402 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -216,9 +216,9 @@ def fit(self, X, y): X : {array-like, sparse matrix}, shape = [n_samples, n_features] Data. - y : array-like, shape = [n_samples] or [n_samples, n_classes] - Multi-class targets. An indicator matrix turns on multilabel - classification. + y : {array-like, sparse matrix}, shape = [n_samples] or + [n_samples, n_classes] Multi-class targets. An indicator matrix + turns on multilabel classification. Returns ------- @@ -268,7 +268,7 @@ def predict_proba(self, X): Returns ------- - T : array-like, shape = [n_samples, n_classes] + T : {array-like, sparse matrix}, shape = [n_samples, n_classes] Returns the probability of the sample for each class in the model, where classes are ordered as they are in `self.classes_`. """ @@ -426,7 +426,7 @@ def fit(self, X, y): X : {array-like, sparse matrix}, shape = [n_samples, n_features] Data. - y : numpy array of shape [n_samples] + y : {array-like, sparse matrix}, shape [n_samples] Multi-class targets. Returns @@ -447,7 +447,7 @@ def predict(self, X): Returns ------- - y : numpy array of shape [n_samples] + y : {array-like, sparse matrix}, shape [n_samples] Predicted multi-class targets. 
""" if not hasattr(self, "estimators_"): From 104ee77bf491d4afd535fb94d30dbe24d2f7410e Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Mon, 14 Jul 2014 09:52:03 -0400 Subject: [PATCH 41/54] Rewrote expression for found in assert_warms idiomatically --- sklearn/utils/testing.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 443fcbda67d2c..80feaa4a9f572 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -154,13 +154,12 @@ def assert_warns(warning_class, func, *args, **kw): if not len(w) > 0: raise AssertionError("No warning raised when calling %s" % func.__name__) - found = False - for W in w: - found = found or W.category is warning_class + + found = any(warning.category is warning_class for warning in w) if not found: raise AssertionError("%s did not give warning: %s( is %s)" - % (func.__name__, warning_class, w[0])) + % (func.__name__, warning_class, w)) return result From b31e9b6bd7357ba10d209668e9b5d218614c5fb8 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Mon, 14 Jul 2014 09:58:16 -0400 Subject: [PATCH 42/54] Edited mention of 2d array output to also include possiblity of sparse matrix --- doc/modules/multiclass.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst index a28652879cba9..b3a67af026a33 100644 --- a/doc/modules/multiclass.rst +++ b/doc/modules/multiclass.rst @@ -33,7 +33,8 @@ by decomposing such problems into binary classification problems. several joint classification tasks. This is a generalization of the multi-label classification task, where the set of classification problem is restricted to binary classification, and of the multi-class - classification task. *The output format is a 2d numpy array.* + classification task. *The output format is a 2d numpy array or sparse + matrix.* The set of labels can be different for each output variable. 
For instance a sample could be assigned "pear" for an output variable that From 37d7b19dfa3545a108acf3748ccec0a2df96ff4b Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Mon, 14 Jul 2014 10:08:07 -0400 Subject: [PATCH 43/54] Removed extra blank lines from assert_warns --- sklearn/utils/testing.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 80feaa4a9f572..195836ccced2d 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -156,11 +156,9 @@ def assert_warns(warning_class, func, *args, **kw): % func.__name__) found = any(warning.category is warning_class for warning in w) - if not found: raise AssertionError("%s did not give warning: %s( is %s)" % (func.__name__, warning_class, w)) - return result From a20cf272a3933386661211c2b13a40a8f08feaa4 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 15 Jul 2014 08:44:46 -0400 Subject: [PATCH 44/54] A collection of small changes, Commented sparse_output = True desicion in fit_ovr, simplified columns expression in fit_ovr, removed lasso estimator from test_ovr_multiclass --- sklearn/multiclass.py | 9 +++++++-- sklearn/tests/test_multiclass.py | 5 ++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index ff4210e1fe402..eb7a82e39ce84 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -44,6 +44,7 @@ from .utils import check_random_state from .utils.multiclass import type_of_target from .utils.multiclass import unique_labels +from .utils.validation import _num_samples from .externals.joblib import Parallel from .externals.joblib import delayed @@ -87,10 +88,14 @@ def _check_estimator(estimator): def fit_ovr(estimator, X, y, n_jobs=1): """Fit a one-vs-the-rest strategy.""" _check_estimator(estimator) + # A sparse LabelBinarizer, with sparse_output=True, has been shown to + # outpreform or match a dense label binarizer in all cases and has also + # resulted in less 
or equal memory consumption in the fit_ovr function + # overall. lb = LabelBinarizer(sparse_output=True) Y = lb.fit_transform(y) Y = Y.tocsc() - columns = (Y.getcol(i).toarray().ravel() for i in range(Y.shape[1])) + columns = (col.toarray().ravel() for col in Y.T) estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) (estimator, X, @@ -115,7 +120,7 @@ def predict_ovr(estimators, label_binarizer, X): argmaxima[maxima == pred] = i return label_binarizer.classes_[np.array(argmaxima.T)] else: - n_samples = X.shape[0] if sp.issparse(X) else len(X) + n_samples = _num_samples(X) indices = array.array('i') indptr = array.array('i', [0]) for e in estimators: diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 4fe67334cd0d8..f4cebd2acc83b 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -101,8 +101,7 @@ def test_ovr_fit_predict_sparse(): def test_ovr_always_present(): - """Test that ovr works with classes that are always present or absent. 
- """ + """Test that ovr works with classes that are always present or absent.""" # Note: tests is the case where _ConstantPredictor is utilised X = np.ones((10, 2)) X[:5, :] = 0 @@ -145,7 +144,7 @@ def test_ovr_multiclass(): for base_clf in (MultinomialNB(), LinearSVC(random_state=0), LinearRegression(), Ridge(), - ElasticNet(), Lasso(alpha=0.5)): + ElasticNet()): clf = OneVsRestClassifier(base_clf).fit(X, y) assert_equal(set(clf.classes_), classes) From 2c15f1ba6b4ee08e673c15bd00d49af8ae4157a4 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 15 Jul 2014 09:13:34 -0400 Subject: [PATCH 45/54] Undo documentation edits to OvO fit and predict --- sklearn/multiclass.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index eb7a82e39ce84..6d3df97419fa2 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -431,7 +431,7 @@ def fit(self, X, y): X : {array-like, sparse matrix}, shape = [n_samples, n_features] Data. - y : {array-like, sparse matrix}, shape [n_samples] + y : numpy array of shape [n_samples] Multi-class targets. Returns @@ -452,7 +452,7 @@ def predict(self, X): Returns ------- - y : {array-like, sparse matrix}, shape [n_samples] + y : numpy array of shape [n_samples] Predicted multi-class targets. 
""" if not hasattr(self, "estimators_"): From 09f4b259152e38056a51e4a00b01f61afd979f2f Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 15 Jul 2014 09:45:35 -0400 Subject: [PATCH 46/54] Document the y_type_ attribute of LabelBinarizer In addition, fix some small typos in the function test_label_binarize_binary --- sklearn/preprocessing/label.py | 7 +++++++ sklearn/preprocessing/tests/test_label.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index d91db1038d637..e7834988f3f83 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -201,6 +201,13 @@ class LabelBinarizer(BaseEstimator, TransformerMixin): `classes_` : array of shape [n_class] Holds the label for each class. + `y_type_` : str, + Represents the type of the target data as evaluated by + utils.multiclass.type_of_target. Possible types are 'continuous', + 'continuous-multioutput', 'binary', 'multiclass', + 'multiclass-multioutput', 'multilabel-sequences', + 'multilabel-indicator', and 'unknown'. + `multilabel_` : boolean True if the transformer was fitted on a multilabel rather than a multiclass set of labels. 
The multilabel_ attribute is deprecated diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 273c5997f91c8..da4d9530d6acf 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -467,7 +467,7 @@ def test_label_binarize_binary(): yield check_binarized_results, y, classes, pos_label, neg_label, expected - # Binary case where sparse_output = True will no result in a Value Error + # Binary case where sparse_output = True will not result in a ValueError y = [0, 1, 0] classes = [0, 1] pos_label = 3 From 17407a82db477b6a5044be1143f23a256291bed2 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 15 Jul 2014 11:09:16 -0400 Subject: [PATCH 47/54] Comment fit_ovr and predict_ovr as public functions Insert check in predict_ovr to enforce all estimators are of the same type, if not throw a value error. --- sklearn/multiclass.py | 50 +++++++++++++++++++++++++++++--- sklearn/tests/test_multiclass.py | 6 ++++ 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 6d3df97419fa2..36270370d040c 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -86,7 +86,25 @@ def _check_estimator(estimator): def fit_ovr(estimator, X, y, n_jobs=1): - """Fit a one-vs-the-rest strategy.""" + """Fit a list of estimators using a one-vs-the-rest strategy. + + Parameters + ---------- + estimator : estimator object + An estimator object implementing `fit` and one of `decision_function` + or `predict_proba`. + + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + Data. + + y : {array-like, sparse matrix}, shape = [n_samples] or + [n_samples, n_classes] Multi-class targets. An indicator matrix + turns on multilabel classification. 
+ + Returns + ------- + self + """ _check_estimator(estimator) # A sparse LabelBinarizer, with sparse_output=True, has been shown to # outpreform or match a dense label binarizer in all cases and has also @@ -106,7 +124,31 @@ def fit_ovr(estimator, X, y, n_jobs=1): def predict_ovr(estimators, label_binarizer, X): - """Make predictions using the one-vs-the-rest strategy.""" + """Predict multi-class targets using the one vs rest strategy. + + Parameters + ---------- + estimators : list of `n_classes` estimators, Estimators used for + predictions. The list must be homogeneous with respect to the type of + estimators. fit_ovr supplies this list as part of its output. + + label_binarizer : LabelBinarizer object, Object used to transform + multiclass labels to binary labels and vice-versa. fit_ovr supplies + this object as part of its output. + + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + Data. + + Returns + ------- + y : {array-like, sparse matrix}, shape = [n_samples] or + [n_samples, n_classes]. Predicted multi-class targets. + """ + e_types = set([type(e) for e in estimators if not + isinstance(e, _ConstantPredictor)]) + if len(e_types) > 1: + raise ValueError("List of estimators must contain estimators of the" + " same type but contains types {0}".format(e_types)) e = estimators[0] thresh = 0 if hasattr(e, "decision_function") and is_classifier(e) else .5 @@ -247,8 +289,8 @@ def predict(self, X): Returns ------- - y : array-like, shape = [n_samples] - Predicted multi-class targets. + y : {array-like, sparse matrix}, shape = [n_samples] or + [n_samples, n_classes]. Predicted multi-class targets. 
""" self._check_is_fitted() diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index f4cebd2acc83b..7214fe429388e 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -14,10 +14,13 @@ from sklearn.multiclass import OneVsRestClassifier from sklearn.multiclass import OneVsOneClassifier from sklearn.multiclass import OutputCodeClassifier +from sklearn.multiclass import predict_ovr from sklearn.metrics import precision_score from sklearn.metrics import recall_score +from sklearn.preprocessing import LabelBinarizer + from sklearn.svm import LinearSVC from sklearn.naive_bayes import MultinomialNB from sklearn.linear_model import (LinearRegression, Lasso, ElasticNet, Ridge, @@ -41,6 +44,9 @@ def test_ovr_exceptions(): ovr = OneVsRestClassifier(LinearSVC(random_state=0)) assert_raises(ValueError, ovr.predict, []) + assert_raises(ValueError, predict_ovr, [LinearSVC(), MultinomialNB()], + LabelBinarizer(), []) + def test_ovr_fit_predict(): # A classifier which implements decision_function. 
From a3b909a4a9a2eb9a0fd6fb241deb0b4629ea18c4 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 15 Jul 2014 11:53:29 -0400 Subject: [PATCH 48/54] Removed make_mlb rename in test_ovr_fit_predict_sparse --- sklearn/tests/test_multiclass.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 7214fe429388e..386e7b759103a 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -69,15 +69,14 @@ def test_ovr_fit_predict_sparse(): sp.lil_matrix]: base_clf = MultinomialNB(alpha=1) - make_mlb = datasets.make_multilabel_classification - X, Y = make_mlb(n_samples=100, - n_features=20, - n_classes=5, - n_labels=3, - length=50, - allow_unlabeled=True, - return_indicator=True, - random_state=0) + X, Y = datasets.make_multilabel_classification(n_samples=100, + n_features=20, + n_classes=5, + n_labels=3, + length=50, + allow_unlabeled=True, + return_indicator=True, + random_state=0) X_train, Y_train = X[:80], Y[:80] X_test, Y_test = X[80:], Y[80:] From 911bff286781f57c9864fe82abad19ddb6bd7408 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Tue, 15 Jul 2014 14:02:59 -0400 Subject: [PATCH 49/54] Use lb.classes_ in fit_ovr to maintain class dtype --- sklearn/multiclass.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 36270370d040c..c7cccaca4d05d 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -118,7 +118,8 @@ def fit_ovr(estimator, X, y, n_jobs=1): (estimator, X, column, - classes=["not %s" % i, i]) + classes=["not %s" % i, + lb.classes_[i]]) for i, column in enumerate(columns)) return estimators, lb From 80f57f28b247c4d6de25bcee35dfcf9e5ab3a55b Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Wed, 16 Jul 2014 10:09:15 -0400 Subject: [PATCH 50/54] Comment on n_jobs > 1 in fit_ovr --- sklearn/multiclass.py | 3 +++ 1 file changed, 3 insertions(+) 
diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index c7cccaca4d05d..8635e0ba10502 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -114,6 +114,9 @@ def fit_ovr(estimator, X, y, n_jobs=1): Y = lb.fit_transform(y) Y = Y.tocsc() columns = (col.toarray().ravel() for col in Y.T) + # In cases where indivdual estimators are very fast to train setting + # n_jobs > 1 in can results in slower performance due to the overhead + # of spawning threads. estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) (estimator, X, From dbc67afb31e4fe35d847e46df603463d6dfefafa Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Wed, 16 Jul 2014 10:58:09 -0400 Subject: [PATCH 51/54] LabelBinarizer and ovr fail multioutput, test binary ovr Raise a ValueError in the label binarizer when dealing with multioutput target data, test a ValueError is raised in ovr for multioutput target data. Test a binary classification task with ovr --- sklearn/preprocessing/label.py | 7 +++++ sklearn/preprocessing/tests/test_label.py | 5 +++ sklearn/tests/test_multiclass.py | 37 ++++++++++++++++++++++- 3 files changed, 48 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index e7834988f3f83..5f6d7a0badd2e 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -308,6 +308,10 @@ def fit(self, y): self : returns an instance of self. 
""" self.y_type_ = type_of_target(y) + if 'multioutput' in self.y_type_: + raise ValueError("Multioutput target data is not supported with " + "label binarization") + self.sparse_input_ = sp.issparse(y) self.classes_ = unique_labels(y) return self @@ -469,6 +473,9 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, pos_label = -neg_label y_type = type_of_target(y) + if 'multioutput' in y_type: + raise ValueError("Multioutput target data is not supported with label " + "binarization") n_samples = y.shape[0] if sp.issparse(y) else len(y) n_classes = len(classes) diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index da4d9530d6acf..826eaec9fc0a9 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -194,6 +194,11 @@ def test_label_binarizer_errors(): y=np.array([[1, 2, 3], [2, 1, 3]]), output_type="binary", classes=[1, 2, 3], threshold=0) + # Fail on multioutput data + assert_raises(ValueError, LabelBinarizer().fit, np.array([[1, 3], [2, 1]])) + assert_raises(ValueError, label_binarize, np.array([[1, 3], [2, 1]]), + [1, 2, 3]) + def test_label_encoder(): """Test LabelEncoder's transform and inverse_transform methods""" diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 386e7b759103a..36a410942bf58 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -15,6 +15,7 @@ from sklearn.multiclass import OneVsOneClassifier from sklearn.multiclass import OutputCodeClassifier from sklearn.multiclass import predict_ovr +from sklearn.multiclass import fit_ovr from sklearn.metrics import precision_score from sklearn.metrics import recall_score @@ -47,6 +48,14 @@ def test_ovr_exceptions(): assert_raises(ValueError, predict_ovr, [LinearSVC(), MultinomialNB()], LabelBinarizer(), []) + # Fail on multioutput data + assert_raises(ValueError, fit_ovr, MultinomialNB(), + np.array([[1, 0], [0, 1]]), + 
np.array([[1, 2], [3, 1]])) + assert_raises(ValueError, fit_ovr, MultinomialNB(), + np.array([[1, 0], [0, 1]]), + np.array([[1.5, 2.4], [3.1, 0.8]])) + def test_ovr_fit_predict(): # A classifier which implements decision_function. @@ -138,7 +147,6 @@ def test_ovr_multiclass(): # Toy dataset where features correspond directly to labels. X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]]) y = ["eggs", "spam", "ham", "eggs", "ham"] - # y = [[1, 2], [1], [0, 1, 2], [0, 2], [0]] Y = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0], @@ -162,6 +170,33 @@ def test_ovr_multiclass(): assert_array_equal(y_pred, [0, 0, 1]) +def test_ovr_binary(): + # Toy dataset where features correspond directly to labels. + X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]]) + y = ["eggs", "spam", "spam", "eggs", "spam"] + Y = np.array([[0], + [1], + [1], + [0], + [1]]) + + classes = set("eggs spam".split()) + + for base_clf in (MultinomialNB(), LinearSVC(random_state=0), + LinearRegression(), Ridge(), + ElasticNet()): + + clf = OneVsRestClassifier(base_clf).fit(X, y) + assert_equal(set(clf.classes_), classes) + y_pred = clf.predict(np.array([[0, 0, 4]]))[0] + assert_equal(set(y_pred), set("eggs")) + + # test input as label indicator matrix + clf = OneVsRestClassifier(base_clf).fit(X, Y) + y_pred = clf.predict([[3, 0, 0]])[0] + assert_equal(y_pred, 1) + + def test_ovr_multilabel(): # Toy dataset where features correspond directly to labels. 
X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]]) From 26d63c30070a65eee1493e2fd11e3566340b88b4 Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Thu, 17 Jul 2014 08:38:50 -0400 Subject: [PATCH 52/54] Fit binary target data on one line --- sklearn/tests/test_multiclass.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index 36a410942bf58..4fef34a901bd8 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -174,11 +174,7 @@ def test_ovr_binary(): # Toy dataset where features correspond directly to labels. X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]]) y = ["eggs", "spam", "spam", "eggs", "spam"] - Y = np.array([[0], - [1], - [1], - [0], - [1]]) + Y = np.array([[0, 1, 1, 0, 1]]).T classes = set("eggs spam".split()) From 855fdd149bb0bbd6b5bd7a0c8ca8827cc845b52e Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Thu, 17 Jul 2014 08:53:22 -0400 Subject: [PATCH 53/54] Fix typo individual --- sklearn/multiclass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 8635e0ba10502..5c57e4b010f0e 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -114,7 +114,7 @@ def fit_ovr(estimator, X, y, n_jobs=1): Y = lb.fit_transform(y) Y = Y.tocsc() columns = (col.toarray().ravel() for col in Y.T) - # In cases where indivdual estimators are very fast to train setting + # In cases where individual estimators are very fast to train setting # n_jobs > 1 in can results in slower performance due to the overhead # of spawning threads. 
estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) From e1dc47033364a76126c92f81d152a45c5452b5fd Mon Sep 17 00:00:00 2001 From: Hamzeh Alsalhi Date: Thu, 17 Jul 2014 09:43:54 -0400 Subject: [PATCH 54/54] Untab overindented line in predict docstring --- sklearn/multiclass.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 5c57e4b010f0e..ccdf733b6b0b2 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -293,8 +293,8 @@ def predict(self, X): Returns ------- - y : {array-like, sparse matrix}, shape = [n_samples] or - [n_samples, n_classes]. Predicted multi-class targets. + y : {array-like, sparse matrix}, shape = [n_samples] or + [n_samples, n_classes]. Predicted multi-class targets. """ self._check_is_fitted()