diff --git a/doc/modules/multiclass.rst b/doc/modules/multiclass.rst
index a28652879cba9..b3a67af026a33 100644
--- a/doc/modules/multiclass.rst
+++ b/doc/modules/multiclass.rst
@@ -33,7 +33,8 @@ by decomposing such problems into binary classification problems.
     several joint classification tasks. This is a generalization
     of the multi-label classification task, where the set of classification
     problem is restricted to binary classification, and of the multi-class
-    classification task. *The output format is a 2d numpy array.*
+    classification task. *The output format is a 2d numpy array or sparse
+    matrix.*
 
     The set of labels can be different for each output variable.
     For instance a sample could be assigned "pear" for an output variable that
diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py
index 0f88a7fae08e2..ccdf733b6b0b2 100644
--- a/sklearn/multiclass.py
+++ b/sklearn/multiclass.py
@@ -32,14 +32,19 @@
 #
 # License: BSD 3 clause
 
+import array
 import numpy as np
 import warnings
+import scipy.sparse as sp
 
 from .base import BaseEstimator, ClassifierMixin, clone, is_classifier
 from .base import MetaEstimatorMixin
 from .preprocessing import LabelBinarizer
 from .metrics.pairwise import euclidean_distances
 from .utils import check_random_state
+from .utils.multiclass import type_of_target
+from .utils.multiclass import unique_labels
+from .utils.validation import _num_samples
 from .externals.joblib import Parallel
 from .externals.joblib import delayed
 
@@ -81,24 +86,96 @@ def _check_estimator(estimator):
 
 
 def fit_ovr(estimator, X, y, n_jobs=1):
-    """Fit a one-vs-the-rest strategy."""
-    _check_estimator(estimator)
+    """Fit a list of estimators using a one-vs-the-rest strategy.
 
-    lb = LabelBinarizer()
-    Y = lb.fit_transform(y)
+    Parameters
+    ----------
+    estimator : estimator object
+        An estimator object implementing `fit` and one of `decision_function`
+        or `predict_proba`.
 
-    estimators = Parallel(n_jobs=n_jobs)(
-        delayed(_fit_binary)(estimator, X, Y[:, i], classes=["not %s" % i, i])
-        for i in range(Y.shape[1]))
+    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+        Data.
+
+    y : {array-like, sparse matrix}, shape = [n_samples] or
+        [n_samples, n_classes] Multi-class targets. An indicator matrix
+        turns on multilabel classification.
+
+    Returns
+    -------
+    estimators, lb : list of fitted estimators, fitted LabelBinarizer
+    """
+    _check_estimator(estimator)
+    # A sparse LabelBinarizer, with sparse_output=True, has been shown to
+    # outperform or match a dense label binarizer in all cases and has also
+    # resulted in less or equal memory consumption in the fit_ovr function
+    # overall.
+    lb = LabelBinarizer(sparse_output=True)
+    Y = lb.fit_transform(y)
+    Y = Y.tocsc()
+    columns = (col.toarray().ravel() for col in Y.T)
+    # In cases where individual estimators are very fast to train, setting
+    # n_jobs > 1 can result in slower performance due to the overhead
+    # of spawning threads.
+    estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary)
+                                         (estimator,
+                                          X,
+                                          column,
+                                          classes=["not %s" % i,
+                                                   lb.classes_[i]])
+                                         for i, column in enumerate(columns))
     return estimators, lb
 
 
 def predict_ovr(estimators, label_binarizer, X):
-    """Make predictions using the one-vs-the-rest strategy."""
-    Y = np.array([_predict_binary(e, X) for e in estimators])
+    """Predict multi-class targets using the one vs rest strategy.
+
+    Parameters
+    ----------
+    estimators : list of `n_classes` estimators, Estimators used for
+        predictions. The list must be homogeneous with respect to the type of
+        estimators. fit_ovr supplies this list as part of its output.
+
+    label_binarizer : LabelBinarizer object, Object used to transform
+        multiclass labels to binary labels and vice-versa. fit_ovr supplies
+        this object as part of its output.
+
+    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
+        Data.
+
+    Returns
+    -------
+    y : {array-like, sparse matrix}, shape = [n_samples] or
+        [n_samples, n_classes]. Predicted multi-class targets.
+    """
+    e_types = set([type(e) for e in estimators if not
+                   isinstance(e, _ConstantPredictor)])
+    if len(e_types) > 1:
+        raise ValueError("List of estimators must contain estimators of the"
+                         " same type but contains types {0}".format(e_types))
     e = estimators[0]
     thresh = 0 if hasattr(e, "decision_function") and is_classifier(e) else .5
-    return label_binarizer.inverse_transform(Y.T, threshold=thresh)
+
+    if label_binarizer.y_type_ == "multiclass":
+        maxima = np.empty(X.shape[0], dtype=float)
+        maxima.fill(-np.inf)
+        argmaxima = np.zeros(X.shape[0], dtype=int)
+        for i, e in enumerate(estimators):
+            pred = _predict_binary(e, X)
+            np.maximum(maxima, pred, out=maxima)
+            argmaxima[maxima == pred] = i
+        return label_binarizer.classes_[np.array(argmaxima.T)]
+    else:
+        n_samples = _num_samples(X)
+        indices = array.array('i')
+        indptr = array.array('i', [0])
+        for e in estimators:
+            indices.extend(np.where(_predict_binary(e, X) > thresh)[0])
+            indptr.append(len(indices))
+        data = np.ones(len(indices), dtype=int)
+        indicator = sp.csc_matrix((data, indices, indptr),
+                                  shape=(n_samples, len(estimators)))
+        return label_binarizer.inverse_transform(indicator)
 
 
 def predict_proba_ovr(estimators, X, is_multilabel):
@@ -190,9 +267,9 @@ def fit(self, X, y):
         X : {array-like, sparse matrix}, shape = [n_samples, n_features]
             Data.
 
-        y : array-like, shape = [n_samples] or [n_samples, n_classes]
-            Multi-class targets. An indicator matrix turns on multilabel
-            classification.
+        y : {array-like, sparse matrix}, shape = [n_samples] or
+            [n_samples, n_classes] Multi-class targets. An indicator matrix
+            turns on multilabel classification.
 
         Returns
         -------
@@ -216,8 +293,8 @@ def predict(self, X):
 
         Returns
         -------
-        y : array-like, shape = [n_samples]
-            Predicted multi-class targets.
+        y : {array-like, sparse matrix}, shape = [n_samples] or
+            [n_samples, n_classes]. Predicted multi-class targets.
         """
         self._check_is_fitted()
 
@@ -242,7 +319,7 @@ def predict_proba(self, X):
 
         Returns
         -------
-        T : array-like, shape = [n_samples, n_classes]
+        T : {array-like, sparse matrix}, shape = [n_samples, n_classes]
             Returns the probability of the sample for each class in the model,
             where classes are ordered as they are in `self.classes_`.
         """
@@ -271,7 +348,7 @@ def decision_function(self, X):
     @property
     def multilabel_(self):
         """Whether this is a multilabel classifier"""
-        return self.label_binarizer_.multilabel_
+        return self.label_binarizer_.y_type_.startswith('multilabel')
 
     def score(self, X, y):
         if self.multilabel_:
diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index 4c8102160a98b..5f6d7a0badd2e 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -201,6 +201,13 @@ class LabelBinarizer(BaseEstimator, TransformerMixin):
     `classes_` : array of shape [n_class]
         Holds the label for each class.
 
+    `y_type_` : str,
+        Represents the type of the target data as evaluated by
+        utils.multiclass.type_of_target. Possible types are 'continuous',
+        'continuous-multioutput', 'binary', 'multiclass',
+        'multiclass-multioutput', 'multilabel-sequences',
+        'multilabel-indicator', and 'unknown'.
+
     `multilabel_` : boolean
         True if the transformer was fitted on a multilabel rather than a
         multiclass set of labels. The multilabel_ attribute is deprecated
@@ -301,6 +308,10 @@ def fit(self, y):
         self : returns an instance of self.
         """
         self.y_type_ = type_of_target(y)
+        if 'multioutput' in self.y_type_:
+            raise ValueError("Multioutput target data is not supported with "
+                             "label binarization")
+
         self.sparse_input_ = sp.issparse(y)
         self.classes_ = unique_labels(y)
         return self
@@ -462,6 +473,9 @@ def label_binarize(y, classes, neg_label=0, pos_label=1,
         pos_label = -neg_label
 
     y_type = type_of_target(y)
+    if 'multioutput' in y_type:
+        raise ValueError("Multioutput target data is not supported with label "
+                         "binarization")
 
     n_samples = y.shape[0] if sp.issparse(y) else len(y)
     n_classes = len(classes)
@@ -517,6 +531,8 @@ def label_binarize(y, classes, neg_label=0, pos_label=1,
 
         if pos_switch:
             Y[Y == pos_label] = 0
+    else:
+        Y.data = astype(Y.data, int, copy=False)
 
     # preserve label ordering
     if np.any(classes != sorted_class):
@@ -524,7 +540,10 @@ def label_binarize(y, classes, neg_label=0, pos_label=1,
         Y = Y[:, indices]
 
     if y_type == "binary":
-        Y = Y[:, -1].reshape((-1, 1))
+        if sparse_output:
+            Y = Y.getcol(-1)
+        else:
+            Y = Y[:, -1].reshape((-1, 1))
 
     return Y
 
@@ -600,6 +619,8 @@ def _inverse_binarize_thresholding(y, output_type, classes, threshold):
 
     # Inverse transform data
     if output_type == "binary":
+        if sp.issparse(y):
+            y = y.toarray()
         if y.ndim == 2 and y.shape[1] == 2:
             return classes[y[:, 1]]
         else:
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
index 2cc786605f380..826eaec9fc0a9 100644
--- a/sklearn/preprocessing/tests/test_label.py
+++ b/sklearn/preprocessing/tests/test_label.py
@@ -194,6 +194,11 @@ def test_label_binarizer_errors():
                   y=np.array([[1, 2, 3], [2, 1, 3]]), output_type="binary",
                   classes=[1, 2, 3], threshold=0)
 
+    # Fail on multioutput data
+    assert_raises(ValueError, LabelBinarizer().fit, np.array([[1, 3], [2, 1]]))
+    assert_raises(ValueError, label_binarize, np.array([[1, 3], [2, 1]]),
+                  [1, 2, 3])
+
 
 def test_label_encoder():
     """Test LabelEncoder's transform and inverse_transform methods"""
@@ -467,6 +472,15 @@ def test_label_binarize_binary():
 
     yield check_binarized_results, y, classes, pos_label, neg_label, expected
 
+    # Binary case where sparse_output = True will not result in a ValueError
+    y = [0, 1, 0]
+    classes = [0, 1]
+    pos_label = 3
+    neg_label = 0
+    expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1))
+
+    yield check_binarized_results, y, classes, pos_label, neg_label, expected
+
 
 def test_label_binarize_multiclass():
     y = [0, 1, 2]
diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py
index 0b23951bc8d8f..4fef34a901bd8 100644
--- a/sklearn/tests/test_multiclass.py
+++ b/sklearn/tests/test_multiclass.py
@@ -1,5 +1,6 @@
 import numpy as np
 import warnings
+import scipy.sparse as sp
 
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_equal
@@ -13,10 +14,14 @@
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.multiclass import OneVsOneClassifier
 from sklearn.multiclass import OutputCodeClassifier
+from sklearn.multiclass import predict_ovr
+from sklearn.multiclass import fit_ovr
 
 from sklearn.metrics import precision_score
 from sklearn.metrics import recall_score
 
+from sklearn.preprocessing import LabelBinarizer
+
 from sklearn.svm import LinearSVC
 from sklearn.naive_bayes import MultinomialNB
 from sklearn.linear_model import (LinearRegression, Lasso, ElasticNet, Ridge,
@@ -40,6 +45,17 @@ def test_ovr_exceptions():
     ovr = OneVsRestClassifier(LinearSVC(random_state=0))
     assert_raises(ValueError, ovr.predict, [])
 
+    assert_raises(ValueError, predict_ovr, [LinearSVC(), MultinomialNB()],
+                  LabelBinarizer(), [])
+
+    # Fail on multioutput data
+    assert_raises(ValueError, fit_ovr, MultinomialNB(),
+                  np.array([[1, 0], [0, 1]]),
+                  np.array([[1, 2], [3, 1]]))
+    assert_raises(ValueError, fit_ovr, MultinomialNB(),
+                  np.array([[1, 0], [0, 1]]),
+                  np.array([[1.5, 2.4], [3.1, 0.8]]))
+
 
 def test_ovr_fit_predict():
     # A classifier which implements decision_function.
@@ -57,9 +73,49 @@ def test_ovr_fit_predict():
     assert_greater(np.mean(iris.target == pred), 0.65)
 
 
+def test_ovr_fit_predict_sparse():
+    for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix,
+                   sp.lil_matrix]:
+        base_clf = MultinomialNB(alpha=1)
+
+        X, Y = datasets.make_multilabel_classification(n_samples=100,
+                                                       n_features=20,
+                                                       n_classes=5,
+                                                       n_labels=3,
+                                                       length=50,
+                                                       allow_unlabeled=True,
+                                                       return_indicator=True,
+                                                       random_state=0)
+
+        X_train, Y_train = X[:80], Y[:80]
+        X_test, Y_test = X[80:], Y[80:]
+
+        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
+        Y_pred = clf.predict(X_test)
+
+        clf_sprs = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train))
+        Y_pred_sprs = clf_sprs.predict(X_test)
+
+        assert_true(clf.multilabel_)
+        assert_true(sp.issparse(Y_pred_sprs))
+        assert_array_equal(Y_pred_sprs.toarray(), Y_pred)
+
+        # Test predict_proba
+        Y_proba = clf_sprs.predict_proba(X_test)
+
+        # predict assigns a label if the probability that the
+        # sample has the label is greater than 0.5.
+        pred = Y_proba > .5
+        assert_array_equal(pred, Y_pred_sprs.toarray())
+
+        # Test decision_function
+        clf_sprs = OneVsRestClassifier(svm.SVC()).fit(X_train, sparse(Y_train))
+        dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int)
+        assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray())
+
+
 def test_ovr_always_present():
-    """Test that ovr works with classes that are always present or absent
-    """
+    """Test that ovr works with classes that are always present or absent."""
     # Note: tests is the case where _ConstantPredictor is utilised
     X = np.ones((10, 2))
     X[:5, :] = 0
@@ -87,12 +143,62 @@ def test_ovr_always_present():
     assert_array_equal(y_pred[:, -1], np.zeros(X.shape[0]))
 
 
+def test_ovr_multiclass():
+    # Toy dataset where features correspond directly to labels.
+    X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
+    y = ["eggs", "spam", "ham", "eggs", "ham"]
+    Y = np.array([[0, 0, 1],
+                  [0, 1, 0],
+                  [1, 0, 0],
+                  [0, 0, 1],
+                  [1, 0, 0]])
+
+    classes = set("ham eggs spam".split())
+
+    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
+                     LinearRegression(), Ridge(),
+                     ElasticNet()):
+
+        clf = OneVsRestClassifier(base_clf).fit(X, y)
+        assert_equal(set(clf.classes_), classes)
+        y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
+        assert_equal(set(y_pred), set("eggs"))
+
+        # test input as label indicator matrix
+        clf = OneVsRestClassifier(base_clf).fit(X, Y)
+        y_pred = clf.predict([[0, 0, 4]])[0]
+        assert_array_equal(y_pred, [0, 0, 1])
+
+
+def test_ovr_binary():
+    # Toy dataset where features correspond directly to labels.
+    X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
+    y = ["eggs", "spam", "spam", "eggs", "spam"]
+    Y = np.array([[0, 1, 1, 0, 1]]).T
+
+    classes = set("eggs spam".split())
+
+    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
+                     LinearRegression(), Ridge(),
+                     ElasticNet()):
+
+        clf = OneVsRestClassifier(base_clf).fit(X, y)
+        assert_equal(set(clf.classes_), classes)
+        y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
+        assert_equal(set(y_pred), set("eggs"))
+
+        # test input as label indicator matrix
+        clf = OneVsRestClassifier(base_clf).fit(X, Y)
+        y_pred = clf.predict([[3, 0, 0]])[0]
+        assert_equal(y_pred, 1)
+
+
 def test_ovr_multilabel():
     # Toy dataset where features correspond directly to labels.
     X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]])
     y = [["spam", "eggs"], ["spam"], ["ham", "eggs", "spam"],
          ["ham", "eggs"], ["ham"]]
-    #y = [[1, 2], [1], [0, 1, 2], [0, 2], [0]]
+    # y = [[1, 2], [1], [0, 1, 2], [0, 2], [0]]
     Y = np.array([[0, 1, 1],
                   [0, 1, 0],
                   [1, 1, 1],
@@ -142,6 +248,7 @@ def test_ovr_multilabel_dataset():
         X_test, Y_test = X[80:], Y[80:]
         clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
         Y_pred = clf.predict(X_test)
+
         assert_true(clf.multilabel_)
         assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"),
                             prec,
@@ -379,3 +486,8 @@ def test_ecoc_gridsearch():
     cv.fit(iris.data, iris.target)
     best_C = cv.best_estimator_.estimators_[0].C
     assert_true(best_C in Cs)
+
+
+if __name__ == "__main__":
+    import nose
+    nose.runmodule()
diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py
index f3ee830c64f10..195836ccced2d 100644
--- a/sklearn/utils/testing.py
+++ b/sklearn/utils/testing.py
@@ -155,11 +155,10 @@ def assert_warns(warning_class, func, *args, **kw):
             raise AssertionError("No warning raised when calling %s"
                                  % func.__name__)
 
-        if not w[0].category is warning_class:
-            raise AssertionError("First warning for %s is not a "
-                                 "%s( is %s)"
-                                 % (func.__name__, warning_class, w[0]))
-
+        found = any(warning.category is warning_class for warning in w)
+        if not found:
+            raise AssertionError("%s did not give warning: %s( is %s)"
+                                 % (func.__name__, warning_class, w))
     return result
 
 
@@ -579,6 +578,7 @@ def if_not_mac_os(versions=('10.7', '10.8', '10.9'),
     """
     mac_version, _, _ = platform.mac_ver()
     skip = '.'.join(mac_version.split('.')[:2]) in versions
+
     def decorator(func):
         if skip:
             @wraps(func)