[MRG+1] Sparse One vs. Rest #3276


Closed — wants to merge 54 commits into from

Changes from all commits (54 commits)
a4fe5b0
Modified sparse OvR to handle sparse target data
hamsal Jun 13, 2014
8dbae82
Progress comment
hamsal Jun 13, 2014
4662b95
Introduce multiclass behvior into predict_ovr, minimal testing in tes…
hamsal Jun 13, 2014
0771ba0
Initialized sparse output in fit_ovr LabelBinarizer to be sparse if Y…
hamsal Jun 13, 2014
2a5ebfb
Enforced correct dtype in predict_ovr
hamsal Jun 13, 2014
1275ce5
Included first test for sparse ovr
hamsal Jun 18, 2014
c668fc7
Included first test for sparse ovr
hamsal Jun 18, 2014
94d7d04
Revised sparse ovr predict test
hamsal Jun 19, 2014
63aab29
Implemented construction of csc_matrix by column indicies in predict_ovr
hamsal Jun 19, 2014
7c51b1a
Revised formating and indentations
hamsal Jun 24, 2014
bba115a
Revised test_ovr_fit_predict_sparse to ensure identical results from …
hamsal Jun 24, 2014
5343138
Revised predict_ovr to loop over estimators in the multiclass case
hamsal Jun 24, 2014
b3860cd
Removed blank line
hamsal Jun 24, 2014
b492030
Defaulted label binarizer to set sparse_output=True when training ovr…
hamsal Jun 26, 2014
9a3c831
Revised predict_ovr to work with non integer labels
hamsal Jun 27, 2014
312e108
Fixed type, sp.issparse(y) -> True
hamsal Jun 27, 2014
ee4a715
Attempt to avoid a sparse effecieny warning by not converting a csc m…
hamsal Jun 27, 2014
b2d0f1e
Revised csc sparse matrix case in label_binarize in attempt to avoid …
hamsal Jun 27, 2014
8e2f9a2
Cast sparse array to csc in fit_ovr and wrote a get_col_ helper, remo…
hamsal Jun 28, 2014
4f82b66
Implemented tests for predict_proba and decison_function with a class…
hamsal Jun 30, 2014
0ae6dec
Restarting OrthogonalMatchingPursuitCV failure
hamsal Jun 30, 2014
6e5c3ae
Measured len of X in predict_ovr to allow for sparse data
hamsal Jul 1, 2014
6b9b53e
Revised column stacking in fit_ovr to be a generator expression
hamsal Jul 1, 2014
b35671a
Restartinig travis MD5 sums mismatch
hamsal Jul 2, 2014
c766dd6
Tested label binarizer with a sparse_output=True binary case
hamsal Jul 2, 2014
55cea43
len_X -> n_samples
hamsal Jul 5, 2014
986b43b
swithced to a dense label binarizer in the three class multiclass case
hamsal Jul 7, 2014
2454009
Corrected True-> False back from trials
hamsal Jul 7, 2014
86fa719
Corrected multiclass conditional
hamsal Jul 7, 2014
350ccc3
Documentation on working of _get_col
hamsal Jul 7, 2014
9a7635f
_get_col => getcol, formating revisions in fit_ovr
hamsal Jul 8, 2014
6805704
Removed special handling for multiclass case in fit_ovr
hamsal Jul 8, 2014
63c3b58
Supressed SparseEfficienyWarning by writing ignore_warning_class helper
hamsal Jul 10, 2014
1dd6e95
Revised ignore_warning_class to wrapp the function in a way to allow …
hamsal Jul 10, 2014
ebdae52
ignore_warning_class Documentation /travis rebuild
hamsal Jul 10, 2014
6a50b82
Implemeneted search of all warnings raised in assert_warns
hamsal Jul 11, 2014
99dfe1b
Fixed assert_warns call
hamsal Jul 11, 2014
ec0558c
Restart OrthogonalMatchingPursuitCV failure
hamsal Jul 11, 2014
01a4cb7
Cleaned unsused additions from multiclass.py
hamsal Jul 11, 2014
91d9354
Updated documentation in multiclass.py
hamsal Jul 14, 2014
104ee77
Rewrote expression for found in assert_warms idiomatically
hamsal Jul 14, 2014
b31e9b6
Edited mention of 2d array output to also include possiblity of spars…
hamsal Jul 14, 2014
37d7b19
Removed extra blank lines from assert_warns
hamsal Jul 14, 2014
a20cf27
A collection of small changes, Commented sparse_output = True desicio…
hamsal Jul 15, 2014
2c15f1b
Undo documentation edits to OvO fit and predict
hamsal Jul 15, 2014
09f4b25
Document the y_type_ attribute of LabelBinarizer
hamsal Jul 15, 2014
17407a8
Comment fit_ovr and predict_ovr as public functions
hamsal Jul 15, 2014
a3b909a
Removed make_mlb rename in test_ovr_fit_predict_sparse
hamsal Jul 15, 2014
911bff2
Use lb.classes_ in fit_ovr to maintain class dtype
hamsal Jul 15, 2014
80f57f2
Comment j_jobs > 1 in fit_orv
hamsal Jul 16, 2014
dbc67af
LabelBinarizer and ovr fail multioutput, test binary ovr
hamsal Jul 16, 2014
26d63c3
Fit binary target data on one line
hamsal Jul 17, 2014
855fdd1
Fix typo individual
hamsal Jul 17, 2014
e1dc470
Untab overindented line in predict docstring
hamsal Jul 17, 2014
3 changes: 2 additions & 1 deletion doc/modules/multiclass.rst
@@ -33,7 +33,8 @@ by decomposing such problems into binary classification problems.
several joint classification tasks. This is a generalization
of the multi-label classification task, where the set of classification
problem is restricted to binary classification, and of the multi-class
classification task. *The output format is a 2d numpy array.*
classification task. *The output format is a 2d numpy array or sparse
matrix.*

The set of labels can be different for each output variable.
For instance a sample could be assigned "pear" for an output variable that
111 changes: 94 additions & 17 deletions sklearn/multiclass.py
@@ -32,14 +32,19 @@
#
# License: BSD 3 clause

import array
import numpy as np
import warnings
import scipy.sparse as sp

from .base import BaseEstimator, ClassifierMixin, clone, is_classifier
from .base import MetaEstimatorMixin
from .preprocessing import LabelBinarizer
from .metrics.pairwise import euclidean_distances
from .utils import check_random_state
from .utils.multiclass import type_of_target
from .utils.multiclass import unique_labels
from .utils.validation import _num_samples
from .externals.joblib import Parallel
from .externals.joblib import delayed

@@ -81,24 +86,96 @@ def _check_estimator(estimator):


def fit_ovr(estimator, X, y, n_jobs=1):
"""Fit a one-vs-the-rest strategy."""
_check_estimator(estimator)
"""Fit a list of estimators using a one-vs-the-rest strategy.

lb = LabelBinarizer()
Y = lb.fit_transform(y)
Parameters
----------
estimator : estimator object
An estimator object implementing `fit` and one of `decision_function`
or `predict_proba`.

estimators = Parallel(n_jobs=n_jobs)(
delayed(_fit_binary)(estimator, X, Y[:, i], classes=["not %s" % i, i])
for i in range(Y.shape[1]))
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
Data.

y : {array-like, sparse matrix}, shape = [n_samples] or
[n_samples, n_classes] Multi-class targets. An indicator matrix
turns on multilabel classification.

Returns
-------
estimators, lb : list of fitted binary estimators, and the fitted
LabelBinarizer used to transform the multiclass targets.
"""
_check_estimator(estimator)
# A sparse LabelBinarizer, with sparse_output=True, has been shown to
# outperform or match a dense label binarizer in all cases and has also
# resulted in less than or equal memory consumption in the fit_ovr
# function overall.
lb = LabelBinarizer(sparse_output=True)
Member:

Given the amount of thought and benchmarking efforts that went into making this decision, I think it's worth to at least explain the empirical results in a comment.

Contributor Author:

I have included a comment summarizing the benefits

Y = lb.fit_transform(y)
Y = Y.tocsc()
columns = (col.toarray().ravel() for col in Y.T)
# In cases where individual estimators are very fast to train, setting
# n_jobs > 1 can result in slower performance due to the overhead of
# spawning workers.
estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary)
Member:

Please add a comment (or a sentence in the docs) stating that n_jobs > 1 can be slower than n_jobs == 1 when the individual binary classifiers are very fast to fit (as in the case in @arjoly's benchmark.)

You can add a comment in the source referencing this joblib issue.

(estimator,
X,
column,
classes=["not %s" % i,
lb.classes_[i]])
for i, column in enumerate(columns))
return estimators, lb
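The binarize-then-iterate pattern in fit_ovr can be sketched without scikit-learn. The sparse indicator below is built by hand and stands in for what `LabelBinarizer(sparse_output=True).fit_transform(y)` would return; each densified column is the binary target vector for one "class i vs. rest" subproblem.

```python
import numpy as np
import scipy.sparse as sp

y = np.array([0, 2, 1, 2])          # multiclass targets, 3 classes
n_samples, n_classes = len(y), 3

# Hand-built sparse one-hot indicator in CSC format, standing in for
# LabelBinarizer(sparse_output=True).fit_transform(y).tocsc()
Y = sp.csc_matrix((np.ones(n_samples, dtype=int),
                   (np.arange(n_samples), y)),
                  shape=(n_samples, n_classes))

# Materialize one dense column at a time instead of densifying Y whole;
# this keeps peak memory proportional to a single column.
columns = (Y.getcol(i).toarray().ravel() for i in range(n_classes))
binary_targets = [col for col in columns]
```

Each entry of `binary_targets` would be passed to one `_fit_binary` call in the real function.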


def predict_ovr(estimators, label_binarizer, X):
Member:

This function assumes that estimators comes from our fit_ovr; specifically, that all estimators in the list are of the same type (at least w.r.t. the threshold.) Since it is a public function, there's no telling what users might do; like learn SVMs for the first 5 classes and random forests for the other 5.

EDIT: ways to solve this:

  1. document
  2. document + raise exception if estimators is heterogeneous
  3. document + have a list of thresholds

Member:

or 4) deprecate and make them private...

Member:

I say (1) or perhaps (2)

Contributor Author:

I implemented suggestion number 2 and included a test for it.

"""Make predictions using the one-vs-the-rest strategy."""
Y = np.array([_predict_binary(e, X) for e in estimators])
"""Predict multi-class targets using the one vs rest strategy.

Member:

Please remove this blank line.

Parameters
----------
estimators : list of `n_classes` estimators
Estimators used for predictions. The list must be homogeneous with
respect to the type of the estimators. fit_ovr supplies this list as
part of its output.

label_binarizer : LabelBinarizer object
Object used to transform multiclass labels to binary labels and
vice-versa. fit_ovr supplies this object as part of its output.

X : {array-like, sparse matrix}, shape = [n_samples, n_features]
Data.

Returns
-------
y : {array-like, sparse matrix}, shape = [n_samples] or
[n_samples, n_classes]. Predicted multi-class targets.
"""
e_types = set([type(e) for e in estimators if not
isinstance(e, _ConstantPredictor)])
if len(e_types) > 1:
raise ValueError("List of estimators must contain estimators of the"
" same type but contains types {0}".format(e_types))
e = estimators[0]
thresh = 0 if hasattr(e, "decision_function") and is_classifier(e) else .5
return label_binarizer.inverse_transform(Y.T, threshold=thresh)

if label_binarizer.y_type_ == "multiclass":
Member:

Can y_type_ be binary, continuous or something else? Does the else: branch stay correct in all these cases?

EDIT: I think the cases to consider are "binary" and "multiclass-multioutput". Could you please check that these cases are covered by tests?

Contributor Author:

I don't see the need for binary data with One vs. Rest since it is not multiclass and would be the same thing as fitting a regular estimator. I am also not sure "multiclass-multioutput" would work since it does not fit with the scheme of One vs. Rest and there is no way to binarize it.

Member:

I'm not saying there is need for it. I'm asking what happens if a user
tries it.
On Jul 15, 2014 9:58 PM, "hamsal" [email protected] wrote:

In sklearn/multiclass.py:

 e = estimators[0]
 thresh = 0 if hasattr(e, "decision_function") and is_classifier(e) else .5
  • return label_binarizer.inverse_transform(Y.T, threshold=thresh)
  • if label_binarizer.y_type_ == "multiclass":

I don't see the need for binary data with One vs. Rest since it is not
multiclass and would be the same thing as fitting a regular estimator. I am
also not sure "multiclass-multioutput" would work since it does not fit
with the scheme of One vs. Rest and there is no way to binarize it.


Reply to this email directly or view it on GitHub
https://github.com/scikit-learn/scikit-learn/pull/3276/files#r14959192.

Contributor Author:

I have raised a ValueError in LabelBinarizer when multioutput data is used, because this is where the issue lies. I wrote tests for OvR and the LabelBinarizer to assert these errors, and finally I included a binary test case for OvR which demonstrates correctness.

maxima = np.empty(X.shape[0], dtype=float)
maxima.fill(-np.inf)
argmaxima = np.zeros(X.shape[0], dtype=int)
for i, e in enumerate(estimators):
pred = _predict_binary(e, X)
np.maximum(maxima, pred, out=maxima)
argmaxima[maxima == pred] = i
return label_binarizer.classes_[np.array(argmaxima.T)]
else:
n_samples = _num_samples(X)
indices = array.array('i')
indptr = array.array('i', [0])
for e in estimators:
indices.extend(np.where(_predict_binary(e, X) > thresh)[0])
indptr.append(len(indices))
data = np.ones(len(indices), dtype=int)
indicator = sp.csc_matrix((data, indices, indptr),
shape=(n_samples, len(estimators)))
return label_binarizer.inverse_transform(indicator)
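Both prediction branches of predict_ovr can be traced with made-up scores; the arrays below are stand-ins for the values `_predict_binary` would return for each fitted estimator.

```python
import array

import numpy as np
import scipy.sparse as sp

# Made-up per-class scores for 4 samples and 3 binary estimators.
scores = [np.array([0.9, 0.1, 0.8, 0.2]),   # class 0 vs. rest
          np.array([0.2, 0.7, 0.6, 0.1]),   # class 1 vs. rest
          np.array([0.1, 0.2, 0.3, 0.9])]   # class 2 vs. rest
n_samples, thresh = 4, 0.5
classes = np.array([0, 1, 2])

# Multiclass branch: keep a running elementwise maximum of the scores,
# recording which estimator last achieved it for each sample.
maxima = np.full(n_samples, -np.inf)
argmaxima = np.zeros(n_samples, dtype=int)
for i, pred in enumerate(scores):
    np.maximum(maxima, pred, out=maxima)
    argmaxima[maxima == pred] = i
multiclass_pred = classes[argmaxima]

# Multilabel branch: build a CSC indicator column by column from flat
# `indices`/`indptr` arrays, never densifying anything.
indices = array.array('i')
indptr = array.array('i', [0])
for pred in scores:
    indices.extend(np.where(pred > thresh)[0])
    indptr.append(len(indices))
data = np.ones(len(indices), dtype=int)
indicator = sp.csc_matrix((data, indices, indptr),
                          shape=(n_samples, len(scores)))
```

In the real function, `multiclass_pred` corresponds to the values returned via `label_binarizer.classes_`, and `indicator` is what gets handed to `label_binarizer.inverse_transform`.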


def predict_proba_ovr(estimators, X, is_multilabel):
@@ -190,9 +267,9 @@ def fit(self, X, y):
X : {array-like, sparse matrix}, shape = [n_samples, n_features]
Data.

y : array-like, shape = [n_samples] or [n_samples, n_classes]
Multi-class targets. An indicator matrix turns on multilabel
classification.
y : {array-like, sparse matrix}, shape = [n_samples] or
[n_samples, n_classes] Multi-class targets. An indicator matrix
Member:

y : {array-like, sparse matrix}, shape = [n_samples] or [n_samples, n_classes]

Could it be one line?

Contributor Author:

This extends over the line limit

Member:

Does it render well in the doc?

Contributor Author:

I am working on building the doc, but I am getting an ImportError: No module named sklearn.externals.six after running make html. I build scikit-learn with python setup.py build_ext --inplace and then add the directory to my PYTHONPATH. I am still trying to figure out the problem.

Member:

What happens with make doc?

Member:

This rule is in the scikit-learn folder.

Contributor Author:

Ok, I have gotten it to start building. Apparently I had not run python setup.py install in the scikit-learn folder before.

Contributor Author:

It looks bad in the documentation: [n_samples, n_classes] ends up in the body. Maybe it is better to move the entire statement shape = [n_samples] or [n_samples, n_classes] into the body?

Member:

shape = [n_samples] or [n_samples, n_classes] into the body?

Usually it is put in the header.

Contributor Author:

Although it is not entirely precise, another solution could be to shorten shape = [n_samples] or [n_samples, n_classes] to shape = [n_samples, n_classes].

turns on multilabel classification.

Returns
-------
@@ -216,8 +293,8 @@ def predict(self, X):

Returns
-------
y : array-like, shape = [n_samples]
Predicted multi-class targets.
y : {array-like, sparse matrix}, shape = [n_samples] or
[n_samples, n_classes]. Predicted multi-class targets.
"""
self._check_is_fitted()

@@ -242,7 +319,7 @@ def predict_proba(self, X):

Returns
-------
T : array-like, shape = [n_samples, n_classes]
T : {array-like, sparse matrix}, shape = [n_samples, n_classes]
Returns the probability of the sample for each class in the model,
where classes are ordered as they are in `self.classes_`.
"""
@@ -271,7 +348,7 @@ def decision_function(self, X):
@property
def multilabel_(self):
"""Whether this is a multilabel classifier"""
return self.label_binarizer_.multilabel_
return self.label_binarizer_.y_type_.startswith('multilabel')
Member:

The y_type_ attribute of LabelBinarizer is not documented, could you please add it to the Attributes section?

Contributor Author:

I have made the entry in the Attributes section


def score(self, X, y):
if self.multilabel_:
23 changes: 22 additions & 1 deletion sklearn/preprocessing/label.py
@@ -201,6 +201,13 @@ class LabelBinarizer(BaseEstimator, TransformerMixin):
`classes_` : array of shape [n_class]
Holds the label for each class.

`y_type_` : str
Represents the type of the target data as evaluated by
utils.multiclass.type_of_target. Possible types are 'continuous',
'continuous-multioutput', 'binary', 'multiclass',
'multiclass-multioutput', 'multilabel-sequences',
'multilabel-indicator', and 'unknown'.

`multilabel_` : boolean
True if the transformer was fitted on a multilabel rather than a
multiclass set of labels. The multilabel_ attribute is deprecated
@@ -301,6 +308,10 @@ def fit(self, y):
self : returns an instance of self.
"""
self.y_type_ = type_of_target(y)
if 'multioutput' in self.y_type_:
raise ValueError("Multioutput target data is not supported with "
"label binarization")

self.sparse_input_ = sp.issparse(y)
self.classes_ = unique_labels(y)
return self
@@ -462,6 +473,9 @@ def label_binarize(y, classes, neg_label=0, pos_label=1,
pos_label = -neg_label

y_type = type_of_target(y)
if 'multioutput' in y_type:
raise ValueError("Multioutput target data is not supported with label "
"binarization")

n_samples = y.shape[0] if sp.issparse(y) else len(y)
n_classes = len(classes)
@@ -517,14 +531,19 @@ def label_binarize(y, classes, neg_label=0, pos_label=1,

if pos_switch:
Y[Y == pos_label] = 0
else:
Y.data = astype(Y.data, int, copy=False)

# preserve label ordering
if np.any(classes != sorted_class):
indices = np.argsort(classes)
Y = Y[:, indices]

if y_type == "binary":
Y = Y[:, -1].reshape((-1, 1))
if sparse_output:
Y = Y.getcol(-1)
Member:

Why is this interesting?

Contributor Author:

I found, from setting lb = LabelBinarizer(sparse_output=True), that this case was previously unsupported and would throw an ambiguous error, so I thought it best to support it instead of creating a tailored error message.

else:
Y = Y[:, -1].reshape((-1, 1))

return Y
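A minimal check of the binary-case branch, assuming only scipy: `Y.getcol(-1)` keeps the result as a sparse column matrix, while the dense branch slices and reshapes, and the two agree elementwise. The toy matrix below is made up.

```python
import numpy as np
import scipy.sparse as sp

# Toy two-column binarization (columns for the negative and positive class).
Y = sp.csc_matrix(np.array([[1, 0],
                            [0, 1],
                            [1, 0]]))

sparse_col = Y.getcol(-1)                       # sparse, shape (3, 1)
dense_col = Y.toarray()[:, -1].reshape((-1, 1))  # dense equivalent

assert (sparse_col.toarray() == dense_col).all()
```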

@@ -600,6 +619,8 @@ def _inverse_binarize_thresholding(y, output_type, classes, threshold):

# Inverse transform data
if output_type == "binary":
if sp.issparse(y):
y = y.toarray()
if y.ndim == 2 and y.shape[1] == 2:
return classes[y[:, 1]]
else:
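The densify-then-index step in the binary branch of _inverse_binarize_thresholding can be verified with a toy indicator; `classes` and `Y` below are made up.

```python
import numpy as np
import scipy.sparse as sp

classes = np.array(['neg', 'pos'])
Y = sp.csr_matrix(np.array([[0, 1],
                            [1, 0],
                            [0, 1]]))

# As in the patch: densify sparse input before fancy indexing, then
# recover labels from column 1, which holds the positive class.
y = Y.toarray() if sp.issparse(Y) else Y
labels = classes[y[:, 1]]
```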
14 changes: 14 additions & 0 deletions sklearn/preprocessing/tests/test_label.py
@@ -194,6 +194,11 @@ def test_label_binarizer_errors():
y=np.array([[1, 2, 3], [2, 1, 3]]), output_type="binary",
classes=[1, 2, 3], threshold=0)

# Fail on multioutput data
assert_raises(ValueError, LabelBinarizer().fit, np.array([[1, 3], [2, 1]]))
assert_raises(ValueError, label_binarize, np.array([[1, 3], [2, 1]]),
[1, 2, 3])


def test_label_encoder():
"""Test LabelEncoder's transform and inverse_transform methods"""
@@ -467,6 +472,15 @@ def test_label_binarize_binary():

yield check_binarized_results, y, classes, pos_label, neg_label, expected

# Binary case where sparse_output = True will not result in a ValueError
y = [0, 1, 0]
classes = [0, 1]
pos_label = 3
neg_label = 0
expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1))

yield check_binarized_results, y, classes, pos_label, neg_label, expected
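The expected array in the new binary test case can be reproduced by hand: with pos_label=3 and neg_label=0, the two-column binarization is collapsed to its last column, which corresponds to the positive class.

```python
import numpy as np

# Two-column binarization of y = [0, 1, 0] with pos_label=3, neg_label=0:
# column 0 marks class 0, column 1 marks class 1.
two_col = np.array([[3, 0],
                    [0, 3],
                    [3, 0]])

# Binary case keeps only the last column, reshaped to (n_samples, 1).
expected = two_col[:, -1].reshape((-1, 1))
```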


def test_label_binarize_multiclass():
y = [0, 1, 2]