-
-
Notifications
You must be signed in to change notification settings - Fork 26k
[MRG+1] Sparse One vs. Rest #3276
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a4fe5b0
8dbae82
4662b95
0771ba0
2a5ebfb
1275ce5
c668fc7
94d7d04
63aab29
7c51b1a
bba115a
5343138
b3860cd
b492030
9a3c831
312e108
ee4a715
b2d0f1e
8e2f9a2
4f82b66
0ae6dec
6e5c3ae
6b9b53e
b35671a
c766dd6
55cea43
986b43b
2454009
86fa719
350ccc3
9a7635f
6805704
63c3b58
1dd6e95
ebdae52
6a50b82
99dfe1b
ec0558c
01a4cb7
91d9354
104ee77
b31e9b6
37d7b19
a20cf27
2c15f1b
09f4b25
17407a8
a3b909a
911bff2
80f57f2
dbc67af
26d63c3
855fdd1
e1dc470
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,14 +32,19 @@ | |
# | ||
# License: BSD 3 clause | ||
|
||
import array | ||
import numpy as np | ||
import warnings | ||
import scipy.sparse as sp | ||
|
||
from .base import BaseEstimator, ClassifierMixin, clone, is_classifier | ||
from .base import MetaEstimatorMixin | ||
from .preprocessing import LabelBinarizer | ||
from .metrics.pairwise import euclidean_distances | ||
from .utils import check_random_state | ||
from .utils.multiclass import type_of_target | ||
from .utils.multiclass import unique_labels | ||
from .utils.validation import _num_samples | ||
from .externals.joblib import Parallel | ||
from .externals.joblib import delayed | ||
|
||
|
@@ -81,24 +86,96 @@ def _check_estimator(estimator): | |
|
||
|
||
def fit_ovr(estimator, X, y, n_jobs=1): | ||
"""Fit a one-vs-the-rest strategy.""" | ||
_check_estimator(estimator) | ||
"""Fit a list of estimators using a one-vs-the-rest strategy. | ||
|
||
lb = LabelBinarizer() | ||
Y = lb.fit_transform(y) | ||
Parameters | ||
---------- | ||
estimator : estimator object | ||
An estimator object implementing `fit` and one of `decision_function` | ||
or `predict_proba`. | ||
|
||
estimators = Parallel(n_jobs=n_jobs)( | ||
delayed(_fit_binary)(estimator, X, Y[:, i], classes=["not %s" % i, i]) | ||
for i in range(Y.shape[1])) | ||
X : {array-like, sparse matrix}, shape = [n_samples, n_features] | ||
Data. | ||
|
||
y : {array-like, sparse matrix}, shape = [n_samples] or | ||
[n_samples, n_classes] Multi-class targets. An indicator matrix | ||
turns on multilabel classification. | ||
|
||
Returns | ||
------- | ||
self | ||
""" | ||
_check_estimator(estimator) | ||
# A sparse LabelBinarizer, with sparse_output=True, has been shown to | ||
# outperform or match a dense label binarizer in all cases and has also | ||
# resulted in less or equal memory consumption in the fit_ovr function | ||
# overall. | ||
lb = LabelBinarizer(sparse_output=True) | ||
Y = lb.fit_transform(y) | ||
Y = Y.tocsc() | ||
columns = (col.toarray().ravel() for col in Y.T) | ||
# In cases where individual estimators are very fast to train setting | ||
# n_jobs > 1 can result in slower performance due to the overhead | ||
# of spawning threads. | ||
estimators = Parallel(n_jobs=n_jobs)(delayed(_fit_binary) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add a comment (or a sentence in the docs) stating that You can add a comment in the source referencing this joblib issue. |
||
(estimator, | ||
X, | ||
column, | ||
classes=["not %s" % i, | ||
lb.classes_[i]]) | ||
for i, column in enumerate(columns)) | ||
return estimators, lb | ||
|
||
|
||
def predict_ovr(estimators, label_binarizer, X): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function assumes that EDIT: ways to solve this:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or 4) deprecate and make them private... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I say (1) or perhaps (2) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I implemented suggestion number 2 and included a test for it. |
||
"""Make predictions using the one-vs-the-rest strategy.""" | ||
Y = np.array([_predict_binary(e, X) for e in estimators]) | ||
"""Predict multi-class targets using the one vs rest strategy. | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please remove this blank line. |
||
Parameters | ||
---------- | ||
estimators : list of `n_classes` estimators, Estimators used for | ||
predictions. The list must be homogeneous with respect to the type of | ||
estimators. fit_ovr supplies this list as part of its output. | ||
|
||
label_binarizer : LabelBinarizer object, Object used to transform | ||
multiclass labels to binary labels and vice-versa. fit_ovr supplies | ||
this object as part of its output. | ||
|
||
X : {array-like, sparse matrix}, shape = [n_samples, n_features] | ||
Data. | ||
|
||
Returns | ||
------- | ||
y : {array-like, sparse matrix}, shape = [n_samples] or | ||
[n_samples, n_classes]. Predicted multi-class targets. | ||
""" | ||
e_types = set([type(e) for e in estimators if not | ||
isinstance(e, _ConstantPredictor)]) | ||
if len(e_types) > 1: | ||
raise ValueError("List of estimators must contain estimators of the" | ||
" same type but contains types {0}".format(e_types)) | ||
e = estimators[0] | ||
thresh = 0 if hasattr(e, "decision_function") and is_classifier(e) else .5 | ||
return label_binarizer.inverse_transform(Y.T, threshold=thresh) | ||
|
||
if label_binarizer.y_type_ == "multiclass": | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can EDIT: I think the cases to consider are "binary" and "multiclass-multioutput". Could you please check that these cases are covered by tests? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see the need for binary data with One vs. Rest since it is not multiclass and would be the same thing as fitting a regular estimator. I am also not sure "multiclass-multioutput" would work since it does not fit with the scheme of One vs. Rest and there is no way to binarize it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not saying there is need for it. I'm asking what happens if a user
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have raised a |
||
maxima = np.empty(X.shape[0], dtype=float) | ||
maxima.fill(-np.inf) | ||
argmaxima = np.zeros(X.shape[0], dtype=int) | ||
for i, e in enumerate(estimators): | ||
pred = _predict_binary(e, X) | ||
np.maximum(maxima, pred, out=maxima) | ||
argmaxima[maxima == pred] = i | ||
return label_binarizer.classes_[np.array(argmaxima.T)] | ||
else: | ||
n_samples = _num_samples(X) | ||
indices = array.array('i') | ||
indptr = array.array('i', [0]) | ||
for e in estimators: | ||
indices.extend(np.where(_predict_binary(e, X) > thresh)[0]) | ||
indptr.append(len(indices)) | ||
data = np.ones(len(indices), dtype=int) | ||
indicator = sp.csc_matrix((data, indices, indptr), | ||
shape=(n_samples, len(estimators))) | ||
return label_binarizer.inverse_transform(indicator) | ||
|
||
|
||
def predict_proba_ovr(estimators, X, is_multilabel): | ||
|
@@ -190,9 +267,9 @@ def fit(self, X, y): | |
X : {array-like, sparse matrix}, shape = [n_samples, n_features] | ||
Data. | ||
|
||
y : array-like, shape = [n_samples] or [n_samples, n_classes] | ||
Multi-class targets. An indicator matrix turns on multilabel | ||
classification. | ||
y : {array-like, sparse matrix}, shape = [n_samples] or | ||
[n_samples, n_classes] Multi-class targets. An indicator matrix | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Could it be one line? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This extends over the line limit There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does it render well in the doc? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am working on building the doc, I am getting errors importing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What happens with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This rule is in the scikit-learn folder. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok I have gotten it to start building. I apparently have not run There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks bad in the documentation There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Usually it is put in the header. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Althoug it is not entirely precise another solution could be to shorten |
||
turns on multilabel classification. | ||
|
||
Returns | ||
------- | ||
|
@@ -216,8 +293,8 @@ def predict(self, X): | |
|
||
Returns | ||
------- | ||
y : array-like, shape = [n_samples] | ||
Predicted multi-class targets. | ||
y : {array-like, sparse matrix}, shape = [n_samples] or | ||
[n_samples, n_classes]. Predicted multi-class targets. | ||
""" | ||
self._check_is_fitted() | ||
|
||
|
@@ -242,7 +319,7 @@ def predict_proba(self, X): | |
|
||
Returns | ||
------- | ||
T : array-like, shape = [n_samples, n_classes] | ||
T : {array-like, sparse matrix}, shape = [n_samples, n_classes] | ||
Returns the probability of the sample for each class in the model, | ||
where classes are ordered as they are in `self.classes_`. | ||
""" | ||
|
@@ -271,7 +348,7 @@ def decision_function(self, X): | |
@property | ||
def multilabel_(self): | ||
"""Whether this is a multilabel classifier""" | ||
return self.label_binarizer_.multilabel_ | ||
return self.label_binarizer_.y_type_.startswith('multilabel') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have made the entry in the Attributes section |
||
|
||
def score(self, X, y): | ||
if self.multilabel_: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -201,6 +201,13 @@ class LabelBinarizer(BaseEstimator, TransformerMixin): | |
`classes_` : array of shape [n_class] | ||
Holds the label for each class. | ||
|
||
`y_type_` : str, | ||
Represents the type of the target data as evaluated by | ||
utils.multiclass.type_of_target. Possible types are 'continuous', | ||
'continuous-multioutput', 'binary', 'multiclass', | ||
'multiclass-multioutput', 'multilabel-sequences', | ||
'multilabel-indicator', and 'unknown'. | ||
|
||
`multilabel_` : boolean | ||
True if the transformer was fitted on a multilabel rather than a | ||
multiclass set of labels. The multilabel_ attribute is deprecated | ||
|
@@ -301,6 +308,10 @@ def fit(self, y): | |
self : returns an instance of self. | ||
""" | ||
self.y_type_ = type_of_target(y) | ||
if 'multioutput' in self.y_type_: | ||
raise ValueError("Multioutput target data is not supported with " | ||
"label binarization") | ||
|
||
self.sparse_input_ = sp.issparse(y) | ||
self.classes_ = unique_labels(y) | ||
return self | ||
|
@@ -462,6 +473,9 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, | |
pos_label = -neg_label | ||
|
||
y_type = type_of_target(y) | ||
if 'multioutput' in y_type: | ||
raise ValueError("Multioutput target data is not supported with label " | ||
"binarization") | ||
|
||
n_samples = y.shape[0] if sp.issparse(y) else len(y) | ||
n_classes = len(classes) | ||
|
@@ -517,14 +531,19 @@ def label_binarize(y, classes, neg_label=0, pos_label=1, | |
|
||
if pos_switch: | ||
Y[Y == pos_label] = 0 | ||
else: | ||
Y.data = astype(Y.data, int, copy=False) | ||
|
||
# preserve label ordering | ||
if np.any(classes != sorted_class): | ||
indices = np.argsort(classes) | ||
Y = Y[:, indices] | ||
|
||
if y_type == "binary": | ||
Y = Y[:, -1].reshape((-1, 1)) | ||
if sparse_output: | ||
Y = Y.getcol(-1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this interesting? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I found from setting |
||
else: | ||
Y = Y[:, -1].reshape((-1, 1)) | ||
|
||
return Y | ||
|
||
|
@@ -600,6 +619,8 @@ def _inverse_binarize_thresholding(y, output_type, classes, threshold): | |
|
||
# Inverse transform data | ||
if output_type == "binary": | ||
if sp.issparse(y): | ||
y = y.toarray() | ||
if y.ndim == 2 and y.shape[1] == 2: | ||
return classes[y[:, 1]] | ||
else: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given the amount of thought and benchmarking efforts that went into making this decision, I think it's worth to at least explain the empirical results in a comment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have included a comment summarizing the benefits