From f76f6895c318779c1b5abb43e91cad4da0572361 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 31 Aug 2016 17:41:35 -0400 Subject: [PATCH 1/5] refer users to the other encoders to do one hot encoding for labels. --- sklearn/preprocessing/data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index e7f242cdedc5d..b7dc60c0dd482 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1810,6 +1810,10 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): dictionary items (also handles string-valued features). sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot encoding of dictionary items or strings. + sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all + fashion. + sklearn.preprocessing.LabelEncoder : encodes labels with values between 0 + and n_classes-1. """ def __init__(self, n_values="auto", categorical_features="all", dtype=np.float64, sparse=True, handle_unknown='error'): From c7b0ea90d3291d74d8069cea3ce84488a2a4c7c1 Mon Sep 17 00:00:00 2001 From: Kathy Date: Thu, 1 Sep 2016 09:32:30 -0400 Subject: [PATCH 2/5] added to the 'see more' for labelbinarizer, multilabelbinarizer, and labelencoder' as well as an example to multilabel binarizer --- sklearn/preprocessing/data.py | 3 +++ sklearn/preprocessing/label.py | 21 +++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index b7dc60c0dd482..891dbcc98cd23 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1812,6 +1812,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): encoding of dictionary items or strings. sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all fashion. + sklearn.preprocessing.MultiLabelBinarizer : transforms between iterable of + iterables and a multilabel format, e.g. 
a (samples x classes) binary + matrix indicating the presence of a class label. sklearn.preprocessing.LabelEncoder : encodes labels with values between 0 and n_classes-1. """ diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index e571d3f44be7f..590565e5bfd83 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -91,6 +91,10 @@ class LabelEncoder(BaseEstimator, TransformerMixin): >>> list(le.inverse_transform([2, 2, 1])) ['tokyo', 'tokyo', 'paris'] + See also + -------- + sklearn.preprocessing.OneHotEncoder : encode categorical integer features + using a one-hot aka one-of-K scheme. """ def fit(self, y): @@ -257,6 +261,8 @@ class LabelBinarizer(BaseEstimator, TransformerMixin): -------- label_binarize : function to perform the transform operation of LabelBinarizer with fixed classes. + sklearn.preprocessing.OneHotEncoder : encode categorical integer features + using a one-hot aka one-of-K scheme. """ def __init__(self, neg_label=0, pos_label=1, sparse_output=False): @@ -648,6 +654,7 @@ class MultiLabelBinarizer(BaseEstimator, TransformerMixin): Examples -------- + >>> from sklearn.preprocessing import MultiLabelBinarizer >>> mlb = MultiLabelBinarizer() >>> mlb.fit_transform([(1, 2), (3,)]) array([[1, 1, 0], @@ -661,6 +668,20 @@ class MultiLabelBinarizer(BaseEstimator, TransformerMixin): >>> list(mlb.classes_) ['comedy', 'sci-fi', 'thriller'] + Perform a one hot encoding for y labels + + >>> import numpy as np + >>> y = np.array([0, 1, 1, 0]) + >>> mlb.fit_transform(y.reshape(-1, 1)) + array([[1, 0], + [0, 1], + [0, 1], + [1, 0]]) + + See also + -------- + sklearn.preprocessing.OneHotEncoder : encode categorical integer features + using a one-hot aka one-of-K scheme. 
""" def __init__(self, classes=None, sparse_output=False): self.classes = classes From e657c7931aec1dd715cfcf48db1c5a86fa60ac23 Mon Sep 17 00:00:00 2001 From: Kathy Date: Thu, 1 Sep 2016 12:10:51 -0400 Subject: [PATCH 3/5] added note about y labels to the OneHotEncoder docstring --- sklearn/preprocessing/data.py | 3 +++ sklearn/preprocessing/label.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 891dbcc98cd23..8ac581d4eac63 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1737,6 +1737,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. + Note: a one-hot encoding of y labels should use a MultiLabelBinarizer + instead. + Read more in the :ref:`User Guide <preprocessing_categorical_features>`. Parameters diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 590565e5bfd83..21794c70ef5b6 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -668,7 +668,7 @@ class MultiLabelBinarizer(BaseEstimator, TransformerMixin): >>> list(mlb.classes_) ['comedy', 'sci-fi', 'thriller'] - Perform a one hot encoding for y labels + Perform a one-hot encoding for y labels >>> import numpy as np >>> y = np.array([0, 1, 1, 0]) From b27cf06c80b2e012aa283b91ed856d6244850752 Mon Sep 17 00:00:00 2001 From: kchen17 Date: Sat, 17 Sep 2016 11:46:54 -0400 Subject: [PATCH 4/5] removed example from MultiLabelBinarizer --- sklearn/preprocessing/label.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py index 21794c70ef5b6..7a391b3f60b19 100644 --- a/sklearn/preprocessing/label.py +++ b/sklearn/preprocessing/label.py @@ -668,16 +668,6 @@ class MultiLabelBinarizer(BaseEstimator, TransformerMixin): >>> list(mlb.classes_) ['comedy', 
'sci-fi', 'thriller'] - Perform a one-hot encoding for y labels - - >>> import numpy as np - >>> y = np.array([0, 1, 1, 0]) - >>> mlb.fit_transform(y.reshape(-1, 1)) - array([[1, 0], - [0, 1], - [0, 1], - [1, 0]]) - See also -------- sklearn.preprocessing.OneHotEncoder : encode categorical integer features From 52bc72e6cd8642cdf7d23ef218f9c8fb0d6f661a Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 21 Sep 2016 12:13:05 -0400 Subject: [PATCH 5/5] documentation should specify LabelBinarizer, not MultiLabelBinarizer in OHE --- sklearn/preprocessing/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 8ac581d4eac63..08be1c75d0f49 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1737,7 +1737,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. - Note: a one-hot encoding of y labels should use a MultiLabelBinarizer + Note: a one-hot encoding of y labels should use a LabelBinarizer instead. Read more in the :ref:`User Guide <preprocessing_categorical_features>`.