diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index e7f242cdedc5d..08be1c75d0f49 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -1737,6 +1737,9 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
     This encoding is needed for feeding categorical data to many scikit-learn
     estimators, notably linear models and SVMs with the standard kernels.

+    Note: a one-hot encoding of y labels should use a LabelBinarizer
+    instead.
+
     Read more in the :ref:`User Guide <preprocessing_categorical_features>`.

     Parameters
@@ -1810,6 +1813,13 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
       dictionary items (also handles string-valued features).
     sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot
       encoding of dictionary items or strings.
+    sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all
+      fashion.
+    sklearn.preprocessing.MultiLabelBinarizer : transforms between iterable of
+      iterables and a multilabel format, e.g. a (samples x classes) binary
+      matrix indicating the presence of a class label.
+    sklearn.preprocessing.LabelEncoder : encodes labels with values between 0
+      and n_classes-1.
     """
     def __init__(self, n_values="auto", categorical_features="all",
                  dtype=np.float64, sparse=True, handle_unknown='error'):
diff --git a/sklearn/preprocessing/label.py b/sklearn/preprocessing/label.py
index e571d3f44be7f..7a391b3f60b19 100644
--- a/sklearn/preprocessing/label.py
+++ b/sklearn/preprocessing/label.py
@@ -91,6 +91,10 @@ class LabelEncoder(BaseEstimator, TransformerMixin):
     >>> list(le.inverse_transform([2, 2, 1]))
     ['tokyo', 'tokyo', 'paris']

+    See also
+    --------
+    sklearn.preprocessing.OneHotEncoder : encode categorical integer features
+        using a one-hot aka one-of-K scheme.
     """

     def fit(self, y):
@@ -257,6 +261,8 @@ class LabelBinarizer(BaseEstimator, TransformerMixin):
     --------
     label_binarize : function to perform the transform operation of
         LabelBinarizer with fixed classes.
+    sklearn.preprocessing.OneHotEncoder : encode categorical integer features
+        using a one-hot aka one-of-K scheme.
     """

     def __init__(self, neg_label=0, pos_label=1, sparse_output=False):
@@ -648,6 +654,7 @@ class MultiLabelBinarizer(BaseEstimator, TransformerMixin):

     Examples
     --------
+    >>> from sklearn.preprocessing import MultiLabelBinarizer
     >>> mlb = MultiLabelBinarizer()
     >>> mlb.fit_transform([(1, 2), (3,)])
     array([[1, 1, 0],
@@ -661,6 +668,10 @@ class MultiLabelBinarizer(BaseEstimator, TransformerMixin):
     >>> list(mlb.classes_)
     ['comedy', 'sci-fi', 'thriller']

+    See also
+    --------
+    sklearn.preprocessing.OneHotEncoder : encode categorical integer features
+        using a one-hot aka one-of-K scheme.
     """
     def __init__(self, classes=None, sparse_output=False):
        self.classes = classes
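For context on the cross-references added above, here is a minimal usage sketch (not part of the patch) of the split the new note describes: OneHotEncoder operates on the feature matrix X, while LabelBinarizer and LabelEncoder handle the target y. It assumes the pre-0.20 OneHotEncoder API documented in this diff (integer-coded features, `sparse=True` by default); the data values are made up for illustration.

import numpy as np
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OneHotEncoder

# Categorical integer features (X) and string class labels (y); toy data.
X = np.array([[0, 1], [1, 2], [2, 0]])
y = ['spam', 'ham', 'eggs']

# One-hot encode the features: one binary column per category of each feature.
# sparse=False is passed only so the result prints as a dense array.
enc = OneHotEncoder(sparse=False)
print(enc.fit_transform(X))

# For the target y, binarize labels in a one-vs-all fashion instead of
# routing them through OneHotEncoder.
lb = LabelBinarizer()
print(lb.fit_transform(y))

# Or simply map labels to integers in [0, n_classes - 1].
le = LabelEncoder()
print(le.fit_transform(y))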