diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 05f60d2805c7c..f1b87b8470bf7 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1286,7 +1286,27 @@ def idf_(self, value): class TfidfVectorizer(CountVectorizer): """Convert a collection of raw documents to a matrix of TF-IDF features. - Equivalent to CountVectorizer followed by TfidfTransformer. + Equivalent to CountVectorizer followed by TfidfTransformer. + + CountVectorizer converts a collection of text documents to a matrix of + token counts. + + TfidfTransformer then converts the count matrix from CountVectorizer to a + normalized tf-idf representation. Tf is term frequency, and idf is inverse + document frequency. This is a common way to calculate the count of a word + relative to the appearance of a document. + + The formula that is used to compute the tf-idf of term t is + tf-idf(d, t) = tf(t) * idf(d, t), and the idf is computed as + idf(d, t) = log [ n / df(d, t) ] + 1 (if ``smooth_idf=False``), + where n is the total number of documents and df(d, t) is the + document frequency; the document frequency is the number of documents d + that contain term t. The effect of adding "1" to the idf in the equation + above is that terms with zero idf, i.e., terms that occur in all documents + in a training set, will not be entirely ignored. + (Note that the idf formula above differs from the standard + textbook notation that defines the idf as + idf(d, t) = log [ n / (df(d, t) + 1) ]). Read more in the :ref:`User Guide `. diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py index 01a4f78ab0157..0566b1de5807b 100644 --- a/sklearn/linear_model/logistic.py +++ b/sklearn/linear_model/logistic.py @@ -1140,6 +1140,11 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, Attributes ---------- + classes_ : array of shape (n_classes, ) + A list of class labels known to the classifier.
+ + In binary classification, one class is often considered the 'positive' + class. For example, in labels [-1, 1], 1 is the positive class. coef_ : array, shape (1, n_features) or (n_classes, n_features) Coefficient of the features in the decision function.