From 21369dd6a140e70a90341669f15a4f05bf883480 Mon Sep 17 00:00:00 2001 From: Raghav R V Date: Wed, 31 Dec 2014 07:54:24 +0530 Subject: [PATCH] TST Removal or modification of stop_words_ should not affect transform. DOC Add a line to {Count, Tfidf}Vectorizer about removal of stop_words_ DOC Add documentation of stop_words_ attr in TfidfVectorizer --- sklearn/feature_extraction/tests/test_text.py | 23 +++++++++++++++++++ sklearn/feature_extraction/text.py | 22 +++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 096101a9cb24d..7c131366d89a7 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -859,6 +859,29 @@ def test_pickling_vectorizer(): orig.fit_transform(JUNK_FOOD_DOCS).toarray()) +def test_stop_words_removal(): + """Ensure that deleting the stop_words_ attribute doesn't affect transform + """ + + fitted_vectorizers = ( + TfidfVectorizer().fit(JUNK_FOOD_DOCS), + CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS), + CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS) + ) + + for vect in fitted_vectorizers: + vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray() + + vect.stop_words_ = None + stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray() + + delattr(vect, 'stop_words_') + stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray() + + assert_array_equal(stop_None_transform, vect_transform) + assert_array_equal(stop_del_transform, vect_transform) + + def test_pickling_transformer(): X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS) orig = TfidfTransformer().fit(X) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 2aa4a0bba68b7..91f3589e3726c 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -628,6 +628,12 @@ class CountVectorizer(BaseEstimator, VectorizerMixin): See also -------- HashingVectorizer, TfidfVectorizer + + Notes + ----- + The ``stop_words_`` attribute can get large and increase the model size + when pickling. This attribute is provided only for introspection and can + be safely removed using delattr or set to None before pickling. """ def __init__(self, input='content', encoding='utf-8', @@ -1158,6 +1164,15 @@ class TfidfVectorizer(CountVectorizer): The learned idf vector (global term weights) when ``use_idf`` is set to True, None otherwise. + stop_words_ : set + Terms that were ignored because they either: + + - occurred in too many documents (`max_df`) + - occurred in too few documents (`min_df`) + - were cut off by feature selection (`max_features`). + + This is only available if no vocabulary was given. + See also -------- CountVectorizer @@ -1167,7 +1182,12 @@ class TfidfVectorizer(CountVectorizer): TfidfTransformer Apply Term Frequency Inverse Document Frequency normalization to a sparse matrix of occurrence counts. - + + Notes + ----- + The ``stop_words_`` attribute can get large and increase the model size + when pickling. This attribute is provided only for introspection and can + be safely removed using delattr or set to None before pickling. """ def __init__(self, input='content', encoding='utf-8',