scikit-learn · jnothman · Feb 1, 2015 · Dec 31, 2014 · raghavrv · Jan 11, 2015
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
@@ -859,6 +859,29 @@ def test_pickling_vectorizer():
             orig.fit_transform(JUNK_FOOD_DOCS).toarray())
 
 
+def test_stop_words_removal():
+    """Ensure that deleting the stop_words_ attribute doesn't affect transform
+    """
+
+    fitted_vectorizers = (
+        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
+        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
+        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS)
+    )
+
+    for vect in fitted_vectorizers:
+        vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
+
+        vect.stop_words_ = None
+        stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
+
+        delattr(vect, 'stop_words_')
+        stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()
+
+        assert_array_equal(stop_None_transform, vect_transform)
+        assert_array_equal(stop_del_transform, vect_transform)
+
+
 def test_pickling_transformer():
     X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
     orig = TfidfTransformer().fit(X)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
@@ -628,6 +628,12 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
     See also
     --------
     HashingVectorizer, TfidfVectorizer
+
+    Notes
+    -----
+    The ``stop_words_`` attribute can get large and increase the model size 
+    when pickling. This attribute is provided only for introspection and can
+    be safely removed using delattr or set to None before pickling.
     """
 
     def __init__(self, input='content', encoding='utf-8',
@@ -1158,6 +1164,15 @@ class TfidfVectorizer(CountVectorizer):
         The learned idf vector (global term weights)
         when ``use_idf`` is set to True, None otherwise.
 
+    stop_words_ : set
+        Terms that were ignored because they either:
+
+          - occurred in too many documents (`max_df`)
+          - occurred in too few documents (`min_df`)
+          - were cut off by feature selection (`max_features`).
+
+        This is only available if no vocabulary was given.
+
     See also
     --------
     CountVectorizer
@@ -1167,7 +1182,12 @@ class TfidfVectorizer(CountVectorizer):
     TfidfTransformer
         Apply Term Frequency Inverse Document Frequency normalization to a
         sparse matrix of occurrence counts.
-
+
+    Notes
+    -----
+    The ``stop_words_`` attribute can get large and increase the model size 
+    when pickling. This attribute is provided only for introspection and can
+    be safely removed using delattr or set to None before pickling.
     """
 
     def __init__(self, input='content', encoding='utf-8',