From 21369dd6a140e70a90341669f15a4f05bf883480 Mon Sep 17 00:00:00 2001
From: Raghav R V <ragvrv@gmail.com>
Date: Wed, 31 Dec 2014 07:54:24 +0530
Subject: [PATCH] TST Removal or modification of stop_words_ should not affect
 transform.

DOC Add a line to {Count, Tfidf}Vectorizer about removal of stop_words_
DOC Add documentation of stop_words_ attr in TfidfVectorizer
---
 sklearn/feature_extraction/tests/test_text.py | 23 +++++++++++++++++++
 sklearn/feature_extraction/text.py            | 22 +++++++++++++++++-
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index 096101a9cb24d..7c131366d89a7 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -859,6 +859,29 @@ def test_pickling_vectorizer():
             orig.fit_transform(JUNK_FOOD_DOCS).toarray())
 
 
+def test_stop_words_removal():
+    """Check that transform does not depend on the stop_words_ attribute."""
+    # stop_words_ is kept for introspection only, so neither resetting it to
+    # None nor deleting it from a fitted vectorizer may change the output.
+    fitted = [
+        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
+        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
+        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
+    ]
+
+    for estimator in fitted:
+        expected = estimator.transform(JUNK_FOOD_DOCS).toarray()
+
+        setattr(estimator, 'stop_words_', None)
+        after_reset = estimator.transform(JUNK_FOOD_DOCS).toarray()
+
+        del estimator.stop_words_
+        after_delete = estimator.transform(JUNK_FOOD_DOCS).toarray()
+
+        assert_array_equal(after_reset, expected)
+        assert_array_equal(after_delete, expected)
+
+
 def test_pickling_transformer():
     X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
     orig = TfidfTransformer().fit(X)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 2aa4a0bba68b7..91f3589e3726c 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -628,6 +628,12 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
     See also
     --------
     HashingVectorizer, TfidfVectorizer
+
+    Notes
+    -----
+    The ``stop_words_`` attribute can get large and increase the model size
+    when pickling. This attribute is provided only for introspection and can
+    be safely removed using delattr or set to None before pickling.
     """
 
     def __init__(self, input='content', encoding='utf-8',
@@ -1158,6 +1164,15 @@ class TfidfVectorizer(CountVectorizer):
         The learned idf vector (global term weights)
         when ``use_idf`` is set to True, None otherwise.
 
+    stop_words_ : set
+        Terms that were ignored because they either:
+
+          - occurred in too many documents (`max_df`)
+          - occurred in too few documents (`min_df`)
+          - were cut off by feature selection (`max_features`).
+
+        This is only available if no vocabulary was given.
+
     See also
     --------
     CountVectorizer
@@ -1167,7 +1182,12 @@ class TfidfVectorizer(CountVectorizer):
     TfidfTransformer
         Apply Term Frequency Inverse Document Frequency normalization to a
         sparse matrix of occurrence counts.
-
+
+    Notes
+    -----
+    The ``stop_words_`` attribute can get large and increase the model size
+    when pickling. This attribute is provided only for introspection and can
+    be safely removed using delattr or set to None before pickling.
     """
 
     def __init__(self, input='content', encoding='utf-8',