From a0341a8fba7d2634c34e9130d4a7f6d54abeb82b Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Sun, 14 Apr 2019 23:15:42 +0200
Subject: [PATCH 01/16] make sure vectorizers read data from file before
 analyzing

---
 sklearn/feature_extraction/tests/test_text.py | 15 +++++++++++++++
 sklearn/feature_extraction/text.py            | 14 ++++++++++----
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index b02aa2aea46af..8c6113333a7a1 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -1196,3 +1196,18 @@ def build_preprocessor(self):
                                             .findall(doc),
                     stop_words=['and'])
     assert _check_stop_words_consistency(vec) is True
+
+
+@pytest.mark.parametrize('Estimator',
+                         [CountVectorizer, TfidfVectorizer, HashingVectorizer])
+def test_callable_analyzer_vs_file_input(Estimator):
+    data = ['this is text, not file or filename']
+
+    with pytest.raises(FileNotFoundError):
+        Estimator(analyzer=lambda x: x.split(),
+                  input='filename').fit_transform(data)
+
+    with pytest.raises(AttributeError,
+                       match="'str' object has no attribute 'read'"):
+        Estimator(analyzer=lambda x: x.split(),
+                  input='file').fit_transform(data)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 9cdbace6224aa..114237df98c09 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -307,7 +307,7 @@ def _check_stop_words_consistency(self, stop_words, preprocess, tokenize):
     def build_analyzer(self):
         """Return a callable that handles preprocessing and tokenization"""
         if callable(self.analyzer):
-            return self.analyzer
+            return lambda doc: self.analyzer(self.decode(doc))
 
         preprocess = self.build_preprocessor()
 
@@ -488,7 +488,9 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin):
         word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
-        out of the raw, unprocessed input.
+        out of the raw, unprocessed input. If ``input`` is ``filename`` or
+        ``file``, the data is first read from the file and then passed to the
+        given callable analyzer.
 
     n_features : integer, default=(2 ** 20)
         The number of features (columns) in the output matrices. Small numbers
@@ -743,7 +745,9 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
         word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
-        out of the raw, unprocessed input.
+        out of the raw, unprocessed input. If ``input`` is ``filename`` or
+        ``file``, the data is first read from the file and then passed to the
+        given callable analyzer.
 
     max_df : float in range [0.0, 1.0] or int, default=1.0
         When building the vocabulary ignore terms that have a document
@@ -1367,7 +1371,9 @@ class TfidfVectorizer(CountVectorizer):
         word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
-        out of the raw, unprocessed input.
+        out of the raw, unprocessed input. If ``input`` is ``filename`` or
+        ``file``, the data is first read from the file and then passed to the
+        given callable analyzer.
 
     stop_words : string {'english'}, list, or None (default=None)
         If a string, it is passed to _check_stop_list and the appropriate stop

From a1de2ae44cd52ffbc8ad84d4466a830bc83de368 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Mon, 15 Apr 2019 13:48:51 +0200
Subject: [PATCH 02/16] raise a ChangedBehaviorWarning when appropriate

---
 sklearn/feature_extraction/tests/test_text.py | 24 +++++++++++++++++
 sklearn/feature_extraction/text.py            | 27 +++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index 8c6113333a7a1..6cb55df3c54f4 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -29,6 +29,7 @@
 from numpy.testing import assert_array_almost_equal
 from numpy.testing import assert_array_equal
 from sklearn.utils import IS_PYPY
+from sklearn.exceptions import ChangedBehaviorWarning
 from sklearn.utils.testing import (assert_equal, assert_not_equal,
                                    assert_almost_equal, assert_in,
                                    assert_less, assert_greater,
@@ -1211,3 +1212,26 @@ def test_callable_analyzer_vs_file_input(Estimator):
                        match="'str' object has no attribute 'read'"):
         Estimator(analyzer=lambda x: x.split(),
                   input='file').fit_transform(data)
+
+    # check if the ChangedBehaviorWarning is raised if the given analyzer
+    # expects a file or a file name.
+    def analyzer1(doc):
+        with open(doc, 'r'):
+            pass
+
+    def analyzer2(doc):
+        with _ in doc.read():
+            pass
+
+    print(Estimator)
+    for analyzer in [analyzer1, analyzer2]:
+        for input_type in ['file', 'filename']:
+            print(input_type)
+            print(analyzer)
+            try:
+                with pytest.warns(ChangedBehaviorWarning,
+                                  match="Since v0.21, vectorizer"):
+                    Estimator(analyzer=analyzer,
+                              input=input_type).fit_transform(data)
+            except Exception:
+                pass
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 114237df98c09..c8f65db201615 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -31,6 +31,7 @@
 from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
 from ..utils import _IS_32BIT
 from ..utils.fixes import _astype_copy_false
+from ..exceptions import ChangedBehaviorWarning
 
 
 __all__ = ['HashingVectorizer',
@@ -304,9 +305,35 @@ def _check_stop_words_consistency(self, stop_words, preprocess, tokenize):
             self._stop_words_id = id(self.stop_words)
             return 'error'
 
+    def _validate_analyzer(self):
+        # This is to check if the given custom analyzer expects file or a
+        # filename instead of data.
+        # Behavior changed in v0.21, function could be removed in v0.24
+        if not callable(self.analyzer):
+            return
+
+        import tempfile
+        with tempfile.NamedTemporaryFile() as f:
+            fname = f.name
+        # now we're sure fname doesn't exist
+
+        msg = ("Since v0.21, vectorizers pass the data to the custom analyzer "
+               "and not the file names or the file objects.")
+        try:
+            self.analyzer(fname)
+        except FileNotFoundError:
+            warnings.warn(msg, ChangedBehaviorWarning)
+        except AttributeError as e:
+            print(str(e))
+            if str(e) == "'str' object has no attribute 'read'":
+                warnings.warn(msg, ChangedBehaviorWarning)
+        except Exception:
+            pass
+
     def build_analyzer(self):
         """Return a callable that handles preprocessing and tokenization"""
         if callable(self.analyzer):
+            self._validate_analyzer()
             return lambda doc: self.analyzer(self.decode(doc))
 
         preprocess = self.build_preprocessor()

From d923b21d7bba323b789a9eb1cf65a8fbf7e3f077 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Mon, 15 Apr 2019 13:52:10 +0200
Subject: [PATCH 03/16] pep8

---
 sklearn/feature_extraction/tests/test_text.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index 6cb55df3c54f4..223254567d1e8 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -1220,8 +1220,8 @@ def analyzer1(doc):
             pass
 
     def analyzer2(doc):
-        with _ in doc.read():
-            pass
+        for x in doc.read():
+            return
 
     print(Estimator)
     for analyzer in [analyzer1, analyzer2]:

From 1c375561eb5db19b8ede68cb7d10e2b8efae0903 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Mon, 15 Apr 2019 14:41:03 +0200
Subject: [PATCH 04/16] improve coverage

---
 sklearn/feature_extraction/tests/test_text.py | 16 +++++++++-------
 sklearn/feature_extraction/text.py            |  3 +--
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index 223254567d1e8..004d5b6d704a0 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -1216,18 +1216,13 @@ def test_callable_analyzer_vs_file_input(Estimator):
     # check if the ChangedBehaviorWarning is raised if the given analyzer
     # expects a file or a file name.
     def analyzer1(doc):
-        with open(doc, 'r'):
-            pass
+        open(doc, 'r')
 
     def analyzer2(doc):
-        for x in doc.read():
-            return
+        doc.read()
 
-    print(Estimator)
     for analyzer in [analyzer1, analyzer2]:
         for input_type in ['file', 'filename']:
-            print(input_type)
-            print(analyzer)
             try:
                 with pytest.warns(ChangedBehaviorWarning,
                                   match="Since v0.21, vectorizer"):
@@ -1235,3 +1230,10 @@ def analyzer2(doc):
                               input=input_type).fit_transform(data)
             except Exception:
                 pass
+
+    # check if a custom exception from the analyzer is shown to the user
+    def analyzer3(doc):
+        raise Exception("testing")
+
+    with pytest.raises(Exception, match="testing"):
+        Estimator(analyzer=analyzer3).fit_transform(data)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index c8f65db201615..f3861ead1553e 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -324,7 +324,6 @@ def _validate_analyzer(self):
         except FileNotFoundError:
             warnings.warn(msg, ChangedBehaviorWarning)
         except AttributeError as e:
-            print(str(e))
             if str(e) == "'str' object has no attribute 'read'":
                 warnings.warn(msg, ChangedBehaviorWarning)
         except Exception:
@@ -332,8 +331,8 @@ def _validate_analyzer(self):
 
     def build_analyzer(self):
         """Return a callable that handles preprocessing and tokenization"""
+        self._validate_analyzer()
         if callable(self.analyzer):
-            self._validate_analyzer()
             return lambda doc: self.analyzer(self.decode(doc))
 
         preprocess = self.build_preprocessor()

From 22e3f04cabfe37f74c924b39fa9cbdadad62a468 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Mon, 15 Apr 2019 16:00:03 +0200
Subject: [PATCH 05/16] add version

---
 sklearn/feature_extraction/text.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index f3861ead1553e..2985327201602 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -514,9 +514,9 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin):
         word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
-        out of the raw, unprocessed input. If ``input`` is ``filename`` or
-        ``file``, the data is first read from the file and then passed to the
-        given callable analyzer.
+        out of the raw, unprocessed input. Since v0.21, if ``input`` is
+        ``filename`` or ``file``, the data is first read from the file and then
+        passed to the given callable analyzer.
 
     n_features : integer, default=(2 ** 20)
         The number of features (columns) in the output matrices. Small numbers
@@ -771,9 +771,9 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
         word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
-        out of the raw, unprocessed input. If ``input`` is ``filename`` or
-        ``file``, the data is first read from the file and then passed to the
-        given callable analyzer.
+        out of the raw, unprocessed input. Since v0.21, if ``input`` is
+        ``filename`` or ``file``, the data is first read from the file and then
+        passed to the given callable analyzer.
 
     max_df : float in range [0.0, 1.0] or int, default=1.0
         When building the vocabulary ignore terms that have a document
@@ -1397,9 +1397,9 @@ class TfidfVectorizer(CountVectorizer):
         word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
-        out of the raw, unprocessed input. If ``input`` is ``filename`` or
-        ``file``, the data is first read from the file and then passed to the
-        given callable analyzer.
+        out of the raw, unprocessed input. Since v0.21, if ``input`` is
+        ``filename`` or ``file``, the data is first read from the file and then
+        passed to the given callable analyzer.
 
     stop_words : string {'english'}, list, or None (default=None)
         If a string, it is passed to _check_stop_list and the appropriate stop

From 8a7ade96e79a5182df8210a01859206ecc6ae704 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Mon, 15 Apr 2019 16:27:44 +0200
Subject: [PATCH 06/16] validate only if input is not content

---
 sklearn/feature_extraction/text.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 2985327201602..b2d0adf2fb2b2 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -309,9 +309,6 @@ def _validate_analyzer(self):
         # This is to check if the given custom analyzer expects file or a
         # filename instead of data.
         # Behavior changed in v0.21, function could be removed in v0.24
-        if not callable(self.analyzer):
-            return
-
         import tempfile
         with tempfile.NamedTemporaryFile() as f:
             fname = f.name
@@ -331,8 +328,9 @@ def _validate_analyzer(self):
 
     def build_analyzer(self):
         """Return a callable that handles preprocessing and tokenization"""
-        self._validate_analyzer()
         if callable(self.analyzer):
+            if self.input in ['file', 'filename']:
+                self._validate_analyzer()
             return lambda doc: self.analyzer(self.decode(doc))
 
         preprocess = self.build_preprocessor()

From c655e04d736303f4710369ca32d15874213d4023 Mon Sep 17 00:00:00 2001
From: Joel Nothman <joel.nothman@gmail.com>
Date: Tue, 16 Apr 2019 09:38:47 +0200
Subject: [PATCH 07/16] Update sklearn/feature_extraction/text.py

Co-Authored-By: adrinjalali <adrin.jalali@gmail.com>
---
 sklearn/feature_extraction/text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index b2d0adf2fb2b2..0aea069bc811a 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -308,7 +308,7 @@ def _check_stop_words_consistency(self, stop_words, preprocess, tokenize):
     def _validate_analyzer(self):
         # This is to check if the given custom analyzer expects file or a
         # filename instead of data.
-        # Behavior changed in v0.21, function could be removed in v0.24
+        # Behavior changed in v0.21, function could be removed in v0.23
         import tempfile
         with tempfile.NamedTemporaryFile() as f:
             fname = f.name

From 2ab54653282305fa869af9c188ca32a4b93a8733 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Tue, 16 Apr 2019 09:47:18 +0200
Subject: [PATCH 08/16] whats_new

---
 doc/whats_new/v0.21.rst | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 3175fca4747f6..403ef405e2ce6 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -246,6 +246,17 @@ Support for Python 3.4 and below has been officially dropped.
 - |API| Deprecated :mod:`externals.six` since we have dropped support for
   Python 2.7. :issue:`12916` by :user:`Hanmin Qin <qinhanmin2014>`.
 
+:mod:`sklearn.feature_extraction`
+.................................
+
+- |API| If ``input='file'`` or ``input='filename'``, and a callable is given
+  as the ``analyzer``, :class:`sklearn.feature_extraction.text`,
+  :class:`sklearn.feature_extraction.text`, and
+  :class:`sklearn.feature_extraction.text` now read the data from the file(s)
+  and then pass it to the given ``analyzer``, instead of passing the file
+  name(s) or the file object(s) to the analyzer.
+  :issue:`13641` by `Adrin Jalali`_.
+
 :mod:`sklearn.impute`
 .....................
 

From 7e857cfe4507d6773327f41282ff4624399eb727 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Tue, 16 Apr 2019 09:49:02 +0200
Subject: [PATCH 09/16] modify the warning message and hint the removal version

---
 sklearn/feature_extraction/text.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index b2d0adf2fb2b2..d480cbbc69c1b 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -315,7 +315,8 @@ def _validate_analyzer(self):
         # now we're sure fname doesn't exist
 
         msg = ("Since v0.21, vectorizers pass the data to the custom analyzer "
-               "and not the file names or the file objects.")
+               "and not the file names or the file objects. This warning "
+               "will be removed in v0.23.")
         try:
             self.analyzer(fname)
         except FileNotFoundError:

From 10f91f38d2ff534281f791774f10bba8b0ca5d44 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Wed, 17 Apr 2019 11:33:32 +0200
Subject: [PATCH 10/16] improve coverage

---
 sklearn/feature_extraction/tests/test_text.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index 004d5b6d704a0..cc8dbf1c863fe 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -1235,5 +1235,13 @@ def analyzer2(doc):
     def analyzer3(doc):
         raise Exception("testing")
 
+    import tempfile
+    with tempfile.NamedTemporaryFile('w', delete=False) as f:
+        f.write("sample content\n")
+        fname = f.name
+
     with pytest.raises(Exception, match="testing"):
-        Estimator(analyzer=analyzer3).fit_transform(data)
+        Estimator(analyzer=analyzer3, input='filename').fit_transform([fname])
+
+    import os
+    os.remove(fname)

From 89f12901cea02f237c878d50f481fa4c75bdab64 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 18 Apr 2019 16:51:05 +0200
Subject: [PATCH 11/16] apply comments

---
 sklearn/feature_extraction/tests/test_text.py | 14 +++++---------
 sklearn/feature_extraction/text.py            |  4 ++--
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index cc8dbf1c863fe..facea754c15f3 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 from collections.abc import Mapping
+import os
 import re
 import warnings
 
@@ -1201,7 +1202,7 @@ def build_preprocessor(self):
 
 @pytest.mark.parametrize('Estimator',
                          [CountVectorizer, TfidfVectorizer, HashingVectorizer])
-def test_callable_analyzer_vs_file_input(Estimator):
+def test_callable_analyzer_vs_file_input(tmpdir, Estimator):
     data = ['this is text, not file or filename']
 
     with pytest.raises(FileNotFoundError):
@@ -1235,13 +1236,8 @@ def analyzer2(doc):
     def analyzer3(doc):
         raise Exception("testing")
 
-    import tempfile
-    with tempfile.NamedTemporaryFile('w', delete=False) as f:
-        f.write("sample content\n")
-        fname = f.name
+    f = tmpdir.join("file.txt")
+    f.write("sample content\n")
 
     with pytest.raises(Exception, match="testing"):
-        Estimator(analyzer=analyzer3, input='filename').fit_transform([fname])
-
-    import os
-    os.remove(fname)
+        Estimator(analyzer=analyzer3, input='file').fit_transform([f])
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index c8ab03ae70de2..0af6645472baa 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -305,7 +305,7 @@ def _check_stop_words_consistency(self, stop_words, preprocess, tokenize):
             self._stop_words_id = id(self.stop_words)
             return 'error'
 
-    def _validate_analyzer(self):
+    def _validate_custom_analyzer(self):
         # This is to check if the given custom analyzer expects file or a
         # filename instead of data.
         # Behavior changed in v0.21, function could be removed in v0.23
@@ -331,7 +331,7 @@ def build_analyzer(self):
         """Return a callable that handles preprocessing and tokenization"""
         if callable(self.analyzer):
             if self.input in ['file', 'filename']:
-                self._validate_analyzer()
+                self._validate_custom_analyzer()
             return lambda doc: self.analyzer(self.decode(doc))
 
         preprocess = self.build_preprocessor()

From dfc55782575a458cd120d9b61d856e9e6da87e8e Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Thu, 18 Apr 2019 16:52:07 +0200
Subject: [PATCH 12/16] pep8

---
 sklearn/feature_extraction/tests/test_text.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index facea754c15f3..cf8b83fd81dcd 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 from collections.abc import Mapping
-import os
 import re
 import warnings
 

From 028af520361a28e957f837b7539d4b5397f4a3da Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Mon, 22 Apr 2019 10:28:39 +0200
Subject: [PATCH 13/16] fix whats_new, add to changed models

---
 doc/whats_new/v0.21.rst | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
index 403ef405e2ce6..e893589c9cc0b 100644
--- a/doc/whats_new/v0.21.rst
+++ b/doc/whats_new/v0.21.rst
@@ -27,6 +27,9 @@ random sampling procedures.
 - :class:`linear_model.LogisticRegression` and
   :class:`linear_model.LogisticRegressionCV` with 'saga' solver. |Fix|
 - :class:`ensemble.GradientBoostingClassifier` |Fix|
+- :class:`sklearn.feature_extraction.text.HashingVectorizer`,
+  :class:`sklearn.feature_extraction.text.TfidfVectorizer`, and
+  :class:`sklearn.feature_extraction.text.CountVectorizer` |API|
 - :class:`neural_network.MLPClassifier` |Fix|
 - :func:`svm.SVC.decision_function` and
   :func:`multiclass.OneVsOneClassifier.decision_function`. |Fix|
@@ -250,11 +253,11 @@ Support for Python 3.4 and below has been officially dropped.
 .................................
 
 - |API| If ``input='file'`` or ``input='filename'``, and a callable is given
-  as the ``analyzer``, :class:`sklearn.feature_extraction.text`,
-  :class:`sklearn.feature_extraction.text`, and
-  :class:`sklearn.feature_extraction.text` now read the data from the file(s)
-  and then pass it to the given ``analyzer``, instead of passing the file
-  name(s) or the file object(s) to the analyzer.
+  as the ``analyzer``, :class:`sklearn.feature_extraction.text.HashingVectorizer`,
+  :class:`sklearn.feature_extraction.text.TfidfVectorizer`, and
+  :class:`sklearn.feature_extraction.text.CountVectorizer` now read the data
+  from the file(s) and then pass it to the given ``analyzer``, instead of
+  passing the file name(s) or the file object(s) to the analyzer.
   :issue:`13641` by `Adrin Jalali`_.
 
 :mod:`sklearn.impute`

From 05d0f606a1bd4f0e07730ef6a32ff4608d364021 Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Mon, 22 Apr 2019 10:39:06 +0200
Subject: [PATCH 14/16] fix tests

---
 sklearn/feature_extraction/tests/test_text.py | 57 ++++++++++---------
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index cf8b83fd81dcd..cef68df216f05 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -1201,42 +1201,43 @@ def build_preprocessor(self):
 
 @pytest.mark.parametrize('Estimator',
                          [CountVectorizer, TfidfVectorizer, HashingVectorizer])
-def test_callable_analyzer_vs_file_input(tmpdir, Estimator):
+@pytest.mark.parametrize(
+    'input_type, err_type, err_msg',
+    [('filename', FileNotFoundError, ''),
+     ('file', AttributeError, "'str' object has no attribute 'read'")]
+)
+def test_callable_analyzer_error(Estimator, input_type, err_type, err_msg):
     data = ['this is text, not file or filename']
-
-    with pytest.raises(FileNotFoundError):
+    with pytest.raises(err_type, match=err_msg):
         Estimator(analyzer=lambda x: x.split(),
-                  input='filename').fit_transform(data)
+                  input=input_type).fit_transform(data)
 
-    with pytest.raises(AttributeError,
-                       match="'str' object has no attribute 'read'"):
-        Estimator(analyzer=lambda x: x.split(),
-                  input='file').fit_transform(data)
-
-    # check if the ChangedBehaviorWarning is raised if the given analyzer
-    # expects a file or a file name.
-    def analyzer1(doc):
-        open(doc, 'r')
-
-    def analyzer2(doc):
-        doc.read()
-
-    for analyzer in [analyzer1, analyzer2]:
-        for input_type in ['file', 'filename']:
-            try:
-                with pytest.warns(ChangedBehaviorWarning,
-                                  match="Since v0.21, vectorizer"):
-                    Estimator(analyzer=analyzer,
-                              input=input_type).fit_transform(data)
-            except Exception:
-                pass
 
+@pytest.mark.parametrize('Estimator',
+                         [CountVectorizer, TfidfVectorizer, HashingVectorizer])
+@pytest.mark.parametrize(
+    'analyzer', [lambda doc: open(doc, 'r'), lambda doc: doc.read()]
+)
+@pytest.mark.parametrize('input_type', ['file', 'filename'])
+def test_callable_analyzer_change_behavior(Estimator, analyzer, input_type):
+    data = ['this is text, not file or filename']
+    warn_msg = 'Since v0.21, vectorizer'
+    with pytest.raises((FileNotFoundError, AttributeError)):
+        with pytest.warns(ChangedBehaviorWarning, match=warn_msg) as records:
+            Estimator(analyzer=analyzer, input=input_type).fit_transform(data)
+        assert len(records) == 1
+        assert warn_msg in str(records[0])
+
+
+@pytest.mark.parametrize('Estimator',
+                         [CountVectorizer, TfidfVectorizer, HashingVectorizer])
+def test_callable_analyzer_reraise_error(tmpdir, Estimator):
     # check if a custom exception from the analyzer is shown to the user
-    def analyzer3(doc):
+    def analyzer(doc):
         raise Exception("testing")
 
     f = tmpdir.join("file.txt")
     f.write("sample content\n")
 
     with pytest.raises(Exception, match="testing"):
-        Estimator(analyzer=analyzer3, input='file').fit_transform([f])
+        Estimator(analyzer=analyzer, input='file').fit_transform([f])

From dbe2bff8c9f097a196b821df1de3528b4f559d1e Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Mon, 22 Apr 2019 10:41:00 +0200
Subject: [PATCH 15/16] add versionchanged

---
 sklearn/feature_extraction/text.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 0af6645472baa..007e158f3a449 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -513,9 +513,12 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin):
         word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
-        out of the raw, unprocessed input. Since v0.21, if ``input`` is
-        ``filename`` or ``file``, the data is first read from the file and then
-        passed to the given callable analyzer.
+        out of the raw, unprocessed input.
+
+        .. versionchanged:: 0.21
+           If ``input`` is ``filename`` or ``file``, the data is first read
+           from the file and then passed to the given callable analyzer
+           instead of the file name or file object itself.
 
     n_features : integer, default=(2 ** 20)
         The number of features (columns) in the output matrices. Small numbers
@@ -770,9 +773,12 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
         word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
-        out of the raw, unprocessed input. Since v0.21, if ``input`` is
-        ``filename`` or ``file``, the data is first read from the file and then
-        passed to the given callable analyzer.
+        out of the raw, unprocessed input.
+
+        .. versionchanged:: 0.21
+           If ``input`` is ``filename`` or ``file``, the data is first read
+           from the file and then passed to the given callable analyzer
+           instead of the file name or file object itself.
 
     max_df : float in range [0.0, 1.0] or int, default=1.0
         When building the vocabulary ignore terms that have a document
@@ -1396,9 +1402,12 @@ class TfidfVectorizer(CountVectorizer):
         word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
-        out of the raw, unprocessed input. Since v0.21, if ``input`` is
-        ``filename`` or ``file``, the data is first read from the file and then
-        passed to the given callable analyzer.
+        out of the raw, unprocessed input.
+
+        .. versionchanged:: 0.21
+           If ``input`` is ``filename`` or ``file``, the data is first read
+           from the file and then passed to the given callable analyzer
+           instead of the file name or file object itself.
 
     stop_words : string {'english'}, list, or None (default=None)
         If a string, it is passed to _check_stop_list and the appropriate stop

From 77ee198c4f35d3431d61339131ebb0d90418fa4c Mon Sep 17 00:00:00 2001
From: adrinjalali <adrin.jalali@gmail.com>
Date: Mon, 22 Apr 2019 21:52:11 +0200
Subject: [PATCH 16/16] fix test

---
 sklearn/feature_extraction/tests/test_text.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index cef68df216f05..bfd9f5f2f4ffe 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -1225,8 +1225,8 @@ def test_callable_analyzer_change_behavior(Estimator, analyzer, input_type):
     with pytest.raises((FileNotFoundError, AttributeError)):
         with pytest.warns(ChangedBehaviorWarning, match=warn_msg) as records:
             Estimator(analyzer=analyzer, input=input_type).fit_transform(data)
-        assert len(records) == 1
-        assert warn_msg in str(records[0])
+    assert len(records) == 1
+    assert warn_msg in str(records[0])
 
 
 @pytest.mark.parametrize('Estimator',