From a0341a8fba7d2634c34e9130d4a7f6d54abeb82b Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sun, 14 Apr 2019 23:15:42 +0200 Subject: [PATCH 01/16] make sure vectorizers read data from file before analyzing --- sklearn/feature_extraction/tests/test_text.py | 15 +++++++++++++++ sklearn/feature_extraction/text.py | 14 ++++++++++---- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index b02aa2aea46af..8c6113333a7a1 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1196,3 +1196,18 @@ def build_preprocessor(self): .findall(doc), stop_words=['and']) assert _check_stop_words_consistency(vec) is True + + +@pytest.mark.parametrize('Estimator', + [CountVectorizer, TfidfVectorizer, HashingVectorizer]) +def test_callable_analyzer_vs_file_input(Estimator): + data = ['this is text, not file or filename'] + + with pytest.raises(FileNotFoundError): + Estimator(analyzer=lambda x: x.split(), + input='filename').fit_transform(data) + + with pytest.raises(AttributeError, + match="'str' object has no attribute 'read'"): + Estimator(analyzer=lambda x: x.split(), + input='file').fit_transform(data) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 9cdbace6224aa..114237df98c09 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -307,7 +307,7 @@ def _check_stop_words_consistency(self, stop_words, preprocess, tokenize): def build_analyzer(self): """Return a callable that handles preprocessing and tokenization""" if callable(self.analyzer): - return self.analyzer + return lambda doc: self.analyzer(self.decode(doc)) preprocess = self.build_preprocessor() @@ -488,7 +488,9 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin): word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features - out of the raw, unprocessed input. + out of the raw, unprocessed input. If ``input`` is ``filename`` or + ``file``, the data is first read from the file and then passed to the + given callable analyzer. n_features : integer, default=(2 ** 20) The number of features (columns) in the output matrices. Small numbers @@ -743,7 +745,9 @@ class CountVectorizer(BaseEstimator, VectorizerMixin): word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features - out of the raw, unprocessed input. + out of the raw, unprocessed input. If ``input`` is ``filename`` or + ``file``, the data is first read from the file and then passed to the + given callable analyzer. max_df : float in range [0.0, 1.0] or int, default=1.0 When building the vocabulary ignore terms that have a document @@ -1367,7 +1371,9 @@ class TfidfVectorizer(CountVectorizer): word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features - out of the raw, unprocessed input. + out of the raw, unprocessed input. If ``input`` is ``filename`` or + ``file``, the data is first read from the file and then passed to the + given callable analyzer. stop_words : string {'english'}, list, or None (default=None) If a string, it is passed to _check_stop_list and the appropriate stop From a1de2ae44cd52ffbc8ad84d4466a830bc83de368 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 15 Apr 2019 13:48:51 +0200 Subject: [PATCH 02/16] raise a ChangedBehaviorWarning when appropriate --- sklearn/feature_extraction/tests/test_text.py | 24 +++++++++++++++++ sklearn/feature_extraction/text.py | 27 +++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 8c6113333a7a1..6cb55df3c54f4 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -29,6 +29,7 @@ from numpy.testing import assert_array_almost_equal from numpy.testing import assert_array_equal from sklearn.utils import IS_PYPY +from sklearn.exceptions import ChangedBehaviorWarning from sklearn.utils.testing import (assert_equal, assert_not_equal, assert_almost_equal, assert_in, assert_less, assert_greater, @@ -1211,3 +1212,26 @@ def test_callable_analyzer_vs_file_input(Estimator): match="'str' object has no attribute 'read'"): Estimator(analyzer=lambda x: x.split(), input='file').fit_transform(data) + + # check if the ChangedBehaviorWarning is raised if the given analyzer + # expects a file or a file name. + def analyzer1(doc): + with open(doc, 'r'): + pass + + def analyzer2(doc): + with _ in doc.read(): + pass + + print(Estimator) + for analyzer in [analyzer1, analyzer2]: + for input_type in ['file', 'filename']: + print(input_type) + print(analyzer) + try: + with pytest.warns(ChangedBehaviorWarning, + match="Since v0.21, vectorizer"): + Estimator(analyzer=analyzer, + input=input_type).fit_transform(data) + except Exception: + pass diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 114237df98c09..c8f65db201615 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -31,6 +31,7 @@ from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES from ..utils import _IS_32BIT from ..utils.fixes import _astype_copy_false +from ..exceptions import ChangedBehaviorWarning __all__ = ['HashingVectorizer', @@ -304,9 +305,35 @@ def _check_stop_words_consistency(self, stop_words, preprocess, tokenize): self._stop_words_id = id(self.stop_words) return 'error' + def _validate_analyzer(self): + # This is to check if the given custom analyzer expects file or a + # filename instead of data. + # Behavior changed in v0.21, function could be removed in v0.24 + if not callable(self.analyzer): + return + + import tempfile + with tempfile.NamedTemporaryFile() as f: + fname = f.name + # now we're sure fname doesn't exist + + msg = ("Since v0.21, vectorizers pass the data to the custom analyzer " + "and not the file names or the file objects.") + try: + self.analyzer(fname) + except FileNotFoundError: + warnings.warn(msg, ChangedBehaviorWarning) + except AttributeError as e: + print(str(e)) + if str(e) == "'str' object has no attribute 'read'": + warnings.warn(msg, ChangedBehaviorWarning) + except Exception: + pass + def build_analyzer(self): """Return a callable that handles preprocessing and tokenization""" if callable(self.analyzer): + self._validate_analyzer() return lambda doc: self.analyzer(self.decode(doc)) preprocess = self.build_preprocessor() From d923b21d7bba323b789a9eb1cf65a8fbf7e3f077 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 15 Apr 2019 13:52:10 +0200 Subject: [PATCH 03/16] pep8 --- sklearn/feature_extraction/tests/test_text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 6cb55df3c54f4..223254567d1e8 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1220,8 +1220,8 @@ def analyzer1(doc): pass def analyzer2(doc): - with _ in doc.read(): - pass + for x in doc.read(): + return print(Estimator) for analyzer in [analyzer1, analyzer2]: From 1c375561eb5db19b8ede68cb7d10e2b8efae0903 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 15 Apr 2019 14:41:03 +0200 Subject: [PATCH 04/16] improve coverage --- sklearn/feature_extraction/tests/test_text.py | 16 +++++++++------- sklearn/feature_extraction/text.py | 3 +-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 223254567d1e8..004d5b6d704a0 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1216,18 +1216,13 @@ def test_callable_analyzer_vs_file_input(Estimator): # check if the ChangedBehaviorWarning is raised if the given analyzer # expects a file or a file name. def analyzer1(doc): - with open(doc, 'r'): - pass + open(doc, 'r') def analyzer2(doc): - for x in doc.read(): - return + doc.read() - print(Estimator) for analyzer in [analyzer1, analyzer2]: for input_type in ['file', 'filename']: - print(input_type) - print(analyzer) try: with pytest.warns(ChangedBehaviorWarning, match="Since v0.21, vectorizer"): @@ -1235,3 +1230,10 @@ def analyzer2(doc): input=input_type).fit_transform(data) except Exception: pass + + # check if a custom exception from the analyzer is shown to the user + def analyzer3(doc): + raise Exception("testing") + + with pytest.raises(Exception, match="testing"): + Estimator(analyzer=analyzer3).fit_transform(data) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index c8f65db201615..f3861ead1553e 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -324,7 +324,6 @@ def _validate_analyzer(self): except FileNotFoundError: warnings.warn(msg, ChangedBehaviorWarning) except AttributeError as e: - print(str(e)) if str(e) == "'str' object has no attribute 'read'": warnings.warn(msg, ChangedBehaviorWarning) except Exception: @@ -332,8 +331,8 @@ def _validate_analyzer(self): def build_analyzer(self): """Return a callable that handles preprocessing and tokenization""" + self._validate_analyzer() if callable(self.analyzer): - self._validate_analyzer() return lambda doc: self.analyzer(self.decode(doc)) preprocess = self.build_preprocessor() From 22e3f04cabfe37f74c924b39fa9cbdadad62a468 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 15 Apr 2019 16:00:03 +0200 Subject: [PATCH 05/16] add version --- sklearn/feature_extraction/text.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index f3861ead1553e..2985327201602 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -514,9 +514,9 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin): word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features - out of the raw, unprocessed input. If ``input`` is ``filename`` or - ``file``, the data is first read from the file and then passed to the - given callable analyzer. + out of the raw, unprocessed input. Since v0.21, if ``input`` is + ``filename`` or ``file``, the data is first read from the file and then + passed to the given callable analyzer. n_features : integer, default=(2 ** 20) The number of features (columns) in the output matrices. Small numbers @@ -771,9 +771,9 @@ class CountVectorizer(BaseEstimator, VectorizerMixin): word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features - out of the raw, unprocessed input. If ``input`` is ``filename`` or - ``file``, the data is first read from the file and then passed to the - given callable analyzer. + out of the raw, unprocessed input. Since v0.21, if ``input`` is + ``filename`` or ``file``, the data is first read from the file and then + passed to the given callable analyzer. max_df : float in range [0.0, 1.0] or int, default=1.0 When building the vocabulary ignore terms that have a document @@ -1397,9 +1397,9 @@ class TfidfVectorizer(CountVectorizer): word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features - out of the raw, unprocessed input. If ``input`` is ``filename`` or - ``file``, the data is first read from the file and then passed to the - given callable analyzer. + out of the raw, unprocessed input. Since v0.21, if ``input`` is + ``filename`` or ``file``, the data is first read from the file and then + passed to the given callable analyzer. stop_words : string {'english'}, list, or None (default=None) If a string, it is passed to _check_stop_list and the appropriate stop From 8a7ade96e79a5182df8210a01859206ecc6ae704 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 15 Apr 2019 16:27:44 +0200 Subject: [PATCH 06/16] validate only if input is not content --- sklearn/feature_extraction/text.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 2985327201602..b2d0adf2fb2b2 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -309,9 +309,6 @@ def _validate_analyzer(self): # This is to check if the given custom analyzer expects file or a # filename instead of data. # Behavior changed in v0.21, function could be removed in v0.24 - if not callable(self.analyzer): - return - import tempfile with tempfile.NamedTemporaryFile() as f: fname = f.name @@ -331,8 +328,9 @@ def _validate_analyzer(self): def build_analyzer(self): """Return a callable that handles preprocessing and tokenization""" - self._validate_analyzer() if callable(self.analyzer): + if self.input in ['file', 'filename']: + self._validate_analyzer() return lambda doc: self.analyzer(self.decode(doc)) preprocess = self.build_preprocessor() From c655e04d736303f4710369ca32d15874213d4023 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 16 Apr 2019 09:38:47 +0200 Subject: [PATCH 07/16] Update sklearn/feature_extraction/text.py Co-Authored-By: adrinjalali --- sklearn/feature_extraction/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index b2d0adf2fb2b2..0aea069bc811a 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -308,7 +308,7 @@ def _check_stop_words_consistency(self, stop_words, preprocess, tokenize): def _validate_analyzer(self): # This is to check if the given custom analyzer expects file or a # filename instead of data. - # Behavior changed in v0.21, function could be removed in v0.24 + # Behavior changed in v0.21, function could be removed in v0.23 import tempfile with tempfile.NamedTemporaryFile() as f: fname = f.name From 2ab54653282305fa869af9c188ca32a4b93a8733 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 16 Apr 2019 09:47:18 +0200 Subject: [PATCH 08/16] whats_new --- doc/whats_new/v0.21.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 3175fca4747f6..403ef405e2ce6 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -246,6 +246,17 @@ Support for Python 3.4 and below has been officially dropped. - |API| Deprecated :mod:`externals.six` since we have dropped support for Python 2.7. :issue:`12916` by :user:`Hanmin Qin `. +:mod:`sklearn.feature_extraction` +................................. + +- |API| If ``input='file'`` or ``input='filename'``, and a callable is given + as the ``analyzer``, :class:`sklearn.feature_extraction.text`, + :class:`sklearn.feature_extraction.text`, and + :class:`sklearn.feature_extraction.text` now read the data from the file(s) + and then pass it to the given ``analyzer``, instead of passing the file + name(s) or the file object(s) to the analyzer. + :issue:`13641` by `Adrin Jalali`_. + :mod:`sklearn.impute` ..................... From 7e857cfe4507d6773327f41282ff4624399eb727 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 16 Apr 2019 09:49:02 +0200 Subject: [PATCH 09/16] modify the warning message and hint the removal version --- sklearn/feature_extraction/text.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index b2d0adf2fb2b2..d480cbbc69c1b 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -315,7 +315,8 @@ def _validate_analyzer(self): # now we're sure fname doesn't exist msg = ("Since v0.21, vectorizers pass the data to the custom analyzer " - "and not the file names or the file objects.") + "and not the file names or the file objects. This warning " + "will be removed in v0.23.") try: self.analyzer(fname) except FileNotFoundError: From 10f91f38d2ff534281f791774f10bba8b0ca5d44 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 17 Apr 2019 11:33:32 +0200 Subject: [PATCH 10/16] improve coverage --- sklearn/feature_extraction/tests/test_text.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 004d5b6d704a0..cc8dbf1c863fe 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1235,5 +1235,13 @@ def analyzer2(doc): def analyzer3(doc): raise Exception("testing") + import tempfile + with tempfile.NamedTemporaryFile('w', delete=False) as f: + f.write("sample content\n") + fname = f.name + with pytest.raises(Exception, match="testing"): - Estimator(analyzer=analyzer3).fit_transform(data) + Estimator(analyzer=analyzer3, input='filename').fit_transform([fname]) + + import os + os.remove(fname) From 89f12901cea02f237c878d50f481fa4c75bdab64 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 18 Apr 2019 16:51:05 +0200 Subject: [PATCH 11/16] apply comments --- sklearn/feature_extraction/tests/test_text.py | 14 +++++--------- sklearn/feature_extraction/text.py | 4 ++-- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index cc8dbf1c863fe..facea754c15f3 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- from collections.abc import Mapping +import os import re import warnings @@ -1201,7 +1202,7 @@ def build_preprocessor(self): @pytest.mark.parametrize('Estimator', [CountVectorizer, TfidfVectorizer, HashingVectorizer]) -def test_callable_analyzer_vs_file_input(Estimator): +def test_callable_analyzer_vs_file_input(tmpdir, Estimator): data = ['this is text, not file or filename'] with pytest.raises(FileNotFoundError): @@ -1235,13 +1236,8 @@ def analyzer2(doc): def analyzer3(doc): raise Exception("testing") - import tempfile - with tempfile.NamedTemporaryFile('w', delete=False) as f: - f.write("sample content\n") - fname = f.name + f = tmpdir.join("file.txt") + f.write("sample content\n") with pytest.raises(Exception, match="testing"): - Estimator(analyzer=analyzer3, input='filename').fit_transform([fname]) - - import os - os.remove(fname) + Estimator(analyzer=analyzer3, input='file').fit_transform([f]) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index c8ab03ae70de2..0af6645472baa 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -305,7 +305,7 @@ def _check_stop_words_consistency(self, stop_words, preprocess, tokenize): self._stop_words_id = id(self.stop_words) return 'error' - def _validate_analyzer(self): + def _validate_custom_analyzer(self): # This is to check if the given custom analyzer expects file or a # filename instead of data. # Behavior changed in v0.21, function could be removed in v0.23 @@ -331,7 +331,7 @@ def build_analyzer(self): """Return a callable that handles preprocessing and tokenization""" if callable(self.analyzer): if self.input in ['file', 'filename']: - self._validate_analyzer() + self._validate_custom_analyzer() return lambda doc: self.analyzer(self.decode(doc)) preprocess = self.build_preprocessor() From dfc55782575a458cd120d9b61d856e9e6da87e8e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 18 Apr 2019 16:52:07 +0200 Subject: [PATCH 12/16] pep8 --- sklearn/feature_extraction/tests/test_text.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index facea754c15f3..cf8b83fd81dcd 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1,6 +1,5 @@ # -*- coding: utf-8 -*- from collections.abc import Mapping -import os import re import warnings From 028af520361a28e957f837b7539d4b5397f4a3da Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 22 Apr 2019 10:28:39 +0200 Subject: [PATCH 13/16] fix whats_new, add to changed models --- doc/whats_new/v0.21.rst | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 403ef405e2ce6..e893589c9cc0b 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -27,6 +27,9 @@ random sampling procedures. - :class:`linear_model.LogisticRegression` and :class:`linear_model.LogisticRegressionCV` with 'saga' solver. |Fix| - :class:`ensemble.GradientBoostingClassifier` |Fix| +- :class:`sklearn.feature_extraction.text.HashingVectorizer`, + :class:`sklearn.feature_extraction.text.TfidfVectorizer`, and + :class:`sklearn.feature_extraction.text.CountVectorizer` |API| - :class:`neural_network.MLPClassifier` |Fix| - :func:`svm.SVC.decision_function` and :func:`multiclass.OneVsOneClassifier.decision_function`. |Fix| @@ -250,11 +253,11 @@ Support for Python 3.4 and below has been officially dropped. ................................. - |API| If ``input='file'`` or ``input='filename'``, and a callable is given - as the ``analyzer``, :class:`sklearn.feature_extraction.text`, - :class:`sklearn.feature_extraction.text`, and - :class:`sklearn.feature_extraction.text` now read the data from the file(s) - and then pass it to the given ``analyzer``, instead of passing the file - name(s) or the file object(s) to the analyzer. + as the ``analyzer``, :class:`sklearn.feature_extraction.text.HashingVectorizer`, + :class:`sklearn.feature_extraction.text.TfidfVectorizer`, and + :class:`sklearn.feature_extraction.text.CountVectorizer` now read the data + from the file(s) and then pass it to the given ``analyzer``, instead of + passing the file name(s) or the file object(s) to the analyzer. :issue:`13641` by `Adrin Jalali`_. :mod:`sklearn.impute` From 05d0f606a1bd4f0e07730ef6a32ff4608d364021 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 22 Apr 2019 10:39:06 +0200 Subject: [PATCH 14/16] fix tests --- sklearn/feature_extraction/tests/test_text.py | 57 ++++++++++--------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index cf8b83fd81dcd..cef68df216f05 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1201,42 +1201,43 @@ def build_preprocessor(self): @pytest.mark.parametrize('Estimator', [CountVectorizer, TfidfVectorizer, HashingVectorizer]) -def test_callable_analyzer_vs_file_input(tmpdir, Estimator): +@pytest.mark.parametrize( + 'input_type, err_type, err_msg', + [('filename', FileNotFoundError, ''), + ('file', AttributeError, "'str' object has no attribute 'read'")] +) +def test_callable_analyzer_error(Estimator, input_type, err_type, err_msg): data = ['this is text, not file or filename'] - - with pytest.raises(FileNotFoundError): + with pytest.raises(err_type, match=err_msg): Estimator(analyzer=lambda x: x.split(), - input='filename').fit_transform(data) + input=input_type).fit_transform(data) - with pytest.raises(AttributeError, - match="'str' object has no attribute 'read'"): - Estimator(analyzer=lambda x: x.split(), - input='file').fit_transform(data) - - # check if the ChangedBehaviorWarning is raised if the given analyzer - # expects a file or a file name. - def analyzer1(doc): - open(doc, 'r') - - def analyzer2(doc): - doc.read() - - for analyzer in [analyzer1, analyzer2]: - for input_type in ['file', 'filename']: - try: - with pytest.warns(ChangedBehaviorWarning, - match="Since v0.21, vectorizer"): - Estimator(analyzer=analyzer, - input=input_type).fit_transform(data) - except Exception: - pass +@pytest.mark.parametrize('Estimator', + [CountVectorizer, TfidfVectorizer, HashingVectorizer]) +@pytest.mark.parametrize( + 'analyzer', [lambda doc: open(doc, 'r'), lambda doc: doc.read()] +) +@pytest.mark.parametrize('input_type', ['file', 'filename']) +def test_callable_analyzer_change_behavior(Estimator, analyzer, input_type): + data = ['this is text, not file or filename'] + warn_msg = 'Since v0.21, vectorizer' + with pytest.raises((FileNotFoundError, AttributeError)): + with pytest.warns(ChangedBehaviorWarning, match=warn_msg) as records: + Estimator(analyzer=analyzer, input=input_type).fit_transform(data) + assert len(records) == 1 + assert warn_msg in str(records[0]) + + +@pytest.mark.parametrize('Estimator', + [CountVectorizer, TfidfVectorizer, HashingVectorizer]) +def test_callable_analyzer_reraise_error(tmpdir, Estimator): # check if a custom exception from the analyzer is shown to the user - def analyzer3(doc): + def analyzer(doc): raise Exception("testing") f = tmpdir.join("file.txt") f.write("sample content\n") with pytest.raises(Exception, match="testing"): - Estimator(analyzer=analyzer3, input='file').fit_transform([f]) + Estimator(analyzer=analyzer, input='file').fit_transform([f]) From dbe2bff8c9f097a196b821df1de3528b4f559d1e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 22 Apr 2019 10:41:00 +0200 Subject: [PATCH 15/16] add versionchanged --- sklearn/feature_extraction/text.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 0af6645472baa..007e158f3a449 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -513,9 +513,12 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin, TransformerMixin): word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features - out of the raw, unprocessed input. Since v0.21, if ``input`` is - ``filename`` or ``file``, the data is first read from the file and then - passed to the given callable analyzer. + out of the raw, unprocessed input. + + .. versionchanged:: 0.21 + Since v0.21, if ``input`` is ``filename`` or ``file``, the data is + first read from the file and then passed to the given callable + analyzer. n_features : integer, default=(2 ** 20) The number of features (columns) in the output matrices. Small numbers @@ -770,9 +773,12 @@ class CountVectorizer(BaseEstimator, VectorizerMixin): word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features - out of the raw, unprocessed input. Since v0.21, if ``input`` is - ``filename`` or ``file``, the data is first read from the file and then - passed to the given callable analyzer. + out of the raw, unprocessed input. + + .. versionchanged:: 0.21 + Since v0.21, if ``input`` is ``filename`` or ``file``, the data is + first read from the file and then passed to the given callable + analyzer. max_df : float in range [0.0, 1.0] or int, default=1.0 When building the vocabulary ignore terms that have a document @@ -1396,9 +1402,12 @@ class TfidfVectorizer(CountVectorizer): word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features - out of the raw, unprocessed input. Since v0.21, if ``input`` is - ``filename`` or ``file``, the data is first read from the file and then - passed to the given callable analyzer. + out of the raw, unprocessed input. + + .. versionchanged:: 0.21 + Since v0.21, if ``input`` is ``filename`` or ``file``, the data is + first read from the file and then passed to the given callable + analyzer. stop_words : string {'english'}, list, or None (default=None) If a string, it is passed to _check_stop_list and the appropriate stop From 77ee198c4f35d3431d61339131ebb0d90418fa4c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 22 Apr 2019 21:52:11 +0200 Subject: [PATCH 16/16] fix test --- sklearn/feature_extraction/tests/test_text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index cef68df216f05..bfd9f5f2f4ffe 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -1225,8 +1225,8 @@ def test_callable_analyzer_change_behavior(Estimator, analyzer, input_type): with pytest.raises((FileNotFoundError, AttributeError)): with pytest.warns(ChangedBehaviorWarning, match=warn_msg) as records: Estimator(analyzer=analyzer, input=input_type).fit_transform(data) - assert len(records) == 1 - assert warn_msg in str(records[0]) + assert len(records) == 1 + assert warn_msg in str(records[0]) @pytest.mark.parametrize('Estimator',