From abfcfb3fbe3dc5b50f05f8aee2595519fdd4530b Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 6 Oct 2021 12:04:45 +0200 Subject: [PATCH 1/5] vocab check for upper only in fit --- sklearn/feature_extraction/text.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index a0b74a60dab4d..257e10f2fdc99 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1202,17 +1202,6 @@ def _count_vocab(self, raw_documents, fixed_vocab): j_indices = [] indptr = [] - if self.lowercase: - for vocab in vocabulary: - if any(map(str.isupper, vocab)): - warnings.warn( - "Upper case characters found in" - " vocabulary while 'lowercase'" - " is True. These entries will not" - " be matched with any documents" - ) - break - values = _make_int_array() indptr.append(0) for doc in raw_documents: @@ -1318,6 +1307,17 @@ def fit_transform(self, raw_documents, y=None): min_df = self.min_df max_features = self.max_features + if self.fixed_vocabulary_ and self.lowercase: + for term in self.vocabulary: + if any(map(str.isupper, term)): + warnings.warn( + "Upper case characters found in" + " vocabulary while 'lowercase'" + " is True. These entries will not" + " be matched with any documents" + ) + break + vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_) if self.binary: From 9ad6e1fa4b1cafd72c116aef16ce40705aa4272f Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 7 Oct 2021 17:28:42 +0200 Subject: [PATCH 2/5] what's new entry --- doc/whats_new/v1.0.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 296d84735864c..752b3fb73a9a7 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -43,6 +43,14 @@ Fixed models between sparse and dense input. :pr:`21195` by :user:`Jérémie du Boisberranger `. 
+:mod:`sklearn.feature_extraction` +................................. + +- |Efficiency| Fixed an efficiency regression introduced in version 1.0.0 in the + `transform` method of :class:`feature_extraction.text.CountVectorizer` which no + longer checks for uppercase characters in the provided vocabulary. :pr:`21251` + by :user:`Jérémie du Boisberranger <jeremiedbb>`. + :mod:`sklearn.neighbors` ........................ From 76a6a862e1feb40d2db354940e503c746ebc225f Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 19 Oct 2021 10:39:16 +0200 Subject: [PATCH 3/5] cln --- doc/whats_new/v1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index fb681bcae61c2..21c150e49de37 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -50,6 +50,7 @@ Fixed models `transform` method of :class:`feature_extraction.text.CountVectorizer` which no longer checks for uppercase characters in the provided vocabulary. :pr:`21251` by :user:`Jérémie du Boisberranger <jeremiedbb>`. + :mod:`sklearn.linear_model` ........................... 
From bd1ff223190b9a879409cb85093a7a397996ec0e Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 20 Oct 2021 16:24:20 +0200 Subject: [PATCH 4/5] update existing test --- sklearn/feature_extraction/tests/test_text.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 6abd731b4559a..8dcba7dd5d0c1 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -436,7 +436,9 @@ def test_countvectorizer_custom_token_pattern_with_several_group(): def test_countvectorizer_uppercase_in_vocab(): - vocabulary = ["Sample", "Upper", "CaseVocabulary"] + # Check that the check for uppercase in the provided vocabulary is only done at fit + # time and not at transform time (#21251) + vocabulary = ["Sample", "Upper", "Case", "Vocabulary"] message = ( "Upper case characters found in" " vocabulary while 'lowercase'" @@ -445,8 +447,13 @@ def test_countvectorizer_uppercase_in_vocab(): ) vectorizer = CountVectorizer(lowercase=True, vocabulary=vocabulary) + with pytest.warns(UserWarning, match=message): - vectorizer.fit_transform(vocabulary) + vectorizer.fit(vocabulary) + + with pytest.warns(None) as record: + vectorizer.transform(vocabulary) + assert not record def test_tf_transformer_feature_names_out(): From cd04847f2b51b2aae0198a4dc5c4b5fce6efb3d9 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 20 Oct 2021 16:26:24 +0200 Subject: [PATCH 5/5] black --- sklearn/feature_extraction/tests/test_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 8dcba7dd5d0c1..da32e855fabb6 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -450,7 +450,7 @@ def test_countvectorizer_uppercase_in_vocab(): with 
pytest.warns(UserWarning, match=message): vectorizer.fit(vocabulary) - + with pytest.warns(None) as record: vectorizer.transform(vocabulary) assert not record