18 changes: 18 additions & 0 deletions doc/whats_new/v1.5.rst
@@ -22,6 +22,24 @@ Version 1.5.0

**In Development**

Security
--------

- |Fix| :class:`feature_extraction.text.CountVectorizer` and
:class:`feature_extraction.text.TfidfVectorizer` no longer store discarded
tokens from the training set in their `stop_words_` attribute. This attribute
held tokens that were too frequent (above `max_df`) as well as tokens that were
too rare (below `min_df`). This fixes a potential security issue (data leak):
the discarded rare tokens could hold sensitive information from the training
set without the model developer's knowledge.

Note: users of those classes are encouraged either to retrain their pipelines
with the new scikit-learn version or to manually clear the `stop_words_`
attribute from previously trained instances of those transformers (see the
sketch below). This attribute was designed only for model inspection purposes
and has no impact on the behavior of the transformers.
:pr:`28823` by :user:`Olivier Grisel <ogrisel>`.
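
Clearing the attribute by hand is safe because `transform` never reads it. A
minimal sketch of that cleanup, assuming a vectorizer fitted with a pre-1.5
release so that `stop_words_` is still present::

    from sklearn.feature_extraction.text import CountVectorizer

    docs = ["the pizza beer", "the pizza salad", "the beer salad"]
    # Assumption for this sketch: fitted with scikit-learn < 1.5, so the
    # discarded tokens (here "the") were stored in stop_words_.
    vect = CountVectorizer(max_df=0.7).fit(docs)

    # Drop the discarded tokens before pickling the estimator again;
    # transform() and downstream predictions are unaffected.
    if hasattr(vect, "stop_words_"):
        del vect.stop_words_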

Changed models
--------------

42 changes: 0 additions & 42 deletions sklearn/feature_extraction/tests/test_text.py
@@ -756,21 +756,11 @@ def test_feature_names():
@pytest.mark.parametrize("Vectorizer", (CountVectorizer, TfidfVectorizer))
def test_vectorizer_max_features(Vectorizer):
expected_vocabulary = {"burger", "beer", "salad", "pizza"}
expected_stop_words = {
"celeri",
"tomato",
"copyright",
"coke",
"sparkling",
"water",
"the",
}

# test bounded number of extracted features
vectorizer = Vectorizer(max_df=0.6, max_features=4)
vectorizer.fit(ALL_FOOD_DOCS)
assert set(vectorizer.vocabulary_) == expected_vocabulary
assert vectorizer.stop_words_ == expected_stop_words


def test_count_vectorizer_max_features():
@@ -805,21 +795,16 @@ def test_vectorizer_max_df():
vect.fit(test_data)
assert "a" in vect.vocabulary_.keys()
assert len(vect.vocabulary_.keys()) == 6
assert len(vect.stop_words_) == 0

vect.max_df = 0.5 # 0.5 * 3 documents -> max_doc_count == 1.5
vect.fit(test_data)
assert "a" not in vect.vocabulary_.keys() # {ae} ignored
assert len(vect.vocabulary_.keys()) == 4 # {bcdt} remain
assert "a" in vect.stop_words_
assert len(vect.stop_words_) == 2

vect.max_df = 1
vect.fit(test_data)
assert "a" not in vect.vocabulary_.keys() # {ae} ignored
assert len(vect.vocabulary_.keys()) == 4 # {bcdt} remain
assert "a" in vect.stop_words_
assert len(vect.stop_words_) == 2


def test_vectorizer_min_df():
@@ -828,21 +813,16 @@ def test_vectorizer_min_df():
vect.fit(test_data)
assert "a" in vect.vocabulary_.keys()
assert len(vect.vocabulary_.keys()) == 6
assert len(vect.stop_words_) == 0

vect.min_df = 2
vect.fit(test_data)
assert "c" not in vect.vocabulary_.keys() # {bcdt} ignored
assert len(vect.vocabulary_.keys()) == 2 # {ae} remain
assert "c" in vect.stop_words_
assert len(vect.stop_words_) == 4

vect.min_df = 0.8 # 0.8 * 3 documents -> min_doc_count == 2.4
vect.fit(test_data)
assert "c" not in vect.vocabulary_.keys() # {bcdet} ignored
assert len(vect.vocabulary_.keys()) == 1 # {a} remains
assert "c" in vect.stop_words_
assert len(vect.stop_words_) == 5


def test_count_binary_occurrences():
@@ -1155,28 +1135,6 @@ def test_countvectorizer_vocab_dicts_when_pickling():
)


def test_stop_words_removal():
# Ensure that deleting the stop_words_ attribute doesn't affect transform

fitted_vectorizers = (
TfidfVectorizer().fit(JUNK_FOOD_DOCS),
CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
)

for vect in fitted_vectorizers:
vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

vect.stop_words_ = None
stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

delattr(vect, "stop_words_")
stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

assert_array_equal(stop_None_transform, vect_transform)
assert_array_equal(stop_del_transform, vect_transform)


def test_pickling_transformer():
X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
orig = TfidfTransformer().fit(X)
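
The assertions removed above inspected the contents of `stop_words_` after
fitting. With this change the attribute is never set, so an equivalent check
reduces to a single `hasattr` test. A minimal sketch, assuming a scikit-learn
build that includes this patch::

    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    docs = ["beer beer pizza", "pizza salad water", "water water beer"]
    for Vectorizer in (CountVectorizer, TfidfVectorizer):
        # max_df=0.5 prunes terms seen in more than 1.5 of the 3 documents.
        vect = Vectorizer(max_df=0.5).fit(docs)
        # Discarded terms are no longer retained anywhere on the estimator.
        assert not hasattr(vect, "stop_words_")
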
36 changes: 2 additions & 34 deletions sklearn/feature_extraction/text.py
@@ -1079,15 +1079,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
True if a fixed vocabulary of term to indices mapping
is provided by the user.

stop_words_ : set
Terms that were ignored because they either:

- occurred in too many documents (`max_df`)
- occurred in too few documents (`min_df`)
- were cut off by feature selection (`max_features`).

This is only available if no vocabulary was given.

See Also
--------
HashingVectorizer : Convert a collection of text documents to a
@@ -1096,12 +1087,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
TfidfVectorizer : Convert a collection of raw documents to a matrix
of TF-IDF features.

Notes
-----
The ``stop_words_`` attribute can get large and increase the model size
when pickling. This attribute is provided only for introspection and can
be safely removed using delattr or set to None before pickling.

Examples
--------
>>> from sklearn.feature_extraction.text import CountVectorizer
@@ -1240,19 +1225,17 @@ def _limit_features(self, X, vocabulary, high=None, low=None, limit=None):
mask = new_mask

new_indices = np.cumsum(mask) - 1 # maps old indices to new
removed_terms = set()
for term, old_index in list(vocabulary.items()):
if mask[old_index]:
vocabulary[term] = new_indices[old_index]
else:
del vocabulary[term]
removed_terms.add(term)
kept_indices = np.where(mask)[0]
if len(kept_indices) == 0:
raise ValueError(
"After pruning, no terms remain. Try a lower min_df or a higher max_df."
)
return X[:, kept_indices], removed_terms
return X[:, kept_indices]

def _count_vocab(self, raw_documents, fixed_vocab):
"""Create sparse feature matrix, and vocabulary where fixed_vocab=False"""
@@ -1397,7 +1380,7 @@ def fit_transform(self, raw_documents, y=None):
raise ValueError("max_df corresponds to < documents than min_df")
if max_features is not None:
X = self._sort_features(X, vocabulary)
X, self.stop_words_ = self._limit_features(
X = self._limit_features(
X, vocabulary, max_doc_count, min_doc_count, max_features
)
if max_features is None:
@@ -1911,28 +1894,13 @@ class TfidfVectorizer(CountVectorizer):
The inverse document frequency (IDF) vector; only defined
if ``use_idf`` is True.

stop_words_ : set
Terms that were ignored because they either:

- occurred in too many documents (`max_df`)
- occurred in too few documents (`min_df`)
- were cut off by feature selection (`max_features`).

This is only available if no vocabulary was given.

See Also
--------
CountVectorizer : Transforms text into a sparse matrix of n-gram counts.

TfidfTransformer : Performs the TF-IDF transformation from a provided
matrix of counts.

Notes
-----
The ``stop_words_`` attribute can get large and increase the model size
when pickling. This attribute is provided only for introspection and can
be safely removed using delattr or set to None before pickling.

Examples
--------
>>> from sklearn.feature_extraction.text import TfidfVectorizer
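
The pruning behavior itself is unchanged: `max_df`, `min_df` and `max_features`
still remove terms from `vocabulary_`; the removed terms are simply no longer
kept on the estimator. A small sketch of the document-frequency threshold,
using made-up documents::

    from sklearn.feature_extraction.text import CountVectorizer

    docs = ["apple banana", "apple cherry", "apple banana cherry"]
    # "apple" occurs in all three documents; max_df=0.7 (2.1 documents) drops it.
    vect = CountVectorizer(max_df=0.7).fit(docs)
    print(sorted(vect.vocabulary_))  # ['banana', 'cherry']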