Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 67e371d

Browse files
thomasjpfan authored and jjerphan committed
FIX Raises informative error in FeatureHasher when a sample is a single string (scikit-learn#25094)
1 parent 61b55f0 commit 67e371d

File tree

2 files changed

+41
-1
lines changed

2 files changed

+41
-1
lines changed

sklearn/feature_extraction/_hash.py

Lines changed: 21 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
# License: BSD 3 clause
33

44
from numbers import Integral
5+
from itertools import chain
56

67
import numpy as np
78
import scipy.sparse as sp
@@ -80,6 +81,17 @@ class FeatureHasher(TransformerMixin, BaseEstimator):
8081
>>> f.toarray()
8182
array([[ 0., 0., -4., -1., 0., 0., 0., 0., 0., 2.],
8283
[ 0., 0., 0., -2., -5., 0., 0., 0., 0., 0.]])
84+
85+
With `input_type="string"`, the input must be an iterable over iterables of
86+
strings:
87+
88+
>>> h = FeatureHasher(n_features=8, input_type="string")
89+
>>> raw_X = [["dog", "cat", "snake"], ["snake", "dog"], ["cat", "bird"]]
90+
>>> f = h.transform(raw_X)
91+
>>> f.toarray()
92+
array([[ 0., 0., 0., -1., 0., -1., 0., 1.],
93+
[ 0., 0., 0., -1., 0., -1., 0., 0.],
94+
[ 0., -1., 0., 0., 0., 0., 0., 1.]])
8395
"""
8496

8597
_parameter_constraints: dict = {
@@ -146,7 +158,15 @@ def transform(self, raw_X):
146158
if self.input_type == "dict":
147159
raw_X = (_iteritems(d) for d in raw_X)
148160
elif self.input_type == "string":
149-
raw_X = (((f, 1) for f in x) for x in raw_X)
161+
first_raw_X = next(raw_X)
162+
if isinstance(first_raw_X, str):
163+
raise ValueError(
164+
"Samples can not be a single string. The input must be an iterable"
165+
" over iterables of strings."
166+
)
167+
raw_X_ = chain([first_raw_X], raw_X)
168+
raw_X = (((f, 1) for f in x) for x in raw_X_)
169+
150170
indices, indptr, values = _hashing_transform(
151171
raw_X, self.n_features, self.dtype, self.alternate_sign, seed=0
152172
)

sklearn/feature_extraction/tests/test_feature_hasher.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,26 @@ def test_feature_hasher_strings():
4343
assert X.nnz == 6
4444

4545

46+
@pytest.mark.parametrize(
47+
"raw_X",
48+
[
49+
["my_string", "another_string"],
50+
(x for x in ["my_string", "another_string"]),
51+
],
52+
ids=["list", "generator"],
53+
)
54+
def test_feature_hasher_single_string(raw_X):
55+
"""FeatureHasher raises error when a sample is a single string.
56+
57+
Non-regression test for gh-13199.
58+
"""
59+
msg = "Samples can not be a single string"
60+
61+
feature_hasher = FeatureHasher(n_features=10, input_type="string")
62+
with pytest.raises(ValueError, match=msg):
63+
feature_hasher.transform(raw_X)
64+
65+
4666
def test_hashing_transform_seed():
4767
# check the influence of the seed when computing the hashes
4868
raw_X = [

0 commit comments

Comments
 (0)