Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit e88baea

Browse files
rth authored and jnothman committed
TST Platform independent hash collision tests in FeatureHasher (#9710)
1 parent 4c61e8b commit e88baea

File tree

1 file changed

+24
-9
lines changed

1 file changed

+24
-9
lines changed

sklearn/feature_extraction/tests/test_feature_hasher.py

Lines changed: 24 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -112,30 +112,45 @@ def test_hasher_zeros():
112112

113113
@ignore_warnings(category=DeprecationWarning)
114114
def test_hasher_alternate_sign():
115-
# the last two tokens produce a hash collision that sums as 0
116-
X = [["foo", "bar", "baz", "investigation need", "records"]]
115+
X = [list("Thequickbrownfoxjumped")]
117116

118117
Xt = FeatureHasher(alternate_sign=True, non_negative=False,
119118
input_type='string').fit_transform(X)
120-
assert_true(Xt.data.min() < 0 and Xt.data.max() > 0)
121-
# check that we have a collision that produces a 0 count
122-
assert_true(len(Xt.data) < len(X[0]))
123-
assert_true((Xt.data == 0.).any())
119+
assert Xt.data.min() < 0 and Xt.data.max() > 0
124120

125121
Xt = FeatureHasher(alternate_sign=True, non_negative=True,
126122
input_type='string').fit_transform(X)
127-
assert_true((Xt.data >= 0).all()) # all counts are positive
128-
assert_true((Xt.data == 0.).any()) # we still have a collision
123+
assert Xt.data.min() > 0
124+
129125
Xt = FeatureHasher(alternate_sign=False, non_negative=True,
130126
input_type='string').fit_transform(X)
131-
assert_true((Xt.data > 0).all()) # strictly positive counts
127+
assert Xt.data.min() > 0
132128
Xt_2 = FeatureHasher(alternate_sign=False, non_negative=False,
133129
input_type='string').fit_transform(X)
134130
# With initially positive features, the non_negative option should
135131
# have no impact when alternate_sign=False
136132
assert_array_equal(Xt.data, Xt_2.data)
137133

138134

135+
@ignore_warnings(category=DeprecationWarning)
136+
def test_hash_collisions():
137+
X = [list("Thequickbrownfoxjumped")]
138+
139+
Xt = FeatureHasher(alternate_sign=True, non_negative=False,
140+
n_features=1, input_type='string').fit_transform(X)
141+
# check that some of the hashed tokens are added
142+
# with an opposite sign and cancel out
143+
assert abs(Xt.data[0]) < len(X[0])
144+
145+
Xt = FeatureHasher(alternate_sign=True, non_negative=True,
146+
n_features=1, input_type='string').fit_transform(X)
147+
assert abs(Xt.data[0]) < len(X[0])
148+
149+
Xt = FeatureHasher(alternate_sign=False, non_negative=True,
150+
n_features=1, input_type='string').fit_transform(X)
151+
assert Xt.data[0] == len(X[0])
152+
153+
139154
@ignore_warnings(category=DeprecationWarning)
140155
def test_hasher_negative():
141156
X = [{"foo": 2, "bar": -4, "baz": -1}.items()]

0 commit comments

Comments
 (0)