From 88d6eb0db2aa0255aa07e5ad3aef3c8ccf1948e0 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 1 Dec 2022 17:32:38 -0500 Subject: [PATCH 1/2] FIX Raises informative error in FeatureHasher when a sample is a single string --- doc/whats_new/v1.3.rst | 6 +++++ sklearn/feature_extraction/_hash.py | 22 ++++++++++++++++++- .../tests/test_feature_hasher.py | 20 +++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 6012ac7a336a4..acf15724b93c6 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -36,6 +36,12 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. +:mod:`sklearn.feature_extraction` +................................. + +- |Fix| :class:`feature_extraction.FeatureHasher` raises an informative error + when the input is a list of strings. :pr:`xxxxx` by `Thomas Fan`_. + Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index 223c3bb40d6dc..e687ff7229974 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -2,6 +2,7 @@ # License: BSD 3 clause from numbers import Integral +from itertools import chain import numpy as np import scipy.sparse as sp @@ -80,6 +81,17 @@ class FeatureHasher(TransformerMixin, BaseEstimator): >>> f.toarray() array([[ 0., 0., -4., -1., 0., 0., 0., 0., 0., 2.], [ 0., 0., 0., -2., -5., 0., 0., 0., 0., 0.]]) + + With `input_type="string"`, the input must be an iterable over iterables of + strings: + + >>> h = FeatureHasher(n_features=8, input_type="string") + >>> raw_X = [["dog", "cat", "snake"], ["snake", "dog"], ["cat", "bird"]] + >>> f = h.transform(raw_X) + >>> f.toarray() + array([[ 0., 0., 0., -1., 0., -1., 0., 1.], + [ 0., 0., 0., -1., 0., -1., 0., 0.], + [ 0., -1., 0., 0., 0., 0., 0., 1.]]) """ _parameter_constraints: dict = { @@ -146,7 +158,15 @@ def transform(self, raw_X): if self.input_type == "dict": raw_X = (_iteritems(d) for d in raw_X) elif self.input_type == "string": - raw_X = (((f, 1) for f in x) for x in raw_X) + first_raw_X = next(raw_X) + if isinstance(first_raw_X, str): + raise ValueError( + "Samples can not be a single string. The input must be an iterable" + " over iterables of strings." + ) + raw_X_ = chain([first_raw_X], raw_X) + raw_X = (((f, 1) for f in x) for x in raw_X_) + indices, indptr, values = _hashing_transform( raw_X, self.n_features, self.dtype, self.alternate_sign, seed=0 ) diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index d58acb06ead7f..b074620f8c029 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -43,6 +43,26 @@ def test_feature_hasher_strings(): assert X.nnz == 6 +@pytest.mark.parametrize( + "raw_X", + [ + ["my_string", "another_string"], + (x for x in ["my_string", "another_string"]), + ], + ids=["list", "generator"], +) +def test_feature_hasher_single_string(raw_X): + """FeatureHasher raises error when a sample is a single string. + + Non-regression test for gh-13199. + """ + msg = "Samples can not be a single string" + + feature_hasher = FeatureHasher(n_features=10, input_type="string") + with pytest.raises(ValueError, match=msg): + feature_hasher.transform(raw_X) + + def test_hashing_transform_seed(): # check the influence of the seed when computing the hashes raw_X = [ From 7c31e472c102347221cb6dd5a8ddd5a42dfcbfcc Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 1 Dec 2022 17:40:52 -0500 Subject: [PATCH 2/2] DOC Adds whats new number --- doc/whats_new/v1.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index acf15724b93c6..f6efdd8632e38 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -40,7 +40,7 @@ Changelog ................................. - |Fix| :class:`feature_extraction.FeatureHasher` raises an informative error - when the input is a list of strings. :pr:`xxxxx` by `Thomas Fan`_. + when the input is a list of strings. :pr:`25094` by `Thomas Fan`_. Code and Documentation Contributors -----------------------------------