scikit-learn · jeremiedbb · Jan 3, 2023 · Dec 1, 2022 · Dec 1, 2022 · Dec 2, 2022
diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
@@ -73,6 +73,12 @@ Changelog
   :class:`ensemble.ExtraTreesClassifier` and :class:`ensemble.ExtraTreesRegressor`.
   :pr:`25177` by :user:`Tim Head <betatim>`.
 
+:mod:`sklearn.feature_extraction`
+.................................
+
+- |Fix| :class:`feature_extraction.FeatureHasher` raises an informative error
+  when the input is a list of strings. :pr:`25094` by `Thomas Fan`_.
+
 :mod:`sklearn.model_selection`
 ..............................
 - |Fix| :func:`model_selection.cross_validate` with multimetric scoring in

diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py
@@ -2,6 +2,7 @@
 # License: BSD 3 clause
 
 from numbers import Integral
+from itertools import chain
 
 import numpy as np
 import scipy.sparse as sp
@@ -80,6 +81,17 @@ class FeatureHasher(TransformerMixin, BaseEstimator):
     >>> f.toarray()
     array([[ 0.,  0., -4., -1.,  0.,  0.,  0.,  0.,  0.,  2.],
            [ 0.,  0.,  0., -2., -5.,  0.,  0.,  0.,  0.,  0.]])
+
+    With `input_type="string"`, the input must be an iterable over iterables of
+    strings:
+
+    >>> h = FeatureHasher(n_features=8, input_type="string")
+    >>> raw_X = [["dog", "cat", "snake"], ["snake", "dog"], ["cat", "bird"]]
+    >>> f = h.transform(raw_X)
+    >>> f.toarray()
+    array([[ 0.,  0.,  0., -1.,  0., -1.,  0.,  1.],
+           [ 0.,  0.,  0., -1.,  0., -1.,  0.,  0.],
+           [ 0., -1.,  0.,  0.,  0.,  0.,  0.,  1.]])
     """
 
     _parameter_constraints: dict = {
@@ -146,7 +158,15 @@ def transform(self, raw_X):
         if self.input_type == "dict":
             raw_X = (_iteritems(d) for d in raw_X)
         elif self.input_type == "string":
-            raw_X = (((f, 1) for f in x) for x in raw_X)
+            first_raw_X = next(raw_X)
+            if isinstance(first_raw_X, str):
+                raise ValueError(
+                    "Samples can not be a single string. The input must be an iterable"
+                    " over iterables of strings."
+                )
+            raw_X_ = chain([first_raw_X], raw_X)
+            raw_X = (((f, 1) for f in x) for x in raw_X_)
+
         indices, indptr, values = _hashing_transform(
             raw_X, self.n_features, self.dtype, self.alternate_sign, seed=0
         )

diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -43,6 +43,26 @@ def test_feature_hasher_strings():
         assert X.nnz == 6
 
 
+@pytest.mark.parametrize(
+    "raw_X",
+    [
+        ["my_string", "another_string"],
+        (x for x in ["my_string", "another_string"]),
+    ],
+    ids=["list", "generator"],
+)
+def test_feature_hasher_single_string(raw_X):
+    """Check transform raises ValueError when samples are bare strings.
+
+    Runs on a list and a generator input. Non-regression test for gh-13199.
+    """
+    msg = "Samples can not be a single string"  # prefix of the raised message
+
+    feature_hasher = FeatureHasher(n_features=10, input_type="string")
+    with pytest.raises(ValueError, match=msg):
+        feature_hasher.transform(raw_X)
+
+
 def test_hashing_transform_seed():
     # check the influence of the seed when computing the hashes
     raw_X = [