Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/whats_new/v1.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ Changelog
:class:`ensemble.ExtraTreesClassifier` and :class:`ensemble.ExtraTreesRegressor`.
:pr:`25177` by :user:`Tim Head <betatim>`.

:mod:`sklearn.feature_extraction`
.................................

- |Fix| :class:`feature_extraction.FeatureHasher` raises an informative error
when the input is a list of strings. :pr:`25094` by `Thomas Fan`_.

:mod:`sklearn.model_selection`
..............................
- |Fix| :func:`model_selection.cross_validate` with multimetric scoring in
Expand Down
22 changes: 21 additions & 1 deletion sklearn/feature_extraction/_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# License: BSD 3 clause

from numbers import Integral
from itertools import chain

import numpy as np
import scipy.sparse as sp
Expand Down Expand Up @@ -80,6 +81,17 @@ class FeatureHasher(TransformerMixin, BaseEstimator):
>>> f.toarray()
array([[ 0., 0., -4., -1., 0., 0., 0., 0., 0., 2.],
[ 0., 0., 0., -2., -5., 0., 0., 0., 0., 0.]])

With `input_type="string"`, the input must be an iterable over iterables of
strings:

>>> h = FeatureHasher(n_features=8, input_type="string")
>>> raw_X = [["dog", "cat", "snake"], ["snake", "dog"], ["cat", "bird"]]
>>> f = h.transform(raw_X)
>>> f.toarray()
array([[ 0., 0., 0., -1., 0., -1., 0., 1.],
[ 0., 0., 0., -1., 0., -1., 0., 0.],
[ 0., -1., 0., 0., 0., 0., 0., 1.]])
"""

_parameter_constraints: dict = {
Expand Down Expand Up @@ -146,7 +158,15 @@ def transform(self, raw_X):
if self.input_type == "dict":
raw_X = (_iteritems(d) for d in raw_X)
elif self.input_type == "string":
raw_X = (((f, 1) for f in x) for x in raw_X)
first_raw_X = next(raw_X)
if isinstance(first_raw_X, str):
raise ValueError(
"Samples can not be a single string. The input must be an iterable"
" over iterables of strings."
)
raw_X_ = chain([first_raw_X], raw_X)
raw_X = (((f, 1) for f in x) for x in raw_X_)

indices, indptr, values = _hashing_transform(
raw_X, self.n_features, self.dtype, self.alternate_sign, seed=0
)
Expand Down
20 changes: 20 additions & 0 deletions sklearn/feature_extraction/tests/test_feature_hasher.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,26 @@ def test_feature_hasher_strings():
assert X.nnz == 6


@pytest.mark.parametrize(
    "raw_X",
    [
        pytest.param(["my_string", "another_string"], id="list"),
        pytest.param((x for x in ["my_string", "another_string"]), id="generator"),
    ],
)
def test_feature_hasher_single_string(raw_X):
    """Check that a bare string given as a sample is rejected with a ValueError.

    The input is supplied both as a list of strings and as a generator
    yielding strings; in each case every sample is a single string rather
    than an iterable of strings.

    Non-regression test for gh-13199.
    """
    expected_message = "Samples can not be a single string"
    hasher = FeatureHasher(n_features=10, input_type="string")

    with pytest.raises(ValueError, match=expected_message):
        hasher.transform(raw_X)


def test_hashing_transform_seed():
# check the influence of the seed when computing the hashes
raw_X = [
Expand Down