From 4cb6fd4788d1eea0e53ca9a41170015a5ca17837 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Thu, 22 Sep 2016 22:07:22 +0500 Subject: [PATCH 1/2] micro-optimize HashingVectorizer and FeatureHasher --- sklearn/feature_extraction/_hashing.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sklearn/feature_extraction/_hashing.pyx b/sklearn/feature_extraction/_hashing.pyx index 201082e94cbf3..fdbf21be752dc 100644 --- a/sklearn/feature_extraction/_hashing.pyx +++ b/sklearn/feature_extraction/_hashing.pyx @@ -8,8 +8,6 @@ from libc.stdlib cimport abs cimport numpy as np import numpy as np -from ..externals.six import string_types - from sklearn.utils.murmurhash cimport murmurhash3_bytes_s32 np.import_array() @@ -45,7 +43,7 @@ def transform(raw_X, Py_ssize_t n_features, dtype): for x in raw_X: for f, v in x: - if isinstance(v, string_types): + if isinstance(v, basestring): f = "%s%s%s" % (f, '=', v) value = 1 else: @@ -55,13 +53,13 @@ def transform(raw_X, Py_ssize_t n_features, dtype): continue if isinstance(f, unicode): - f = f.encode("utf-8") + f = (<unicode>f).encode("utf-8") # Need explicit type check because Murmurhash does not propagate # all exceptions. Add "except *" there?
elif not isinstance(f, bytes): raise TypeError("feature names must be strings") - h = murmurhash3_bytes_s32(f, 0) + h = murmurhash3_bytes_s32(<bytes>f, 0) array.resize_smart(indices, len(indices) + 1) indices[len(indices) - 1] = abs(h) % n_features From 9de996d478b9e37ca9803b2d76502f0abe07f318 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Thu, 22 Sep 2016 22:40:10 +0500 Subject: [PATCH 2/2] fix backwards compatibility for Cython < 0.20 --- sklearn/feature_extraction/_hashing.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/_hashing.pyx b/sklearn/feature_extraction/_hashing.pyx index fdbf21be752dc..39c2b10378132 100644 --- a/sklearn/feature_extraction/_hashing.pyx +++ b/sklearn/feature_extraction/_hashing.pyx @@ -43,7 +43,7 @@ def transform(raw_X, Py_ssize_t n_features, dtype): for x in raw_X: for f, v in x: - if isinstance(v, basestring): + if isinstance(v, (str, unicode)): f = "%s%s%s" % (f, '=', v) value = 1 else: