From 874ed30cba760775b03251100fa6a17fddfca07c Mon Sep 17 00:00:00 2001 From: Jaye Doepke Date: Sat, 1 Oct 2016 17:53:24 -0500 Subject: [PATCH 1/3] Improve ngram performance - method binding * Improve ngram performance by binding methods outside the loop. --- sklearn/feature_extraction/text.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index b9d09443832b5..a90603e09f99c 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -135,10 +135,15 @@ def _word_ngrams(self, tokens, stop_words=None): original_tokens = tokens tokens = [] n_original_tokens = len(original_tokens) + + # bind method outside of loop to reduce overhead + tokens_append = tokens.append + space_join = " ".join + for n in xrange(min_n, min(max_n + 1, n_original_tokens + 1)): for i in xrange(n_original_tokens - n + 1): - tokens.append(" ".join(original_tokens[i: i + n])) + tokens_append(space_join(original_tokens[i: i + n])) return tokens @@ -150,9 +155,13 @@ def _char_ngrams(self, text_document): text_len = len(text_document) ngrams = [] min_n, max_n = self.ngram_range + + # bind method outside of loop to reduce overhead + ngrams_append = ngrams.append + for n in xrange(min_n, min(max_n + 1, text_len + 1)): for i in xrange(text_len - n + 1): - ngrams.append(text_document[i: i + n]) + ngrams_append(text_document[i: i + n]) return ngrams def _char_wb_ngrams(self, text_document): @@ -165,15 +174,19 @@ def _char_wb_ngrams(self, text_document): min_n, max_n = self.ngram_range ngrams = [] + + # bind method outside of loop to reduce overhead + ngrams_append = ngrams.append + for w in text_document.split(): w = ' ' + w + ' ' w_len = len(w) for n in xrange(min_n, max_n + 1): offset = 0 - ngrams.append(w[offset:offset + n]) + ngrams_append(w[offset:offset + n]) while offset + n < w_len: offset += 1 - ngrams.append(w[offset:offset + n]) + ngrams_append(w[offset:offset + n]) if offset == 0: # count a short word (w_len < n) only once break return ngrams From ac62e1c0c4f563e29f2fb6f4653f107345e05345 Mon Sep 17 00:00:00 2001 From: Jaye Doepke Date: Sat, 1 Oct 2016 18:08:32 -0500 Subject: [PATCH 2/3] Improve ngram performance - unigram list * Create unigrams without slicing. --- sklearn/feature_extraction/text.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index a90603e09f99c..3c66046e30659 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -133,7 +133,12 @@ def _word_ngrams(self, tokens, stop_words=None): min_n, max_n = self.ngram_range if max_n != 1: original_tokens = tokens - tokens = [] + if min_n == 1: + tokens = list(original_tokens) + min_n += 1 + else: + tokens = [] + n_original_tokens = len(original_tokens) # bind method outside of loop to reduce overhead @@ -153,8 +158,12 @@ def _char_ngrams(self, text_document): text_document = self._white_spaces.sub(" ", text_document) text_len = len(text_document) - ngrams = [] min_n, max_n = self.ngram_range + if min_n == 1: + ngrams = list(text_document) + min_n += 1 + else: + ngrams = [] # bind method outside of loop to reduce overhead ngrams_append = ngrams.append From f7c444169c8e313e5f805597cf9d79dd8780480f Mon Sep 17 00:00:00 2001 From: Jaye Doepke Date: Tue, 4 Oct 2016 07:11:28 -0500 Subject: [PATCH 3/3] Improve ngram performance - code comment * Added code comment to explain using list() for unigrams. --- sklearn/feature_extraction/text.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 3c66046e30659..ebd44703887c4 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -134,6 +134,8 @@ def _word_ngrams(self, tokens, stop_words=None): if max_n != 1: original_tokens = tokens if min_n == 1: + # no need to do any slicing for unigrams + # just iterate through the original tokens tokens = list(original_tokens) min_n += 1 else: @@ -160,6 +162,8 @@ def _char_ngrams(self, text_document): text_len = len(text_document) min_n, max_n = self.ngram_range if min_n == 1: + # no need to do any slicing for unigrams + # iterate through the string ngrams = list(text_document) min_n += 1 else: