From 874ed30cba760775b03251100fa6a17fddfca07c Mon Sep 17 00:00:00 2001
From: Jaye Doepke <jtdoepke12@gmail.com>
Date: Sat, 1 Oct 2016 17:53:24 -0500
Subject: [PATCH 1/3] Improve ngram performance - method binding

* Improve ngram performance by binding methods outside the loop.
---
 sklearn/feature_extraction/text.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index b9d09443832b5..a90603e09f99c 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -135,10 +135,15 @@ def _word_ngrams(self, tokens, stop_words=None):
             original_tokens = tokens
             tokens = []
             n_original_tokens = len(original_tokens)
+
+            # bind method outside of loop to reduce overhead
+            tokens_append = tokens.append
+            space_join = " ".join
+
             for n in xrange(min_n,
                             min(max_n + 1, n_original_tokens + 1)):
                 for i in xrange(n_original_tokens - n + 1):
-                    tokens.append(" ".join(original_tokens[i: i + n]))
+                    tokens_append(space_join(original_tokens[i: i + n]))
 
         return tokens
 
@@ -150,9 +155,13 @@ def _char_ngrams(self, text_document):
         text_len = len(text_document)
         ngrams = []
         min_n, max_n = self.ngram_range
+
+        # bind method outside of loop to reduce overhead
+        ngrams_append = ngrams.append
+
         for n in xrange(min_n, min(max_n + 1, text_len + 1)):
             for i in xrange(text_len - n + 1):
-                ngrams.append(text_document[i: i + n])
+                ngrams_append(text_document[i: i + n])
         return ngrams
 
     def _char_wb_ngrams(self, text_document):
@@ -165,15 +174,19 @@ def _char_wb_ngrams(self, text_document):
 
         min_n, max_n = self.ngram_range
         ngrams = []
+
+        # bind method outside of loop to reduce overhead
+        ngrams_append = ngrams.append
+
         for w in text_document.split():
             w = ' ' + w + ' '
             w_len = len(w)
             for n in xrange(min_n, max_n + 1):
                 offset = 0
-                ngrams.append(w[offset:offset + n])
+                ngrams_append(w[offset:offset + n])
                 while offset + n < w_len:
                     offset += 1
-                    ngrams.append(w[offset:offset + n])
+                    ngrams_append(w[offset:offset + n])
                 if offset == 0:   # count a short word (w_len < n) only once
                     break
         return ngrams

From ac62e1c0c4f563e29f2fb6f4653f107345e05345 Mon Sep 17 00:00:00 2001
From: Jaye Doepke <jtdoepke12@gmail.com>
Date: Sat, 1 Oct 2016 18:08:32 -0500
Subject: [PATCH 2/3] Improve ngram performance - unigram list

* Create unigrams without slicing.
---
 sklearn/feature_extraction/text.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index a90603e09f99c..3c66046e30659 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -133,7 +133,12 @@ def _word_ngrams(self, tokens, stop_words=None):
         min_n, max_n = self.ngram_range
         if max_n != 1:
             original_tokens = tokens
-            tokens = []
+            if min_n == 1:
+                tokens = list(original_tokens)
+                min_n += 1
+            else:
+                tokens = []
+
             n_original_tokens = len(original_tokens)
 
             # bind method outside of loop to reduce overhead
@@ -153,8 +158,12 @@ def _char_ngrams(self, text_document):
         text_document = self._white_spaces.sub(" ", text_document)
 
         text_len = len(text_document)
-        ngrams = []
         min_n, max_n = self.ngram_range
+        if min_n == 1:
+            ngrams = list(text_document)
+            min_n += 1
+        else:
+            ngrams = []
 
         # bind method outside of loop to reduce overhead
         ngrams_append = ngrams.append

From f7c444169c8e313e5f805597cf9d79dd8780480f Mon Sep 17 00:00:00 2001
From: Jaye Doepke <jtdoepke12@gmail.com>
Date: Tue, 4 Oct 2016 07:11:28 -0500
Subject: [PATCH 3/3] Improve ngram performance - code comment

* Add code comments explaining the use of list() for unigrams.
---
 sklearn/feature_extraction/text.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 3c66046e30659..ebd44703887c4 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -134,6 +134,8 @@ def _word_ngrams(self, tokens, stop_words=None):
         if max_n != 1:
             original_tokens = tokens
             if min_n == 1:
+                # no need to do any slicing for unigrams:
+                # each original token is already a unigram, so copy the list
                 tokens = list(original_tokens)
                 min_n += 1
             else:
@@ -160,6 +162,8 @@ def _char_ngrams(self, text_document):
         text_len = len(text_document)
         min_n, max_n = self.ngram_range
         if min_n == 1:
+            # no need to do any slicing for unigrams:
+            # each character of the string is already a unigram
             ngrams = list(text_document)
             min_n += 1
         else: