From 0a5021b86ed3d9b6c52f11798f1bbf17929f6e91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jochen=20Wersd=C3=B6rfer?=
Date: Sat, 27 Aug 2016 12:36:22 +0200
Subject: [PATCH 1/3] Improved memory usage in text vectorizers (PR #5122)

This update corresponds to the FastAupiffCountVectorizer implementation
from PR #5122
---
 sklearn/feature_extraction/text.py | 32 ++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index d291cc54d60b2..01ea27ce88783 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -676,6 +676,7 @@ def __init__(self, input='content', encoding='utf-8',
         self.vocabulary = vocabulary
         self.binary = binary
         self.dtype = dtype
+        self.chunksize = 10000
 
     def _sort_features(self, X, vocabulary):
         """Sort features by name
@@ -685,9 +686,15 @@ def _sort_features(self, X, vocabulary):
         sorted_features = sorted(six.iteritems(vocabulary))
         map_index = np.empty(len(sorted_features), dtype=np.int32)
         for new_val, (term, old_val) in enumerate(sorted_features):
-            map_index[new_val] = old_val
             vocabulary[term] = new_val
-        return X[:, map_index]
+            map_index[old_val] = new_val
+
+        # swap columns in place
+        indices = X.indices
+        for idx, val in enumerate(X.indices):
+            indices[idx] = map_index[val]
+        X.indices = indices
+        return X
 
     def _limit_features(self, X, vocabulary, high=None, low=None,
                         limit=None):
@@ -699,7 +706,10 @@ def _limit_features(self, X, vocabulary, high=None, low=None,
 
         This does not prune samples with zero features.
         """
-        if high is None and low is None and limit is None:
+        high_not_set = high is None or int(high) == X.shape[0]
+        low_not_set = low is None or int(low) == 1
+        limit_not_set = limit is None or int(limit) == X.shape[1]
+        if high_not_set and low_not_set and limit_not_set:
             return X, set()
 
         # Calculate a mask based on document frequencies
@@ -743,14 +753,24 @@ def _count_vocab(self, raw_documents, fixed_vocab):
         analyze = self.build_analyzer()
         j_indices = _make_int_array()
         indptr = _make_int_array()
+        values = _make_int_array()
         indptr.append(0)
         for doc in raw_documents:
+            feature_counter = {}
             for feature in analyze(doc):
                 try:
-                    j_indices.append(vocabulary[feature])
+                    feature_idx = vocabulary[feature]
+                    if feature_idx not in feature_counter:
+                        feature_counter[feature_idx] = 1
+                    else:
+                        feature_counter[feature_idx] += 1
                 except KeyError:
                     # Ignore out-of-vocabulary items for fixed_vocab=True
                     continue
+
+            j_indices.extend(feature_counter.keys())
+            values.extend(feature_counter.values())
+            del(feature_counter)
             indptr.append(len(j_indices))
 
         if not fixed_vocab:
@@ -762,12 +782,12 @@ def _count_vocab(self, raw_documents, fixed_vocab):
 
         j_indices = frombuffer_empty(j_indices, dtype=np.intc)
         indptr = np.frombuffer(indptr, dtype=np.intc)
-        values = np.ones(len(j_indices))
+        values = frombuffer_empty(values, dtype=np.intc)
 
         X = sp.csr_matrix((values, j_indices, indptr),
                           shape=(len(indptr) - 1, len(vocabulary)),
                           dtype=self.dtype)
-        X.sum_duplicates()
+        X.sort_indices()
         return vocabulary, X
 
     def fit(self, raw_documents, y=None):

From 1418392047968bb7d5a1ed4a5e523ec071b42adf Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Sun, 28 Aug 2016 09:19:22 +0200
Subject: [PATCH 2/3] Improved feature extraction performance (issue #5306)

---
 sklearn/feature_extraction/text.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 01ea27ce88783..0ff8640c999b4 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -676,7 +676,6 @@ def __init__(self, input='content', encoding='utf-8',
         self.vocabulary = vocabulary
         self.binary = binary
         self.dtype = dtype
-        self.chunksize = 10000
 
     def _sort_features(self, X, vocabulary):
         """Sort features by name
@@ -706,10 +705,7 @@ def _limit_features(self, X, vocabulary, high=None, low=None,
 
         This does not prune samples with zero features.
         """
-        high_not_set = high is None or int(high) == X.shape[0]
-        low_not_set = low is None or int(low) == 1
-        limit_not_set = limit is None or int(limit) == X.shape[1]
-        if high_not_set and low_not_set and limit_not_set:
+        if high is None and low is None and limit is None:
             return X, set()
 
         # Calculate a mask based on document frequencies
@@ -751,7 +747,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             vocabulary.default_factory = vocabulary.__len__
 
         analyze = self.build_analyzer()
-        j_indices = _make_int_array()
+        j_indices = []
         indptr = _make_int_array()
         values = _make_int_array()
         indptr.append(0)
@@ -780,7 +776,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             raise ValueError("empty vocabulary; perhaps the documents only"
                              " contain stop words")
 
-        j_indices = frombuffer_empty(j_indices, dtype=np.intc)
+        j_indices = np.asarray(j_indices, dtype=np.intc)
         indptr = np.frombuffer(indptr, dtype=np.intc)
         values = frombuffer_empty(values, dtype=np.intc)
 

From 039f7ebf31ab7334832e4751c43d70b6f53e40b6 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Thu, 8 Sep 2016 17:15:41 +0200
Subject: [PATCH 3/3] Text vectorizer: addressing review comments

---
 sklearn/feature_extraction/text.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 0ff8640c999b4..3bc2e8cb0ca3a 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -688,11 +688,7 @@ def _sort_features(self, X, vocabulary):
             vocabulary[term] = new_val
             map_index[old_val] = new_val
 
-        # swap columns in place
-        indices = X.indices
-        for idx, val in enumerate(X.indices):
-            indices[idx] = map_index[val]
-        X.indices = indices
+        X.indices = map_index.take(X.indices, mode='clip')
        return X
 
     def _limit_features(self, X, vocabulary, high=None, low=None,
@@ -766,7 +762,6 @@ def _count_vocab(self, raw_documents, fixed_vocab):
 
             j_indices.extend(feature_counter.keys())
             values.extend(feature_counter.values())
-            del(feature_counter)
             indptr.append(len(j_indices))
 
         if not fixed_vocab:
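
Note: taken together, the three patches make _count_vocab() count each term
once per document in a plain dict and build the CSR matrix directly from
(values, j_indices, indptr) followed by sort_indices(), instead of appending
one entry per token and relying on sum_duplicates(); _sort_features() then
remaps the column indices with a single vectorised take() call. Below is a
minimal standalone sketch of that strategy; the toy documents and variable
names are illustrative only, not part of the patches.

    import numpy as np
    import scipy.sparse as sp

    docs = [["pear", "apple", "apple"], ["plum", "pear"]]

    # Count each term once per document, as _count_vocab() does after patch 1.
    vocabulary = {}
    j_indices, values, indptr = [], [], [0]
    for doc in docs:
        feature_counter = {}
        for token in doc:
            idx = vocabulary.setdefault(token, len(vocabulary))
            feature_counter[idx] = feature_counter.get(idx, 0) + 1
        j_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        indptr.append(len(j_indices))

    # Build the CSR matrix in one shot; there are no duplicate entries left
    # to merge, so sorting the indices is enough.
    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=np.intc)
    X.sort_indices()

    # Sort features by name and remap the column indices with one vectorised
    # take(), as _sort_features() does after patch 3; scipy tolerates the
    # per-row index order this leaves behind.
    map_index = np.empty(len(vocabulary), dtype=np.int32)
    for new_val, (term, old_val) in enumerate(sorted(vocabulary.items())):
        vocabulary[term] = new_val
        map_index[old_val] = new_val
    X.indices = map_index.take(X.indices, mode='clip')

    print(vocabulary)    # {'pear': 1, 'apple': 0, 'plum': 2}
    print(X.toarray())   # [[2 1 0]
                         #  [0 1 1]]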