From 0a5021b86ed3d9b6c52f11798f1bbf17929f6e91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jochen=20Wersd=C3=B6rfer?=
Date: Sat, 27 Aug 2016 12:36:22 +0200
Subject: [PATCH 1/3] Improved memory usage in text vectorizers (PR #5122)

This update corresponds to the FastAupiffCountVectorizer implementation
from PR #5122
---
 sklearn/feature_extraction/text.py | 32 ++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index d291cc54d60b2..01ea27ce88783 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -676,6 +676,7 @@ def __init__(self, input='content', encoding='utf-8',
         self.vocabulary = vocabulary
         self.binary = binary
         self.dtype = dtype
+        self.chunksize = 10000
 
     def _sort_features(self, X, vocabulary):
         """Sort features by name
@@ -685,9 +686,15 @@ def _sort_features(self, X, vocabulary):
         sorted_features = sorted(six.iteritems(vocabulary))
         map_index = np.empty(len(sorted_features), dtype=np.int32)
         for new_val, (term, old_val) in enumerate(sorted_features):
-            map_index[new_val] = old_val
             vocabulary[term] = new_val
-        return X[:, map_index]
+            map_index[old_val] = new_val
+
+        # swap columns in place
+        indices = X.indices
+        for idx, val in enumerate(X.indices):
+            indices[idx] = map_index[val]
+        X.indices = indices
+        return X
 
     def _limit_features(self, X, vocabulary, high=None, low=None,
                         limit=None):
@@ -699,7 +706,10 @@ def _limit_features(self, X, vocabulary, high=None, low=None,
 
         This does not prune samples with zero features.
         """
-        if high is None and low is None and limit is None:
+        high_not_set = high is None or int(high) == X.shape[0]
+        low_not_set = low is None or int(low) == 1
+        limit_not_set = limit is None or int(limit) == X.shape[1]
+        if high_not_set and low_not_set and limit_not_set:
             return X, set()
 
         # Calculate a mask based on document frequencies
@@ -743,14 +753,24 @@ def _count_vocab(self, raw_documents, fixed_vocab):
         analyze = self.build_analyzer()
         j_indices = _make_int_array()
         indptr = _make_int_array()
+        values = _make_int_array()
         indptr.append(0)
         for doc in raw_documents:
+            feature_counter = {}
             for feature in analyze(doc):
                 try:
-                    j_indices.append(vocabulary[feature])
+                    feature_idx = vocabulary[feature]
+                    if feature_idx not in feature_counter:
+                        feature_counter[feature_idx] = 1
+                    else:
+                        feature_counter[feature_idx] += 1
                 except KeyError:
                     # Ignore out-of-vocabulary items for fixed_vocab=True
                     continue
+
+            j_indices.extend(feature_counter.keys())
+            values.extend(feature_counter.values())
+            del(feature_counter)
             indptr.append(len(j_indices))
 
         if not fixed_vocab:
@@ -762,12 +782,12 @@ def _count_vocab(self, raw_documents, fixed_vocab):
 
         j_indices = frombuffer_empty(j_indices, dtype=np.intc)
         indptr = np.frombuffer(indptr, dtype=np.intc)
-        values = np.ones(len(j_indices))
+        values = frombuffer_empty(values, dtype=np.intc)
 
         X = sp.csr_matrix((values, j_indices, indptr),
                           shape=(len(indptr) - 1, len(vocabulary)),
                           dtype=self.dtype)
-        X.sum_duplicates()
+        X.sort_indices()
         return vocabulary, X
 
     def fit(self, raw_documents, y=None):

From 1418392047968bb7d5a1ed4a5e523ec071b42adf Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Sun, 28 Aug 2016 09:19:22 +0200
Subject: [PATCH 2/3] Improved feature extraction performance (issue #5306)

---
 sklearn/feature_extraction/text.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 01ea27ce88783..0ff8640c999b4 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -676,7 +676,6 @@ def __init__(self, input='content', encoding='utf-8',
         self.vocabulary = vocabulary
         self.binary = binary
         self.dtype = dtype
-        self.chunksize = 10000
 
     def _sort_features(self, X, vocabulary):
         """Sort features by name
@@ -706,10 +705,7 @@ def _limit_features(self, X, vocabulary, high=None, low=None,
 
         This does not prune samples with zero features.
         """
-        high_not_set = high is None or int(high) == X.shape[0]
-        low_not_set = low is None or int(low) == 1
-        limit_not_set = limit is None or int(limit) == X.shape[1]
-        if high_not_set and low_not_set and limit_not_set:
+        if high is None and low is None and limit is None:
             return X, set()
 
         # Calculate a mask based on document frequencies
@@ -751,7 +747,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             vocabulary.default_factory = vocabulary.__len__
 
         analyze = self.build_analyzer()
-        j_indices = _make_int_array()
+        j_indices = []
         indptr = _make_int_array()
         values = _make_int_array()
         indptr.append(0)
@@ -780,7 +776,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             raise ValueError("empty vocabulary; perhaps the documents only"
                              " contain stop words")
 
-        j_indices = frombuffer_empty(j_indices, dtype=np.intc)
+        j_indices = np.asarray(j_indices, dtype=np.intc)
         indptr = np.frombuffer(indptr, dtype=np.intc)
         values = frombuffer_empty(values, dtype=np.intc)
 

From 039f7ebf31ab7334832e4751c43d70b6f53e40b6 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Thu, 8 Sep 2016 17:15:41 +0200
Subject: [PATCH 3/3] Text vectorizer: addressing review comments

---
 sklearn/feature_extraction/text.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 0ff8640c999b4..3bc2e8cb0ca3a 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -688,11 +688,7 @@ def _sort_features(self, X, vocabulary):
             vocabulary[term] = new_val
             map_index[old_val] = new_val
 
-        # swap columns in place
-        indices = X.indices
-        for idx, val in enumerate(X.indices):
-            indices[idx] = map_index[val]
-        X.indices = indices
+        X.indices = map_index.take(X.indices, mode='clip')
        return X
 
     def _limit_features(self, X, vocabulary, high=None, low=None,
@@ -766,7 +762,6 @@ def _count_vocab(self, raw_documents, fixed_vocab):
 
             j_indices.extend(feature_counter.keys())
             values.extend(feature_counter.values())
-            del(feature_counter)
             indptr.append(len(j_indices))
 
         if not fixed_vocab:
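
Note: taken together, the three patches make _count_vocab() count each term
once per document in a plain dict and build the CSR matrix directly from
(values, j_indices, indptr) followed by sort_indices(), instead of appending
one entry per token and relying on sum_duplicates(); _sort_features() then
remaps the column indices with a single vectorised take() call. Below is a
minimal standalone sketch of that strategy; the toy documents and variable
names are illustrative only, not part of the patches.

    import numpy as np
    import scipy.sparse as sp

    docs = [["pear", "apple", "apple"], ["plum", "pear"]]

    # Count each term once per document, as _count_vocab() does after patch 1.
    vocabulary = {}
    j_indices, values, indptr = [], [], [0]
    for doc in docs:
        feature_counter = {}
        for token in doc:
            idx = vocabulary.setdefault(token, len(vocabulary))
            feature_counter[idx] = feature_counter.get(idx, 0) + 1
        j_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        indptr.append(len(j_indices))

    # Build the CSR matrix in one shot; there are no duplicate entries left
    # to merge, so sorting the indices is enough.
    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=np.intc)
    X.sort_indices()

    # Sort features by name and remap the column indices with one vectorised
    # take(), as _sort_features() does after patch 3; scipy tolerates the
    # per-row index order this leaves behind.
    map_index = np.empty(len(vocabulary), dtype=np.int32)
    for new_val, (term, old_val) in enumerate(sorted(vocabulary.items())):
        vocabulary[term] = new_val
        map_index[old_val] = new_val
    X.indices = map_index.take(X.indices, mode='clip')

    print(vocabulary)    # {'pear': 1, 'apple': 0, 'plum': 2}
    print(X.toarray())   # [[2 1 0]
                         #  [0 1 1]]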