From ddcb64d5e943cd4cab8867ac5a8ca0b469e89cdc Mon Sep 17 00:00:00 2001
From: Claes-Fredrik Mannby
Date: Tue, 19 Jan 2016 13:08:27 -0800
Subject: [PATCH 1/3] Support new scipy sparse array indices, which can now be > 2^31 (< 2^63).

This is needed for very large training sets. Feature indices (based on
the number of distinct features) are unlikely to need more than 4 bytes
per value, however.
---
 sklearn/feature_extraction/text.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index b47569db51712..f7f22faf14ca7 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -744,8 +744,10 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             vocabulary.default_factory = vocabulary.__len__

         analyze = self.build_analyzer()
+        # j_indices stores feature indices, likely to be < 2^31
         j_indices = _make_int_array()
-        indptr = _make_int_array()
+        # indptr stores indices into j_indices, which can be large
+        indptr = _make_long_array()
         indptr.append(0)
         for doc in raw_documents:
             for feature in analyze(doc):
@@ -909,6 +911,10 @@ def _make_int_array():
     """Construct an array.array of a type suitable for scipy.sparse indices."""
     return array.array(str("i"))

+def _make_long_array():
+    """Construct an array.array of a type suitable for scipy.sparse indices (which now support 64-bit signed integers)."""
+    return array.array(str("l"))
+

 class TfidfTransformer(BaseEstimator, TransformerMixin):
     """Transform a count matrix to a normalized tf or tf-idf representation

From c75c0b80a03521be3abb96e0b0286784bec34dfe Mon Sep 17 00:00:00 2001
From: Claes-Fredrik Mannby
Date: Wed, 20 Jan 2016 14:13:12 -0800
Subject: [PATCH 2/3] Also increase the size of integer values in indptr in the next step.

---
 sklearn/feature_extraction/text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index f7f22faf14ca7..5dae83be5a698 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -766,7 +766,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
                              " contain stop words")

         j_indices = frombuffer_empty(j_indices, dtype=np.intc)
-        indptr = np.frombuffer(indptr, dtype=np.intc)
+        indptr = np.frombuffer(indptr, dtype=np.int_)
         values = np.ones(len(j_indices))

         X = sp.csr_matrix((values, j_indices, indptr),
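Aside (not part of the patch series): the overflow these first two patches guard against is easy to reproduce. Python's array typecode "i" maps to a C int (32 bits), so appending a cumulative count past 2**31 - 1 raises OverflowError, while "l" maps to a C long, which is 64 bits on most Unix platforms but only 32 bits on Windows, as the patch comments note. A minimal sketch of that behavior, assuming a 64-bit non-Windows platform and a NumPy of this era (where np.int_ is the C-long dtype):

```python
import array

import numpy as np

# Typecode "i" is a C int (32 bits). indptr holds cumulative counts of
# stored entries, so a corpus with more than 2**31 - 1 term occurrences
# overflows it.
indptr = array.array("i")
try:
    indptr.append(2 ** 31)  # one past the int32 maximum
except OverflowError as exc:
    print("32-bit indptr overflows:", exc)

# Typecode "l" is a C long: 64 bits on most Unix platforms, but only
# 32 bits on Windows (the caveat called out in the patch comments).
indptr = array.array("l")
indptr.append(2 ** 31)  # fits wherever long is 64 bits

# np.int_ matches C long, so it is the right dtype for reading the "l"
# buffer back; frombuffer_empty in the diff is sklearn's wrapper around
# np.frombuffer for old NumPy versions that reject empty buffers.
print(np.frombuffer(indptr, dtype=np.int_))
```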
From 3ec2503f1c7855beae4cd2460482afe6c047d596 Mon Sep 17 00:00:00 2001
From: Claes-Fredrik Mannby
Date: Thu, 28 Jan 2016 11:28:20 -0800
Subject: [PATCH 3/3] Use long for both arrays if scipy >= 0.14.

Tweak comments.
---
 sklearn/feature_extraction/text.py | 38 +++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 5dae83be5a698..96e3fef4c59b2 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -30,7 +30,7 @@
 from .hashing import FeatureHasher
 from .stop_words import ENGLISH_STOP_WORDS
 from ..utils import deprecated
-from ..utils.fixes import frombuffer_empty, bincount
+from ..utils.fixes import frombuffer_empty, bincount, sp_version
 from ..utils.validation import check_is_fitted

 __all__ = ['CountVectorizer',
@@ -744,10 +744,19 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             vocabulary.default_factory = vocabulary.__len__

         analyze = self.build_analyzer()
-        # j_indices stores feature indices, likely to be < 2^31
-        j_indices = _make_int_array()
-        # indptr stores indices into j_indices, which can be large
-        indptr = _make_long_array()
+        if sp_version >= (0, 14):
+            # We can use 64-bit indices
+            # NOTE: long on Windows is only 32 bits
+            # j_indices stores feature indices, likely to be < 2^31
+            j_indices = _make_long_array()
+            # indptr stores indices into j_indices, which can be large
+            indptr = _make_long_array()
+        else:
+            # Sparse arrays only support 32-bit integers
+            # j_indices stores feature indices, likely to be < 2^31
+            j_indices = _make_int_array()
+            # indptr stores indices into j_indices, which can be large
+            indptr = _make_int_array()
         indptr.append(0)
         for doc in raw_documents:
             for feature in analyze(doc):
@@ -765,8 +774,16 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             raise ValueError("empty vocabulary; perhaps the documents only"
                              " contain stop words")

-        j_indices = frombuffer_empty(j_indices, dtype=np.intc)
-        indptr = np.frombuffer(indptr, dtype=np.int_)
+        if sp_version >= (0, 14):
+            # We can use 64-bit indices
+            # int_ == "l" (long)
+            # NOTE: long on Windows is only 32 bits
+            j_indices = frombuffer_empty(j_indices, dtype=np.int_)
+            indptr = np.frombuffer(indptr, dtype=np.int_)
+        else:
+            # Sparse arrays only support 32-bit integers
+            j_indices = frombuffer_empty(j_indices, dtype=np.intc)
+            indptr = np.frombuffer(indptr, dtype=np.intc)
         values = np.ones(len(j_indices))

         X = sp.csr_matrix((values, j_indices, indptr),
@@ -912,7 +929,12 @@ def _make_int_array():
     return array.array(str("i"))

 def _make_long_array():
-    """Construct an array.array of a type suitable for scipy.sparse indices (which now support 64-bit signed integers)."""
+    """Construct an array.array of a type suitable for large scipy.sparse indices.
+
+    scipy 0.14 and later can construct sparse matrices with 64-bit integer indices.
+
+    NOTE: long on Windows is only 32 bits
+    """
     return array.array(str("l"))
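Aside (not part of the patch series): a quick sanity check of the net effect. The corpus below is illustrative, and note that scipy >= 0.14, to my understanding, accepts 64-bit index arrays but downcasts the index dtype to int32 whenever the values fit, so a small matrix still reports int32 indices; int64 only persists once the number of stored entries outgrows the int32 range.

```python
import scipy
from sklearn.feature_extraction.text import CountVectorizer

# A toy corpus; vectorizing it runs through the _count_vocab path patched above.
docs = ["the quick brown fox", "jumps over the lazy dog"]
X = CountVectorizer().fit_transform(docs)

# With scipy >= 0.14 the csr_matrix constructor accepts the 64-bit ("l")
# buffers built in _count_vocab, then picks the narrowest index dtype that
# fits, so this tiny matrix typically reports int32 for both arrays.
print(scipy.__version__, X.indptr.dtype, X.indices.dtype)
```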