From ddcb64d5e943cd4cab8867ac5a8ca0b469e89cdc Mon Sep 17 00:00:00 2001
From: Claes-Fredrik Mannby
Date: Tue, 19 Jan 2016 13:08:27 -0800
Subject: [PATCH 1/3] Support new scipy sparse array indices, which can now be > 2^31 (< 2^63).

This is needed for very large training sets. Feature indices (based on
the number of distinct features) are unlikely to need more than 4 bytes
per value, however.
---
 sklearn/feature_extraction/text.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index b47569db51712..f7f22faf14ca7 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -744,8 +744,10 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             vocabulary.default_factory = vocabulary.__len__

         analyze = self.build_analyzer()
+        # j_indices stores feature indices, likely to be < 2^31
         j_indices = _make_int_array()
-        indptr = _make_int_array()
+        # indptr stores indices into j_indices, which can be large
+        indptr = _make_long_array()
         indptr.append(0)
         for doc in raw_documents:
             for feature in analyze(doc):
@@ -909,6 +911,10 @@ def _make_int_array():
     """Construct an array.array of a type suitable for scipy.sparse indices."""
     return array.array(str("i"))

+def _make_long_array():
+    """Construct an array.array of a type suitable for scipy.sparse indices (which now support 64-bit signed integers)."""
+    return array.array(str("l"))
+

 class TfidfTransformer(BaseEstimator, TransformerMixin):
     """Transform a count matrix to a normalized tf or tf-idf representation

From c75c0b80a03521be3abb96e0b0286784bec34dfe Mon Sep 17 00:00:00 2001
From: Claes-Fredrik Mannby
Date: Wed, 20 Jan 2016 14:13:12 -0800
Subject: [PATCH 2/3] Also increase the size of integer values in indptr in the next step.

---
 sklearn/feature_extraction/text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index f7f22faf14ca7..5dae83be5a698 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -766,7 +766,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
                              " contain stop words")

         j_indices = frombuffer_empty(j_indices, dtype=np.intc)
-        indptr = np.frombuffer(indptr, dtype=np.intc)
+        indptr = np.frombuffer(indptr, dtype=np.int_)
         values = np.ones(len(j_indices))

         X = sp.csr_matrix((values, j_indices, indptr),
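Aside (not part of the patch series): the overflow these first two patches guard against is easy to reproduce. Python's array typecode "i" maps to a C int (32 bits), so appending a cumulative count past 2**31 - 1 raises OverflowError, while "l" maps to a C long, which is 64 bits on most Unix platforms but only 32 bits on Windows, as the patch comments note. A minimal sketch of that behavior, assuming a 64-bit non-Windows platform and a NumPy of this era (where np.int_ is the C-long dtype):

```python
import array

import numpy as np

# Typecode "i" is a C int (32 bits). indptr holds cumulative counts of
# stored entries, so a corpus with more than 2**31 - 1 term occurrences
# overflows it.
indptr = array.array("i")
try:
    indptr.append(2 ** 31)  # one past the int32 maximum
except OverflowError as exc:
    print("32-bit indptr overflows:", exc)

# Typecode "l" is a C long: 64 bits on most Unix platforms, but only
# 32 bits on Windows (the caveat called out in the patch comments).
indptr = array.array("l")
indptr.append(2 ** 31)  # fits wherever long is 64 bits

# np.int_ matches C long, so it is the right dtype for reading the "l"
# buffer back; frombuffer_empty in the diff is sklearn's wrapper around
# np.frombuffer for old NumPy versions that reject empty buffers.
print(np.frombuffer(indptr, dtype=np.int_))
```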
From 3ec2503f1c7855beae4cd2460482afe6c047d596 Mon Sep 17 00:00:00 2001
From: Claes-Fredrik Mannby
Date: Thu, 28 Jan 2016 11:28:20 -0800
Subject: [PATCH 3/3] Use long for both arrays if scipy >= 0.14.

Tweak comments.
---
 sklearn/feature_extraction/text.py | 38 +++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 5dae83be5a698..96e3fef4c59b2 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -30,7 +30,7 @@
 from .hashing import FeatureHasher
 from .stop_words import ENGLISH_STOP_WORDS
 from ..utils import deprecated
-from ..utils.fixes import frombuffer_empty, bincount
+from ..utils.fixes import frombuffer_empty, bincount, sp_version
 from ..utils.validation import check_is_fitted

 __all__ = ['CountVectorizer',
@@ -744,10 +744,19 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             vocabulary.default_factory = vocabulary.__len__

         analyze = self.build_analyzer()
-        # j_indices stores feature indices, likely to be < 2^31
-        j_indices = _make_int_array()
-        # indptr stores indices into j_indices, which can be large
-        indptr = _make_long_array()
+        if sp_version >= (0, 14):
+            # We can use 64-bit indices
+            # NOTE: long on Windows is only 32 bits
+            # j_indices stores feature indices, likely to be < 2^31
+            j_indices = _make_long_array()
+            # indptr stores indices into j_indices, which can be large
+            indptr = _make_long_array()
+        else:
+            # Sparse arrays only support 32-bit integers
+            # j_indices stores feature indices, likely to be < 2^31
+            j_indices = _make_int_array()
+            # indptr stores indices into j_indices, which can be large
+            indptr = _make_int_array()
         indptr.append(0)
         for doc in raw_documents:
             for feature in analyze(doc):
@@ -765,8 +774,16 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             raise ValueError("empty vocabulary; perhaps the documents only"
                              " contain stop words")

-        j_indices = frombuffer_empty(j_indices, dtype=np.intc)
-        indptr = np.frombuffer(indptr, dtype=np.int_)
+        if sp_version >= (0, 14):
+            # We can use 64-bit indices
+            # int_ == "l" (long)
+            # NOTE: long on Windows is only 32 bits
+            j_indices = frombuffer_empty(j_indices, dtype=np.int_)
+            indptr = np.frombuffer(indptr, dtype=np.int_)
+        else:
+            # Sparse arrays only support 32-bit integers
+            j_indices = frombuffer_empty(j_indices, dtype=np.intc)
+            indptr = np.frombuffer(indptr, dtype=np.intc)
         values = np.ones(len(j_indices))

         X = sp.csr_matrix((values, j_indices, indptr),
@@ -912,7 +929,12 @@ def _make_int_array():
     return array.array(str("i"))

 def _make_long_array():
-    """Construct an array.array of a type suitable for scipy.sparse indices (which now support 64-bit signed integers)."""
+    """Construct an array.array of a type suitable for large scipy.sparse indices.
+
+    scipy 0.14 and later can construct sparse matrices with 64-bit integer indices.
+
+    NOTE: long on Windows is only 32 bits
+    """
     return array.array(str("l"))
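Aside (not part of the patch series): a quick sanity check of the net effect. The corpus below is illustrative, and note that scipy >= 0.14, to my understanding, accepts 64-bit index arrays but downcasts the index dtype to int32 whenever the values fit, so a small matrix still reports int32 indices; int64 only persists once the number of stored entries outgrows the int32 range.

```python
import scipy
from sklearn.feature_extraction.text import CountVectorizer

# A toy corpus; vectorizing it runs through the _count_vocab path patched above.
docs = ["the quick brown fox", "jumps over the lazy dog"]
X = CountVectorizer().fit_transform(docs)

# With scipy >= 0.14 the csr_matrix constructor accepts the 64-bit ("l")
# buffers built in _count_vocab, then picks the narrowest index dtype that
# fits, so this tiny matrix typically reports int32 for both arrays.
print(scipy.__version__, X.indptr.dtype, X.indices.dtype)
```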