From 2a95d1b92d598beed571f20e99fcf25ac994d70a Mon Sep 17 00:00:00 2001
From: Roman Yurchak <rth.yurchak@gmail.com>
Date: Thu, 31 Aug 2017 18:12:20 +0200
Subject: [PATCH] Add 64-bit index support to csr_row_norms and in-place L1/L2
 CSR row normalization

---
 sklearn/utils/sparsefuncs_fast.pyx      | 33 +++++++++++++------------
 sklearn/utils/tests/test_extmath.py     | 17 ++++++++++---
 sklearn/utils/tests/test_sparsefuncs.py | 18 ++++++++++----
 3 files changed, 43 insertions(+), 25 deletions(-)

diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx
index 9ff79c628a1b8..52c12ce5d5953 100644
--- a/sklearn/utils/sparsefuncs_fast.pyx
+++ b/sklearn/utils/sparsefuncs_fast.pyx
@@ -18,6 +18,9 @@ from cython cimport floating
 
 np.import_array()
 
+ctypedef fused integral:
+    int
+    long long
 
 ctypedef np.float64_t DOUBLE
 
@@ -30,11 +33,11 @@ def csr_row_norms(X):
 
 def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data,
                    shape,
-                   np.ndarray[int, ndim=1, mode="c"] X_indices,
-                   np.ndarray[int, ndim=1, mode="c"] X_indptr):
+                   np.ndarray[integral, ndim=1, mode="c"] X_indices,
+                   np.ndarray[integral, ndim=1, mode="c"] X_indptr):
     cdef:
-        unsigned int n_samples = shape[0]
-        unsigned int n_features = shape[1]
+        unsigned long long n_samples = shape[0]
+        unsigned long long n_features = shape[1]
         np.ndarray[DOUBLE, ndim=1, mode="c"] norms
 
         np.npy_intp i, j
@@ -326,17 +329,16 @@ def inplace_csr_row_normalize_l1(X):
 
 def _inplace_csr_row_normalize_l1(np.ndarray[floating, ndim=1] X_data,
                                   shape,
-                                  np.ndarray[int, ndim=1] X_indices,
-                                  np.ndarray[int, ndim=1] X_indptr):
-    cdef unsigned int n_samples = shape[0]
-    cdef unsigned int n_features = shape[1]
+                                  np.ndarray[integral, ndim=1] X_indices,
+                                  np.ndarray[integral, ndim=1] X_indptr):
+    cdef unsigned long long n_samples = shape[0]
+    cdef unsigned long long n_features = shape[1]
 
     # the column indices for row i are stored in:
-    #    indices[indptr[i]:indices[i+1]]
+    #    indices[indptr[i]:indptr[i+1]]
     # and their corresponding values are stored in:
     #    data[indptr[i]:indptr[i+1]]
-    cdef unsigned int i
-    cdef unsigned int j
+    cdef np.npy_intp i, j
     cdef double sum_
 
     for i in xrange(n_samples):
@@ -361,13 +363,12 @@ def inplace_csr_row_normalize_l2(X):
 
 def _inplace_csr_row_normalize_l2(np.ndarray[floating, ndim=1] X_data,
                                   shape,
-                                  np.ndarray[int, ndim=1] X_indices,
-                                  np.ndarray[int, ndim=1] X_indptr):
-    cdef unsigned int n_samples = shape[0]
-    cdef unsigned int n_features = shape[1]
+                                  np.ndarray[integral, ndim=1] X_indices,
+                                  np.ndarray[integral, ndim=1] X_indptr):
+    cdef integral n_samples = shape[0]
+    cdef integral n_features = shape[1]
 
-    cdef unsigned int i
-    cdef unsigned int j
+    cdef np.npy_intp i, j
     cdef double sum_
 
     for i in xrange(n_samples):
diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py
index 86d604ef33f66..f53b814c70084 100644
--- a/sklearn/utils/tests/test_extmath.py
+++ b/sklearn/utils/tests/test_extmath.py
@@ -206,10 +206,19 @@ def test_row_norms():
                                   precision)
         assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision)
 
-        Xcsr = sparse.csr_matrix(X, dtype=dtype)
-        assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
-                                  precision)
-        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr), precision)
+        for csr_index_dtype in [np.int32, np.int64]:
+            Xcsr = sparse.csr_matrix(X, dtype=dtype)
+            # csr_matrix uses int32 indices by default, so cast them
+            # explicitly to int64 to exercise the 64-bit index code path
+            if csr_index_dtype is np.int64:
+                Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype)
+                Xcsr.indices = Xcsr.indices.astype(csr_index_dtype)
+            assert Xcsr.indices.dtype == csr_index_dtype
+            assert Xcsr.indptr.dtype == csr_index_dtype
+            assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
+                                      precision)
+            assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr),
+                                      precision)
 
 
 def test_randomized_svd_low_rank_with_noise():
diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
index fd09267ea7b0a..f2b35e7459833 100644
--- a/sklearn/utils/tests/test_sparsefuncs.py
+++ b/sklearn/utils/tests/test_sparsefuncs.py
@@ -478,8 +478,16 @@ def test_inplace_normalize():
         for dtype in (np.float64, np.float32):
             X = rs.randn(10, 5).astype(dtype)
             X_csr = sp.csr_matrix(X)
-            inplace_csr_row_normalize(X_csr)
-            assert_equal(X_csr.dtype, dtype)
-            if inplace_csr_row_normalize is inplace_csr_row_normalize_l2:
-                X_csr.data **= 2
-            assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones)
+            for index_dtype in [np.int32, np.int64]:
+                # csr_matrix uses int32 indices by default, so cast them
+                # explicitly to int64 to exercise the 64-bit index code path
+                if index_dtype is np.int64:
+                    X_csr.indptr = X_csr.indptr.astype(index_dtype)
+                    X_csr.indices = X_csr.indices.astype(index_dtype)
+                assert X_csr.indices.dtype == index_dtype
+                assert X_csr.indptr.dtype == index_dtype
+                inplace_csr_row_normalize(X_csr)
+                assert_equal(X_csr.dtype, dtype)
+                if inplace_csr_row_normalize is inplace_csr_row_normalize_l2:
+                    X_csr.data **= 2
+                assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones)