From 2a95d1b92d598beed571f20e99fcf25ac994d70a Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 31 Aug 2017 18:12:20 +0200 Subject: [PATCH] Add 64 bit indices support in csr_row_norms and inplace L2/L1 csr norm --- sklearn/utils/sparsefuncs_fast.pyx | 33 +++++++++++++------------ sklearn/utils/tests/test_extmath.py | 17 ++++++++++--- sklearn/utils/tests/test_sparsefuncs.py | 18 ++++++++++---- 3 files changed, 43 insertions(+), 25 deletions(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 9ff79c628a1b8..52c12ce5d5953 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -18,6 +18,9 @@ from cython cimport floating np.import_array() +ctypedef fused integral: + int + long long ctypedef np.float64_t DOUBLE @@ -30,11 +33,11 @@ def csr_row_norms(X): def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data, shape, - np.ndarray[int, ndim=1, mode="c"] X_indices, - np.ndarray[int, ndim=1, mode="c"] X_indptr): + np.ndarray[integral, ndim=1, mode="c"] X_indices, + np.ndarray[integral, ndim=1, mode="c"] X_indptr): cdef: - unsigned int n_samples = shape[0] - unsigned int n_features = shape[1] + unsigned long long n_samples = shape[0] + unsigned long long n_features = shape[1] np.ndarray[DOUBLE, ndim=1, mode="c"] norms np.npy_intp i, j @@ -326,17 +329,16 @@ def inplace_csr_row_normalize_l1(X): def _inplace_csr_row_normalize_l1(np.ndarray[floating, ndim=1] X_data, shape, - np.ndarray[int, ndim=1] X_indices, - np.ndarray[int, ndim=1] X_indptr): - cdef unsigned int n_samples = shape[0] - cdef unsigned int n_features = shape[1] + np.ndarray[integral, ndim=1] X_indices, + np.ndarray[integral, ndim=1] X_indptr): + cdef unsigned long long n_samples = shape[0] + cdef unsigned long long n_features = shape[1] # the column indices for row i are stored in: # indices[indptr[i]:indices[i+1]] # and their corresponding values are stored in: # data[indptr[i]:indptr[i+1]] - cdef unsigned int i - cdef unsigned int j + cdef np.npy_intp i, j cdef double sum_ for i in xrange(n_samples): @@ -361,13 +363,12 @@ def inplace_csr_row_normalize_l2(X): def _inplace_csr_row_normalize_l2(np.ndarray[floating, ndim=1] X_data, shape, - np.ndarray[int, ndim=1] X_indices, - np.ndarray[int, ndim=1] X_indptr): - cdef unsigned int n_samples = shape[0] - cdef unsigned int n_features = shape[1] + np.ndarray[integral, ndim=1] X_indices, + np.ndarray[integral, ndim=1] X_indptr): + cdef integral n_samples = shape[0] + cdef integral n_features = shape[1] - cdef unsigned int i - cdef unsigned int j + cdef np.npy_intp i, j cdef double sum_ for i in xrange(n_samples): diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 86d604ef33f66..f53b814c70084 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -206,10 +206,19 @@ def test_row_norms(): precision) assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision) - Xcsr = sparse.csr_matrix(X, dtype=dtype) - assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True), - precision) - assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr), precision) + for csr_index_dtype in [np.int32, np.int64]: + Xcsr = sparse.csr_matrix(X, dtype=dtype) + # csr_matrix will use int32 indices by default, + # up-casting those to int64 when necessary + if csr_index_dtype is np.int64: + Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype) + Xcsr.indices = Xcsr.indices.astype(csr_index_dtype) + assert Xcsr.indices.dtype == csr_index_dtype + assert Xcsr.indptr.dtype == csr_index_dtype + assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True), + precision) + assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr), + precision) def test_randomized_svd_low_rank_with_noise(): diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index fd09267ea7b0a..f2b35e7459833 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -478,8 +478,16 @@ def test_inplace_normalize(): for dtype in (np.float64, np.float32): X = rs.randn(10, 5).astype(dtype) X_csr = sp.csr_matrix(X) - inplace_csr_row_normalize(X_csr) - assert_equal(X_csr.dtype, dtype) - if inplace_csr_row_normalize is inplace_csr_row_normalize_l2: - X_csr.data **= 2 - assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones) + for index_dtype in [np.int32, np.int64]: + # csr_matrix will use int32 indices by default, + # up-casting those to int64 when necessary + if index_dtype is np.int64: + X_csr.indptr = X_csr.indptr.astype(index_dtype) + X_csr.indices = X_csr.indices.astype(index_dtype) + assert X_csr.indices.dtype == index_dtype + assert X_csr.indptr.dtype == index_dtype + inplace_csr_row_normalize(X_csr) + assert_equal(X_csr.dtype, dtype) + if inplace_csr_row_normalize is inplace_csr_row_normalize_l2: + X_csr.data **= 2 + assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones)