From ac6bd1e515cc9dc65996328a8dbaa2adcc5c4a4b Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 25 Nov 2022 15:35:10 +0500 Subject: [PATCH 1/8] MAINT remove -Wcpp warnings when compiling sklearn.preprocessing._csr_polynomial_expansion --- setup.py | 1 + .../_csr_polynomial_expansion.pyx | 56 +++++++++++-------- 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/setup.py b/setup.py index 27773c8a57faa..0e873ccbc6e47 100755 --- a/setup.py +++ b/setup.py @@ -96,6 +96,7 @@ "sklearn.metrics._pairwise_distances_reduction._radius_neighbors", "sklearn.metrics._pairwise_fast", "sklearn.neighbors._partition_nodes", + "sklearn.preprocessing._csr_polynomial_expansion", "sklearn.tree._splitter", "sklearn.tree._utils", "sklearn.utils._cython_blas", diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx index 7083e9de1ae0d..14639460d290a 100644 --- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx +++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -1,10 +1,9 @@ # Author: Andrew nystrom - +import numpy as np from scipy.sparse import csr_matrix cimport numpy as cnp cnp.import_array() -ctypedef cnp.int32_t INDEX_T ctypedef fused DATA_T: cnp.float32_t @@ -13,8 +12,12 @@ ctypedef fused DATA_T: cnp.int64_t -cdef inline INDEX_T _deg2_column(INDEX_T d, INDEX_T i, INDEX_T j, - INDEX_T interaction_only) nogil: +cdef inline cnp.int32_t _deg2_column( + cnp.int32_t d, + cnp.int32_t i, + cnp.int32_t j, + cnp.int32_t interaction_only +) nogil: """Compute the index of the column for a degree 2 expansion d is the dimensionality of the input data, i and j are the indices @@ -26,8 +29,13 @@ cdef inline INDEX_T _deg2_column(INDEX_T d, INDEX_T i, INDEX_T j, return d * i - (i**2 + i) / 2 + j -cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k, - INDEX_T interaction_only) nogil: +cdef inline cnp.int32_t _deg3_column( + cnp.int32_t d, + cnp.int32_t i, + cnp.int32_t j, + cnp.int32_t k, 
+ cnp.int32_t interaction_only +) nogil: """Compute the index of the column for a degree 3 expansion d is the dimensionality of the input data, i, j and k are the indices @@ -43,11 +51,14 @@ cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k, + d * j + k) -def _csr_polynomial_expansion(cnp.ndarray[DATA_T, ndim=1] data, - cnp.ndarray[INDEX_T, ndim=1] indices, - cnp.ndarray[INDEX_T, ndim=1] indptr, - INDEX_T d, INDEX_T interaction_only, - INDEX_T degree): +def _csr_polynomial_expansion( + DATA_T[:] data, + cnp.int32_t[:] indices, + cnp.int32_t[:] indptr, + cnp.int32_t d, + cnp.int32_t interaction_only, + cnp.int32_t degree +): """ Perform a second-degree polynomial or interaction expansion on a scipy compressed sparse row (CSR) matrix. The method used only takes products of @@ -57,13 +68,13 @@ def _csr_polynomial_expansion(cnp.ndarray[DATA_T, ndim=1] data, Parameters ---------- - data : nd-array + data : memory view on nd-array The "data" attribute of the input CSR matrix. - indices : nd-array + indices : memory view on nd-array The "indices" attribute of the input CSR matrix. - indptr : nd-array + indptr : memory view on nd-array The "indptr" attribute of the input CSR matrix. d : int @@ -92,7 +103,7 @@ def _csr_polynomial_expansion(cnp.ndarray[DATA_T, ndim=1] data, return None assert expanded_dimensionality > 0 - cdef INDEX_T total_nnz = 0, row_i, nnz + cdef cnp.int32_t total_nnz = 0, row_i, nnz # Count how many nonzero elements the expanded matrix will contain. for row_i in range(indptr.shape[0]-1): @@ -105,15 +116,12 @@ def _csr_polynomial_expansion(cnp.ndarray[DATA_T, ndim=1] data, - interaction_only * nnz ** 2) # Make the arrays that will form the CSR matrix of the expansion. 
- cdef cnp.ndarray[DATA_T, ndim=1] expanded_data = cnp.ndarray( - shape=total_nnz, dtype=data.dtype) - cdef cnp.ndarray[INDEX_T, ndim=1] expanded_indices = cnp.ndarray( - shape=total_nnz, dtype=indices.dtype) - cdef INDEX_T num_rows = indptr.shape[0] - 1 - cdef cnp.ndarray[INDEX_T, ndim=1] expanded_indptr = cnp.ndarray( - shape=num_rows + 1, dtype=indptr.dtype) - - cdef INDEX_T expanded_index = 0, row_starts, row_ends, i, j, k, \ + cdef DATA_T[:] expanded_data = np.empty(shape=total_nnz, dtype=data.base.dtype) + cdef cnp.int32_t[:] expanded_indices = np.empty(shape=total_nnz, dtype=np.int32) + cdef cnp.int32_t num_rows = indptr.shape[0] - 1 + cdef cnp.int32_t[:] expanded_indptr = np.empty(shape=num_rows + 1, dtype=np.int32) + + cdef cnp.int32_t expanded_index = 0, row_starts, row_ends, i, j, k, \ i_ptr, j_ptr, k_ptr, num_cols_in_row, \ expanded_column From 41da49f2f3780a13b192963d2cf5134dbe44c844 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 25 Nov 2022 16:30:27 +0500 Subject: [PATCH 2/8] Arrange order of numpy import --- sklearn/preprocessing/_csr_polynomial_expansion.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx index 14639460d290a..926a7bdabd748 100644 --- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx +++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -1,7 +1,8 @@ # Author: Andrew nystrom -import numpy as np + from scipy.sparse import csr_matrix cimport numpy as cnp +import numpy as np cnp.import_array() From 65629f0c5fd76dcbb8f42839366306c342f637af Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 25 Nov 2022 19:33:49 +0500 Subject: [PATCH 3/8] data is a read-only array which has a fused type so it can't be supported by a const memory view currently --- setup.py | 1 - sklearn/preprocessing/_csr_polynomial_expansion.pyx | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/setup.py
b/setup.py index 0e873ccbc6e47..27773c8a57faa 100755 --- a/setup.py +++ b/setup.py @@ -96,7 +96,6 @@ "sklearn.metrics._pairwise_distances_reduction._radius_neighbors", "sklearn.metrics._pairwise_fast", "sklearn.neighbors._partition_nodes", - "sklearn.preprocessing._csr_polynomial_expansion", "sklearn.tree._splitter", "sklearn.tree._utils", "sklearn.utils._cython_blas", diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx index 926a7bdabd748..998ce7bb36463 100644 --- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx +++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -53,7 +53,7 @@ cdef inline cnp.int32_t _deg3_column( def _csr_polynomial_expansion( - DATA_T[:] data, + cnp.ndarray[DATA_T, ndim=1] data, cnp.int32_t[:] indices, cnp.int32_t[:] indptr, cnp.int32_t d, @@ -117,7 +117,7 @@ def _csr_polynomial_expansion( - interaction_only * nnz ** 2) # Make the arrays that will form the CSR matrix of the expansion. - cdef DATA_T[:] expanded_data = np.empty(shape=total_nnz, dtype=data.base.dtype) + cdef DATA_T[:] expanded_data = np.empty(shape=total_nnz, dtype=data.dtype) cdef cnp.int32_t[:] expanded_indices = np.empty(shape=total_nnz, dtype=np.int32) cdef cnp.int32_t num_rows = indptr.shape[0] - 1 cdef cnp.int32_t[:] expanded_indptr = np.empty(shape=num_rows + 1, dtype=np.int32) From 843b011df8073c6e4a2484738b48b1d93d0da3db Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Thu, 26 Jan 2023 19:08:11 +0500 Subject: [PATCH 4/8] Add const with fused memory views --- sklearn/preprocessing/_csr_polynomial_expansion.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx index 998ce7bb36463..b4634d6dd1279 100644 --- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx +++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -17,7 +17,7 @@ cdef inline cnp.int32_t 
_deg2_column( cnp.int32_t d, cnp.int32_t i, cnp.int32_t j, - cnp.int32_t interaction_only + cnp.int32_t interaction_only, ) nogil: """Compute the index of the column for a degree 2 expansion @@ -53,9 +53,9 @@ cdef inline cnp.int32_t _deg3_column( def _csr_polynomial_expansion( - cnp.ndarray[DATA_T, ndim=1] data, - cnp.int32_t[:] indices, - cnp.int32_t[:] indptr, + const DATA_T[:] data, + const cnp.int32_t[:] indices, + const cnp.int32_t[:] indptr, cnp.int32_t d, cnp.int32_t interaction_only, cnp.int32_t degree From 563fd00a8a729519b3d9be9a433b8df60a0f5ef0 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Thu, 26 Jan 2023 19:10:00 +0500 Subject: [PATCH 5/8] Minor formatting --- sklearn/preprocessing/_csr_polynomial_expansion.pyx | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx index b4634d6dd1279..56a1034a2297a 100644 --- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx +++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -117,10 +117,16 @@ def _csr_polynomial_expansion( - interaction_only * nnz ** 2) # Make the arrays that will form the CSR matrix of the expansion. 
- cdef DATA_T[:] expanded_data = np.empty(shape=total_nnz, dtype=data.dtype) - cdef cnp.int32_t[:] expanded_indices = np.empty(shape=total_nnz, dtype=np.int32) + cdef DATA_T[:] expanded_data = np.empty( + shape=total_nnz, dtype=data.dtype + ) + cdef cnp.int32_t[:] expanded_indices = np.empty( + shape=total_nnz, dtype=np.int32 + ) cdef cnp.int32_t num_rows = indptr.shape[0] - 1 - cdef cnp.int32_t[:] expanded_indptr = np.empty(shape=num_rows + 1, dtype=np.int32) + cdef cnp.int32_t[:] expanded_indptr = np.empty( + shape=num_rows + 1, dtype=np.int32 + ) cdef cnp.int32_t expanded_index = 0, row_starts, row_ends, i, j, k, \ i_ptr, j_ptr, k_ptr, num_cols_in_row, \ From 85deca3f8a16eeae363c5b64915c3eb190d9e564 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 27 Jan 2023 12:21:09 +0500 Subject: [PATCH 6/8] Applied PR suggestions --- .../_csr_polynomial_expansion.pyx | 38 +++++++++---------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx index 56a1034a2297a..7afafacb9aceb 100644 --- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx +++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -1,17 +1,12 @@ # Author: Andrew nystrom +from cython cimport numeric from scipy.sparse import csr_matrix cimport numpy as cnp import numpy as np cnp.import_array() -ctypedef fused DATA_T: - cnp.float32_t - cnp.float64_t - cnp.int32_t - cnp.int64_t - cdef inline cnp.int32_t _deg2_column( cnp.int32_t d, @@ -53,7 +48,7 @@ cdef inline cnp.int32_t _deg3_column( def _csr_polynomial_expansion( - const DATA_T[:] data, + const numeric[:] data, const cnp.int32_t[:] indices, const cnp.int32_t[:] indptr, cnp.int32_t d, @@ -117,20 +112,21 @@ def _csr_polynomial_expansion( - interaction_only * nnz ** 2) # Make the arrays that will form the CSR matrix of the expansion. 
- cdef DATA_T[:] expanded_data = np.empty( - shape=total_nnz, dtype=data.dtype - ) - cdef cnp.int32_t[:] expanded_indices = np.empty( - shape=total_nnz, dtype=np.int32 - ) - cdef cnp.int32_t num_rows = indptr.shape[0] - 1 - cdef cnp.int32_t[:] expanded_indptr = np.empty( - shape=num_rows + 1, dtype=np.int32 - ) - - cdef cnp.int32_t expanded_index = 0, row_starts, row_ends, i, j, k, \ - i_ptr, j_ptr, k_ptr, num_cols_in_row, \ - expanded_column + cdef: + numeric[:] expanded_data = np.empty( + shape=total_nnz, dtype=data.base.dtype + ) + cnp.int32_t[:] expanded_indices = np.empty( + shape=total_nnz, dtype=np.int32 + ) + cnp.int32_t num_rows = indptr.shape[0] - 1 + cnp.int32_t[:] expanded_indptr = np.empty( + shape=num_rows + 1, dtype=np.int32 + ) + + cnp.int32_t expanded_index = 0, row_starts, row_ends, i, j, k, \ + i_ptr, j_ptr, k_ptr, num_cols_in_row, \ + expanded_column with nogil: expanded_indptr[0] = indptr[0] From d9a835a9df13f8f9eafa89482f9cb5b0beb8ef20 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 27 Jan 2023 14:28:05 +0500 Subject: [PATCH 7/8] Define DATA_T using float, double, int and long, Add _csr_polynomial inside USE_NEWEST_NUMPY_C_API --- setup.py | 1 + sklearn/preprocessing/_csr_polynomial_expansion.pyx | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index f071a05a84141..89d4c4e0aa3f9 100755 --- a/setup.py +++ b/setup.py @@ -104,6 +104,7 @@ "sklearn.neighbors._kd_tree", "sklearn.neighbors._partition_nodes", "sklearn.neighbors._quad_tree", + "sklearn.preprocessing._csr_polynomial_expansion", "sklearn.svm._liblinear", "sklearn.svm._libsvm", "sklearn.svm._libsvm_sparse", diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx index 7afafacb9aceb..1825d7e02a546 100644 --- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx +++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -1,12 +1,17 @@ # Author: Andrew nystrom -from cython 
cimport numeric from scipy.sparse import csr_matrix cimport numpy as cnp import numpy as np cnp.import_array() +ctypedef fused DATA_T: + float + double + int + long + cdef inline cnp.int32_t _deg2_column( cnp.int32_t d, @@ -48,7 +53,7 @@ cdef inline cnp.int32_t _deg3_column( def _csr_polynomial_expansion( - const numeric[:] data, + const DATA_T[:] data, const cnp.int32_t[:] indices, const cnp.int32_t[:] indptr, cnp.int32_t d, @@ -113,7 +118,7 @@ def _csr_polynomial_expansion( # Make the arrays that will form the CSR matrix of the expansion. cdef: - numeric[:] expanded_data = np.empty( + DATA_T[:] expanded_data = np.empty( shape=total_nnz, dtype=data.base.dtype ) cnp.int32_t[:] expanded_indices = np.empty( From 8afcb7ed18cd30f9202277c277562c9db28dbf52 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 27 Jan 2023 15:44:28 +0500 Subject: [PATCH 8/8] Add TODO --- sklearn/preprocessing/_csr_polynomial_expansion.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/preprocessing/_csr_polynomial_expansion.pyx b/sklearn/preprocessing/_csr_polynomial_expansion.pyx index 1825d7e02a546..17ab1da537fff 100644 --- a/sklearn/preprocessing/_csr_polynomial_expansion.pyx +++ b/sklearn/preprocessing/_csr_polynomial_expansion.pyx @@ -6,6 +6,8 @@ import numpy as np cnp.import_array() +# TODO: use `cnp.{int,float}{32,64}` when cython#5230 is resolved: +# https://github.com/cython/cython/issues/5230 ctypedef fused DATA_T: float double