From 722a95e2c30dd715afda0e0ae53002e326e39473 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Mon, 16 Jan 2023 16:55:34 +0500 Subject: [PATCH 1/3] MAINT Remove -Wcpp warnings from sklearn.utils._seq_dataset --- setup.py | 13 +++-- sklearn/utils/_seq_dataset.pyx.tp | 94 ++++++++++++++++--------------- 2 files changed, 57 insertions(+), 50 deletions(-) diff --git a/setup.py b/setup.py index 5411165c50379..f8c8ff8747c5f 100755 --- a/setup.py +++ b/setup.py @@ -106,22 +106,23 @@ "sklearn.svm._liblinear", "sklearn.svm._libsvm", "sklearn.svm._libsvm_sparse", + "sklearn.svm._newrand", "sklearn.tree._splitter", "sklearn.tree._utils", "sklearn.utils._cython_blas", "sklearn.utils._fast_dict", + "sklearn.utils._heap", + "sklearn.utils._isfinite", + "sklearn.utils._logistic_sigmoid", "sklearn.utils._openmp_helpers", - "sklearn.utils._weight_vector", "sklearn.utils._random", - "sklearn.utils._logistic_sigmoid", "sklearn.utils._readonly_array_wrapper", - "sklearn.utils._typedefs", - "sklearn.utils._heap", + "sklearn.utils._seq_dataset", "sklearn.utils._sorting", + "sklearn.utils._typedefs", "sklearn.utils._vector_sentinel", - "sklearn.utils._isfinite", + "sklearn.utils._weight_vector", "sklearn.utils.murmurhash", - "sklearn.svm._newrand", "sklearn._isotonic", ) diff --git a/sklearn/utils/_seq_dataset.pyx.tp b/sklearn/utils/_seq_dataset.pyx.tp index 0ef53222e3747..fa5f6486e0c91 100644 --- a/sklearn/utils/_seq_dataset.pyx.tp +++ b/sklearn/utils/_seq_dataset.pyx.tp @@ -197,11 +197,9 @@ cdef class SequentialDataset{{name_suffix}}: current_index) # transform the pointed data in numpy CSR array - cdef cnp.ndarray[{{c_type}}, ndim=1] x_data = np.empty(nnz, - dtype={{np_type}}) - cdef cnp.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32) - cdef cnp.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz], - dtype=np.int32) + cdef {{c_type}}[:] x_data = np.empty(nnz, dtype={{np_type}}) + cdef int[:] x_indices = np.empty(nnz, dtype=np.int32) + cdef int[:] x_indptr = np.asarray([0, nnz], dtype=np.int32) for j in range(nnz): x_data[j] = x_data_ptr[j] @@ -209,7 +207,12 @@ cdef class SequentialDataset{{name_suffix}}: cdef int sample_idx = self.index_data_ptr[current_index] - return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx + return ( + (x_data.base, x_indices.base, x_indptr.base), + y, + sample_weight, + sample_idx, + ) cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): @@ -219,10 +222,13 @@ cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): and C-style memory layout. """ - def __cinit__(self, cnp.ndarray[{{c_type}}, ndim=2, mode='c'] X, - cnp.ndarray[{{c_type}}, ndim=1, mode='c'] Y, - cnp.ndarray[{{c_type}}, ndim=1, mode='c'] sample_weights, - cnp.uint32_t seed=1): + def __cinit__( + self, + const {{c_type}}[:, ::1] X, + const {{c_type}}[::1] Y, + const {{c_type}}[::1] sample_weights, + cnp.uint32_t seed=1, + ): """A ``SequentialDataset`` backed by a two-dimensional numpy array. Parameters @@ -242,29 +248,27 @@ cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): % (INT_MAX, X.shape[0], X.shape[1])) # keep a reference to the data to prevent garbage collection - self.X = X - self.Y = Y - self.sample_weights = sample_weights + self.X = X.base + self.Y = Y.base + self.sample_weights = sample_weights.base self.n_samples = X.shape[0] self.n_features = X.shape[1] - cdef cnp.ndarray[int, ndim=1, mode='c'] feature_indices = \ - np.arange(0, self.n_features, dtype=np.intc) - self.feature_indices = feature_indices - self.feature_indices_ptr = feature_indices.data + cdef int[::1] feature_indices = np.arange(0, self.n_features, dtype=np.intc) + self.feature_indices = feature_indices.base + self.feature_indices_ptr = &feature_indices[0] self.current_index = -1 self.X_stride = X.strides[0] // X.itemsize - self.X_data_ptr = <{{c_type}} *>X.data - self.Y_data_ptr = <{{c_type}} *>Y.data - self.sample_weight_data = <{{c_type}} *>sample_weights.data + self.X_data_ptr = <{{c_type}} *> &X[0, 0] + self.Y_data_ptr = <{{c_type}} *> &Y[0] + self.sample_weight_data = <{{c_type}} *> &sample_weights[0] # Use index array for fast shuffling - cdef cnp.ndarray[int, ndim=1, mode='c'] index = \ - np.arange(0, self.n_samples, dtype=np.intc) - self.index = index - self.index_data_ptr = index.data + cdef int[::1] index = np.arange(0, self.n_samples, dtype=np.intc) + self.index = index.base + self.index_data_ptr = &index[0] # seed should not be 0 for our_rand_r self.seed = max(seed, 1) @@ -284,12 +288,15 @@ cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ - def __cinit__(self, cnp.ndarray[{{c_type}}, ndim=1, mode='c'] X_data, - cnp.ndarray[int, ndim=1, mode='c'] X_indptr, - cnp.ndarray[int, ndim=1, mode='c'] X_indices, - cnp.ndarray[{{c_type}}, ndim=1, mode='c'] Y, - cnp.ndarray[{{c_type}}, ndim=1, mode='c'] sample_weights, - cnp.uint32_t seed=1): + def __cinit__( + self, + const {{c_type}}[::1] X_data, + const int[::1] X_indptr, + const int[::1] X_indices, + const {{c_type}}[::1] Y, + const {{c_type}}[::1] sample_weights, + cnp.uint32_t seed=1, + ): """Dataset backed by a scipy sparse CSR matrix. The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. @@ -314,26 +321,25 @@ cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): The weight of each sample. """ # keep a reference to the data to prevent garbage collection - self.X_data = X_data - self.X_indptr = X_indptr - self.X_indices = X_indices - self.Y = Y - self.sample_weights = sample_weights + self.X_data = X_data.base + self.X_indptr = X_indptr.base + self.X_indices = X_indices.base + self.Y = Y.base + self.sample_weights = sample_weights.base self.n_samples = Y.shape[0] self.current_index = -1 - self.X_data_ptr = <{{c_type}} *>X_data.data - self.X_indptr_ptr = X_indptr.data - self.X_indices_ptr = X_indices.data + self.X_data_ptr = <{{c_type}} *> &X_data[0] + self.X_indptr_ptr = &X_indptr[0] + self.X_indices_ptr = &X_indices[0] - self.Y_data_ptr = <{{c_type}} *>Y.data - self.sample_weight_data = <{{c_type}} *>sample_weights.data + self.Y_data_ptr = <{{c_type}} *> &Y[0] + self.sample_weight_data = <{{c_type}} *> &sample_weights[0] # Use index array for fast shuffling - cdef cnp.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples, - dtype=np.intc) - self.index = idx - self.index_data_ptr = idx.data + cdef int[::1] idx = np.arange(self.n_samples, dtype=np.intc) + self.index = idx.base + self.index_data_ptr = &idx[0] # seed should not be 0 for our_rand_r self.seed = max(seed, 1) From 0c764a393cccd296f42438f33e107a531575c9ae Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 27 Jan 2023 17:08:23 +0500 Subject: [PATCH 2/3] Minor adjustment --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7df3f8755e282..f252e1ec03ad8 100755 --- a/setup.py +++ b/setup.py @@ -70,6 +70,7 @@ USE_NEWEST_NUMPY_C_API = ( "sklearn.__check_build._check_build", "sklearn._loss._loss", + "sklearn._isotonic", "sklearn.cluster._dbscan_inner", "sklearn.cluster._hierarchical_fast", "sklearn.cluster._k_means_common", @@ -125,7 +126,6 @@ "sklearn.utils._vector_sentinel", "sklearn.utils._weight_vector", "sklearn.utils.murmurhash", - "sklearn._isotonic", ) From 1278504dcc0ffc82912f91fecff60814bf7411d1 Mon Sep 17 00:00:00 2001 From: OmarManzoor Date: Fri, 27 Jan 2023 18:29:27 +0500 Subject: [PATCH 3/3] Replace memory views inside seq dataset pxd.tp file as well --- sklearn/utils/_seq_dataset.pxd.tp | 20 +++++++++---------- sklearn/utils/_seq_dataset.pyx.tp | 33 ++++++++++++++----------------- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/sklearn/utils/_seq_dataset.pxd.tp b/sklearn/utils/_seq_dataset.pxd.tp index 1f3b3a236efc2..6783a2da2c3ce 100644 --- a/sklearn/utils/_seq_dataset.pxd.tp +++ b/sklearn/utils/_seq_dataset.pxd.tp @@ -34,7 +34,7 @@ cimport numpy as cnp cdef class SequentialDataset{{name_suffix}}: cdef int current_index - cdef cnp.ndarray index + cdef int[::1] index cdef int *index_data_ptr cdef Py_ssize_t n_samples cdef cnp.uint32_t seed @@ -53,24 +53,24 @@ cdef class SequentialDataset{{name_suffix}}: cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): - cdef cnp.ndarray X - cdef cnp.ndarray Y - cdef cnp.ndarray sample_weights + cdef const {{c_type}}[:, ::1] X + cdef const {{c_type}}[::1] Y + cdef const {{c_type}}[::1] sample_weights cdef Py_ssize_t n_features cdef cnp.npy_intp X_stride cdef {{c_type}} *X_data_ptr cdef {{c_type}} *Y_data_ptr - cdef cnp.ndarray feature_indices + cdef const int[::1] feature_indices cdef int *feature_indices_ptr cdef {{c_type}} *sample_weight_data cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): - cdef cnp.ndarray X_data - cdef cnp.ndarray X_indptr - cdef cnp.ndarray X_indices - cdef cnp.ndarray Y - cdef cnp.ndarray sample_weights + cdef const {{c_type}}[::1] X_data + cdef const int[::1] X_indptr + cdef const int[::1] X_indices + cdef const {{c_type}}[::1] Y + cdef const {{c_type}}[::1] sample_weights cdef {{c_type}} *X_data_ptr cdef int *X_indptr_ptr cdef int *X_indices_ptr diff --git a/sklearn/utils/_seq_dataset.pyx.tp b/sklearn/utils/_seq_dataset.pyx.tp index fa5f6486e0c91..89cea4690c982 100644 --- a/sklearn/utils/_seq_dataset.pyx.tp +++ b/sklearn/utils/_seq_dataset.pyx.tp @@ -208,7 +208,7 @@ cdef class SequentialDataset{{name_suffix}}: cdef int sample_idx = self.index_data_ptr[current_index] return ( - (x_data.base, x_indices.base, x_indptr.base), + (np.asarray(x_data), np.asarray(x_indices), np.asarray(x_indptr)), y, sample_weight, sample_idx, @@ -248,16 +248,15 @@ cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): % (INT_MAX, X.shape[0], X.shape[1])) # keep a reference to the data to prevent garbage collection - self.X = X.base - self.Y = Y.base - self.sample_weights = sample_weights.base + self.X = X + self.Y = Y + self.sample_weights = sample_weights self.n_samples = X.shape[0] self.n_features = X.shape[1] - cdef int[::1] feature_indices = np.arange(0, self.n_features, dtype=np.intc) - self.feature_indices = feature_indices.base - self.feature_indices_ptr = &feature_indices[0] + self.feature_indices = np.arange(0, self.n_features, dtype=np.intc) + self.feature_indices_ptr = &self.feature_indices[0] self.current_index = -1 self.X_stride = X.strides[0] // X.itemsize @@ -266,9 +265,8 @@ cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): self.sample_weight_data = <{{c_type}} *> &sample_weights[0] # Use index array for fast shuffling - cdef int[::1] index = np.arange(0, self.n_samples, dtype=np.intc) - self.index = index.base - self.index_data_ptr = &index[0] + self.index = np.arange(0, self.n_samples, dtype=np.intc) + self.index_data_ptr = &self.index[0] # seed should not be 0 for our_rand_r self.seed = max(seed, 1) @@ -321,11 +319,11 @@ cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): The weight of each sample. """ # keep a reference to the data to prevent garbage collection - self.X_data = X_data.base - self.X_indptr = X_indptr.base - self.X_indices = X_indices.base - self.Y = Y.base - self.sample_weights = sample_weights.base + self.X_data = X_data + self.X_indptr = X_indptr + self.X_indices = X_indices + self.Y = Y + self.sample_weights = sample_weights self.n_samples = Y.shape[0] self.current_index = -1 @@ -337,9 +335,8 @@ cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): self.sample_weight_data = <{{c_type}} *> &sample_weights[0] # Use index array for fast shuffling - cdef int[::1] idx = np.arange(self.n_samples, dtype=np.intc) - self.index = idx.base - self.index_data_ptr = &idx[0] + self.index = np.arange(self.n_samples, dtype=np.intc) + self.index_data_ptr = &self.index[0] # seed should not be 0 for our_rand_r self.seed = max(seed, 1)