From 722a95e2c30dd715afda0e0ae53002e326e39473 Mon Sep 17 00:00:00 2001
From: OmarManzoor <omar.salman@arbisoft.com>
Date: Mon, 16 Jan 2023 16:55:34 +0500
Subject: [PATCH 1/3] MAINT Remove -Wcpp warnings from
 sklearn.utils._seq_dataset

---
 setup.py                          | 13 +++--
 sklearn/utils/_seq_dataset.pyx.tp | 94 ++++++++++++++++---------------
 2 files changed, 57 insertions(+), 50 deletions(-)

diff --git a/setup.py b/setup.py
index 5411165c50379..f8c8ff8747c5f 100755
--- a/setup.py
+++ b/setup.py
@@ -106,22 +106,23 @@
     "sklearn.svm._liblinear",
     "sklearn.svm._libsvm",
     "sklearn.svm._libsvm_sparse",
+    "sklearn.svm._newrand",
     "sklearn.tree._splitter",
     "sklearn.tree._utils",
     "sklearn.utils._cython_blas",
     "sklearn.utils._fast_dict",
+    "sklearn.utils._heap",
+    "sklearn.utils._isfinite",
+    "sklearn.utils._logistic_sigmoid",
     "sklearn.utils._openmp_helpers",
-    "sklearn.utils._weight_vector",
     "sklearn.utils._random",
-    "sklearn.utils._logistic_sigmoid",
     "sklearn.utils._readonly_array_wrapper",
-    "sklearn.utils._typedefs",
-    "sklearn.utils._heap",
+    "sklearn.utils._seq_dataset",
     "sklearn.utils._sorting",
+    "sklearn.utils._typedefs",
     "sklearn.utils._vector_sentinel",
-    "sklearn.utils._isfinite",
+    "sklearn.utils._weight_vector",
     "sklearn.utils.murmurhash",
-    "sklearn.svm._newrand",
     "sklearn._isotonic",
 )
 
diff --git a/sklearn/utils/_seq_dataset.pyx.tp b/sklearn/utils/_seq_dataset.pyx.tp
index 0ef53222e3747..fa5f6486e0c91 100644
--- a/sklearn/utils/_seq_dataset.pyx.tp
+++ b/sklearn/utils/_seq_dataset.pyx.tp
@@ -197,11 +197,9 @@ cdef class SequentialDataset{{name_suffix}}:
                      current_index)
 
         # transform the pointed data in numpy CSR array
-        cdef cnp.ndarray[{{c_type}}, ndim=1] x_data = np.empty(nnz,
-                                                              dtype={{np_type}})
-        cdef cnp.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32)
-        cdef cnp.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz],
-                                                           dtype=np.int32)
+        cdef {{c_type}}[:] x_data = np.empty(nnz, dtype={{np_type}})
+        cdef int[:] x_indices = np.empty(nnz, dtype=np.int32)
+        cdef int[:] x_indptr = np.asarray([0, nnz], dtype=np.int32)
 
         for j in range(nnz):
             x_data[j] = x_data_ptr[j]
@@ -209,7 +207,12 @@ cdef class SequentialDataset{{name_suffix}}:
 
         cdef int sample_idx = self.index_data_ptr[current_index]
 
-        return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx
+        return (
+            (x_data.base, x_indices.base, x_indptr.base),
+            y,
+            sample_weight,
+            sample_idx,
+        )
 
 
 cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
@@ -219,10 +222,13 @@ cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
     and C-style memory layout.
     """
 
-    def __cinit__(self, cnp.ndarray[{{c_type}}, ndim=2, mode='c'] X,
-                  cnp.ndarray[{{c_type}}, ndim=1, mode='c'] Y,
-                  cnp.ndarray[{{c_type}}, ndim=1, mode='c'] sample_weights,
-                  cnp.uint32_t seed=1):
+    def __cinit__(
+        self,
+        const {{c_type}}[:, ::1] X,
+        const {{c_type}}[::1] Y,
+        const {{c_type}}[::1] sample_weights,
+        cnp.uint32_t seed=1,
+    ):
         """A ``SequentialDataset`` backed by a two-dimensional numpy array.
 
         Parameters
@@ -242,29 +248,27 @@ cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
                              % (INT_MAX, X.shape[0], X.shape[1]))
 
         # keep a reference to the data to prevent garbage collection
-        self.X = X
-        self.Y = Y
-        self.sample_weights = sample_weights
+        self.X = X.base
+        self.Y = Y.base
+        self.sample_weights = sample_weights.base
 
         self.n_samples = X.shape[0]
         self.n_features = X.shape[1]
 
-        cdef cnp.ndarray[int, ndim=1, mode='c'] feature_indices = \
-            np.arange(0, self.n_features, dtype=np.intc)
-        self.feature_indices = feature_indices
-        self.feature_indices_ptr = <int *> feature_indices.data
+        cdef int[::1] feature_indices = np.arange(0, self.n_features, dtype=np.intc)
+        self.feature_indices = feature_indices.base
+        self.feature_indices_ptr = <int *> &feature_indices[0]
 
         self.current_index = -1
         self.X_stride = X.strides[0] // X.itemsize
-        self.X_data_ptr = <{{c_type}} *>X.data
-        self.Y_data_ptr = <{{c_type}} *>Y.data
-        self.sample_weight_data = <{{c_type}} *>sample_weights.data
+        self.X_data_ptr = <{{c_type}} *> &X[0, 0]
+        self.Y_data_ptr = <{{c_type}} *> &Y[0]
+        self.sample_weight_data = <{{c_type}} *> &sample_weights[0]
 
         # Use index array for fast shuffling
-        cdef cnp.ndarray[int, ndim=1, mode='c'] index = \
-            np.arange(0, self.n_samples, dtype=np.intc)
-        self.index = index
-        self.index_data_ptr = <int *>index.data
+        cdef int[::1] index = np.arange(0, self.n_samples, dtype=np.intc)
+        self.index = index.base
+        self.index_data_ptr = <int *> &index[0]
         # seed should not be 0 for our_rand_r
         self.seed = max(seed, 1)
 
@@ -284,12 +288,15 @@ cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
 cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
     """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """
 
-    def __cinit__(self, cnp.ndarray[{{c_type}}, ndim=1, mode='c'] X_data,
-                  cnp.ndarray[int, ndim=1, mode='c'] X_indptr,
-                  cnp.ndarray[int, ndim=1, mode='c'] X_indices,
-                  cnp.ndarray[{{c_type}}, ndim=1, mode='c'] Y,
-                  cnp.ndarray[{{c_type}}, ndim=1, mode='c'] sample_weights,
-                  cnp.uint32_t seed=1):
+    def __cinit__(
+        self,
+        const {{c_type}}[::1] X_data,
+        const int[::1] X_indptr,
+        const int[::1] X_indices,
+        const {{c_type}}[::1] Y,
+        const {{c_type}}[::1] sample_weights,
+        cnp.uint32_t seed=1,
+    ):
         """Dataset backed by a scipy sparse CSR matrix.
 
         The feature indices of ``x`` are given by x_ind_ptr[0:nnz].
@@ -314,26 +321,25 @@ cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
             The weight of each sample.
         """
         # keep a reference to the data to prevent garbage collection
-        self.X_data = X_data
-        self.X_indptr = X_indptr
-        self.X_indices = X_indices
-        self.Y = Y
-        self.sample_weights = sample_weights
+        self.X_data = X_data.base
+        self.X_indptr = X_indptr.base
+        self.X_indices = X_indices.base
+        self.Y = Y.base
+        self.sample_weights = sample_weights.base
 
         self.n_samples = Y.shape[0]
         self.current_index = -1
-        self.X_data_ptr = <{{c_type}} *>X_data.data
-        self.X_indptr_ptr = <int *>X_indptr.data
-        self.X_indices_ptr = <int *>X_indices.data
+        self.X_data_ptr = <{{c_type}} *> &X_data[0]
+        self.X_indptr_ptr = <int *> &X_indptr[0]
+        self.X_indices_ptr = <int *> &X_indices[0]
 
-        self.Y_data_ptr = <{{c_type}} *>Y.data
-        self.sample_weight_data = <{{c_type}} *>sample_weights.data
+        self.Y_data_ptr = <{{c_type}} *> &Y[0]
+        self.sample_weight_data = <{{c_type}} *> &sample_weights[0]
 
         # Use index array for fast shuffling
-        cdef cnp.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples,
-                                                               dtype=np.intc)
-        self.index = idx
-        self.index_data_ptr = <int *>idx.data
+        cdef int[::1] idx = np.arange(self.n_samples, dtype=np.intc)
+        self.index = idx.base
+        self.index_data_ptr = <int *> &idx[0]
         # seed should not be 0 for our_rand_r
         self.seed = max(seed, 1)
 

From 0c764a393cccd296f42438f33e107a531575c9ae Mon Sep 17 00:00:00 2001
From: OmarManzoor <omar.salman@arbisoft.com>
Date: Fri, 27 Jan 2023 17:08:23 +0500
Subject: [PATCH 2/3] Move sklearn._isotonic earlier in USE_NEWEST_NUMPY_C_API

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 7df3f8755e282..f252e1ec03ad8 100755
--- a/setup.py
+++ b/setup.py
@@ -70,6 +70,7 @@
 USE_NEWEST_NUMPY_C_API = (
     "sklearn.__check_build._check_build",
     "sklearn._loss._loss",
+    "sklearn._isotonic",
     "sklearn.cluster._dbscan_inner",
     "sklearn.cluster._hierarchical_fast",
     "sklearn.cluster._k_means_common",
@@ -125,7 +126,6 @@
     "sklearn.utils._vector_sentinel",
     "sklearn.utils._weight_vector",
     "sklearn.utils.murmurhash",
-    "sklearn._isotonic",
 )
 
 

From 1278504dcc0ffc82912f91fecff60814bf7411d1 Mon Sep 17 00:00:00 2001
From: OmarManzoor <omar.salman@arbisoft.com>
Date: Fri, 27 Jan 2023 18:29:27 +0500
Subject: [PATCH 3/3] Replace cnp.ndarray attributes with memory views inside
 seq dataset pxd.tp file as well

---
 sklearn/utils/_seq_dataset.pxd.tp | 20 +++++++++----------
 sklearn/utils/_seq_dataset.pyx.tp | 33 ++++++++++++++-----------------
 2 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/sklearn/utils/_seq_dataset.pxd.tp b/sklearn/utils/_seq_dataset.pxd.tp
index 1f3b3a236efc2..6783a2da2c3ce 100644
--- a/sklearn/utils/_seq_dataset.pxd.tp
+++ b/sklearn/utils/_seq_dataset.pxd.tp
@@ -34,7 +34,7 @@ cimport numpy as cnp
 
 cdef class SequentialDataset{{name_suffix}}:
     cdef int current_index
-    cdef cnp.ndarray index
+    cdef int[::1] index
     cdef int *index_data_ptr
     cdef Py_ssize_t n_samples
     cdef cnp.uint32_t seed
@@ -53,24 +53,24 @@ cdef class SequentialDataset{{name_suffix}}:
 
 
 cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
-    cdef cnp.ndarray X
-    cdef cnp.ndarray Y
-    cdef cnp.ndarray sample_weights
+    cdef const {{c_type}}[:, ::1] X
+    cdef const {{c_type}}[::1] Y
+    cdef const {{c_type}}[::1] sample_weights
     cdef Py_ssize_t n_features
     cdef cnp.npy_intp X_stride
     cdef {{c_type}} *X_data_ptr
     cdef {{c_type}} *Y_data_ptr
-    cdef cnp.ndarray feature_indices
+    cdef const int[::1] feature_indices
     cdef int *feature_indices_ptr
     cdef {{c_type}} *sample_weight_data
 
 
 cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
-    cdef cnp.ndarray X_data
-    cdef cnp.ndarray X_indptr
-    cdef cnp.ndarray X_indices
-    cdef cnp.ndarray Y
-    cdef cnp.ndarray sample_weights
+    cdef const {{c_type}}[::1] X_data
+    cdef const int[::1] X_indptr
+    cdef const int[::1] X_indices
+    cdef const {{c_type}}[::1] Y
+    cdef const {{c_type}}[::1] sample_weights
     cdef {{c_type}} *X_data_ptr
     cdef int *X_indptr_ptr
     cdef int *X_indices_ptr
diff --git a/sklearn/utils/_seq_dataset.pyx.tp b/sklearn/utils/_seq_dataset.pyx.tp
index fa5f6486e0c91..89cea4690c982 100644
--- a/sklearn/utils/_seq_dataset.pyx.tp
+++ b/sklearn/utils/_seq_dataset.pyx.tp
@@ -208,7 +208,7 @@ cdef class SequentialDataset{{name_suffix}}:
         cdef int sample_idx = self.index_data_ptr[current_index]
 
         return (
-            (x_data.base, x_indices.base, x_indptr.base),
+            (np.asarray(x_data), np.asarray(x_indices), np.asarray(x_indptr)),
             y,
             sample_weight,
             sample_idx,
@@ -248,16 +248,15 @@ cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
                              % (INT_MAX, X.shape[0], X.shape[1]))
 
         # keep a reference to the data to prevent garbage collection
-        self.X = X.base
-        self.Y = Y.base
-        self.sample_weights = sample_weights.base
+        self.X = X
+        self.Y = Y
+        self.sample_weights = sample_weights
 
         self.n_samples = X.shape[0]
         self.n_features = X.shape[1]
 
-        cdef int[::1] feature_indices = np.arange(0, self.n_features, dtype=np.intc)
-        self.feature_indices = feature_indices.base
-        self.feature_indices_ptr = <int *> &feature_indices[0]
+        self.feature_indices = np.arange(0, self.n_features, dtype=np.intc)
+        self.feature_indices_ptr = <int *> &self.feature_indices[0]
 
         self.current_index = -1
         self.X_stride = X.strides[0] // X.itemsize
@@ -266,9 +265,8 @@ cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
         self.sample_weight_data = <{{c_type}} *> &sample_weights[0]
 
         # Use index array for fast shuffling
-        cdef int[::1] index = np.arange(0, self.n_samples, dtype=np.intc)
-        self.index = index.base
-        self.index_data_ptr = <int *> &index[0]
+        self.index = np.arange(0, self.n_samples, dtype=np.intc)
+        self.index_data_ptr = <int *> &self.index[0]
         # seed should not be 0 for our_rand_r
         self.seed = max(seed, 1)
 
@@ -321,11 +319,11 @@ cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
             The weight of each sample.
         """
         # keep a reference to the data to prevent garbage collection
-        self.X_data = X_data.base
-        self.X_indptr = X_indptr.base
-        self.X_indices = X_indices.base
-        self.Y = Y.base
-        self.sample_weights = sample_weights.base
+        self.X_data = X_data
+        self.X_indptr = X_indptr
+        self.X_indices = X_indices
+        self.Y = Y
+        self.sample_weights = sample_weights
 
         self.n_samples = Y.shape[0]
         self.current_index = -1
@@ -337,9 +335,8 @@ cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}):
         self.sample_weight_data = <{{c_type}} *> &sample_weights[0]
 
         # Use index array for fast shuffling
-        cdef int[::1] idx = np.arange(self.n_samples, dtype=np.intc)
-        self.index = idx.base
-        self.index_data_ptr = <int *> &idx[0]
+        self.index = np.arange(self.n_samples, dtype=np.intc)
+        self.index_data_ptr = <int *> &self.index[0]
         # seed should not be 0 for our_rand_r
         self.seed = max(seed, 1)