diff --git a/setup.py b/setup.py index 89d4c4e0aa3f9..f252e1ec03ad8 100755 --- a/setup.py +++ b/setup.py @@ -70,6 +70,7 @@ USE_NEWEST_NUMPY_C_API = ( "sklearn.__check_build._check_build", "sklearn._loss._loss", + "sklearn._isotonic", "sklearn.cluster._dbscan_inner", "sklearn.cluster._hierarchical_fast", "sklearn.cluster._k_means_common", @@ -108,23 +109,23 @@ "sklearn.svm._liblinear", "sklearn.svm._libsvm", "sklearn.svm._libsvm_sparse", + "sklearn.svm._newrand", "sklearn.tree._splitter", "sklearn.tree._utils", "sklearn.utils._cython_blas", "sklearn.utils._fast_dict", + "sklearn.utils._heap", + "sklearn.utils._isfinite", + "sklearn.utils._logistic_sigmoid", "sklearn.utils._openmp_helpers", - "sklearn.utils._weight_vector", "sklearn.utils._random", - "sklearn.utils._logistic_sigmoid", "sklearn.utils._readonly_array_wrapper", - "sklearn.utils._typedefs", - "sklearn.utils._heap", + "sklearn.utils._seq_dataset", "sklearn.utils._sorting", + "sklearn.utils._typedefs", "sklearn.utils._vector_sentinel", - "sklearn.utils._isfinite", + "sklearn.utils._weight_vector", "sklearn.utils.murmurhash", - "sklearn.svm._newrand", - "sklearn._isotonic", ) diff --git a/sklearn/utils/_seq_dataset.pxd.tp b/sklearn/utils/_seq_dataset.pxd.tp index 1f3b3a236efc2..6783a2da2c3ce 100644 --- a/sklearn/utils/_seq_dataset.pxd.tp +++ b/sklearn/utils/_seq_dataset.pxd.tp @@ -34,7 +34,7 @@ cimport numpy as cnp cdef class SequentialDataset{{name_suffix}}: cdef int current_index - cdef cnp.ndarray index + cdef int[::1] index cdef int *index_data_ptr cdef Py_ssize_t n_samples cdef cnp.uint32_t seed @@ -53,24 +53,24 @@ cdef class SequentialDataset{{name_suffix}}: cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): - cdef cnp.ndarray X - cdef cnp.ndarray Y - cdef cnp.ndarray sample_weights + cdef const {{c_type}}[:, ::1] X + cdef const {{c_type}}[::1] Y + cdef const {{c_type}}[::1] sample_weights cdef Py_ssize_t n_features cdef cnp.npy_intp X_stride cdef {{c_type}} *X_data_ptr cdef {{c_type}} *Y_data_ptr - cdef cnp.ndarray feature_indices + cdef const int[::1] feature_indices cdef int *feature_indices_ptr cdef {{c_type}} *sample_weight_data cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): - cdef cnp.ndarray X_data - cdef cnp.ndarray X_indptr - cdef cnp.ndarray X_indices - cdef cnp.ndarray Y - cdef cnp.ndarray sample_weights + cdef const {{c_type}}[::1] X_data + cdef const int[::1] X_indptr + cdef const int[::1] X_indices + cdef const {{c_type}}[::1] Y + cdef const {{c_type}}[::1] sample_weights cdef {{c_type}} *X_data_ptr cdef int *X_indptr_ptr cdef int *X_indices_ptr diff --git a/sklearn/utils/_seq_dataset.pyx.tp b/sklearn/utils/_seq_dataset.pyx.tp index 0ef53222e3747..89cea4690c982 100644 --- a/sklearn/utils/_seq_dataset.pyx.tp +++ b/sklearn/utils/_seq_dataset.pyx.tp @@ -197,11 +197,9 @@ cdef class SequentialDataset{{name_suffix}}: current_index) # transform the pointed data in numpy CSR array - cdef cnp.ndarray[{{c_type}}, ndim=1] x_data = np.empty(nnz, - dtype={{np_type}}) - cdef cnp.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32) - cdef cnp.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz], - dtype=np.int32) + cdef {{c_type}}[:] x_data = np.empty(nnz, dtype={{np_type}}) + cdef int[:] x_indices = np.empty(nnz, dtype=np.int32) + cdef int[:] x_indptr = np.asarray([0, nnz], dtype=np.int32) for j in range(nnz): x_data[j] = x_data_ptr[j] @@ -209,7 +207,12 @@ cdef class SequentialDataset{{name_suffix}}: cdef int sample_idx = self.index_data_ptr[current_index] - return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx + return ( + (np.asarray(x_data), np.asarray(x_indices), np.asarray(x_indptr)), + y, + sample_weight, + sample_idx, + ) cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): @@ -219,10 +222,13 @@ cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): and C-style memory layout. """ - def __cinit__(self, cnp.ndarray[{{c_type}}, ndim=2, mode='c'] X, - cnp.ndarray[{{c_type}}, ndim=1, mode='c'] Y, - cnp.ndarray[{{c_type}}, ndim=1, mode='c'] sample_weights, - cnp.uint32_t seed=1): + def __cinit__( + self, + const {{c_type}}[:, ::1] X, + const {{c_type}}[::1] Y, + const {{c_type}}[::1] sample_weights, + cnp.uint32_t seed=1, + ): """A ``SequentialDataset`` backed by a two-dimensional numpy array. Parameters @@ -249,22 +255,18 @@ cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): self.n_samples = X.shape[0] self.n_features = X.shape[1] - cdef cnp.ndarray[int, ndim=1, mode='c'] feature_indices = \ - np.arange(0, self.n_features, dtype=np.intc) - self.feature_indices = feature_indices - self.feature_indices_ptr = feature_indices.data + self.feature_indices = np.arange(0, self.n_features, dtype=np.intc) + self.feature_indices_ptr = &self.feature_indices[0] self.current_index = -1 self.X_stride = X.strides[0] // X.itemsize - self.X_data_ptr = <{{c_type}} *>X.data - self.Y_data_ptr = <{{c_type}} *>Y.data - self.sample_weight_data = <{{c_type}} *>sample_weights.data + self.X_data_ptr = <{{c_type}} *> &X[0, 0] + self.Y_data_ptr = <{{c_type}} *> &Y[0] + self.sample_weight_data = <{{c_type}} *> &sample_weights[0] # Use index array for fast shuffling - cdef cnp.ndarray[int, ndim=1, mode='c'] index = \ - np.arange(0, self.n_samples, dtype=np.intc) - self.index = index - self.index_data_ptr = index.data + self.index = np.arange(0, self.n_samples, dtype=np.intc) + self.index_data_ptr = &self.index[0] # seed should not be 0 for our_rand_r self.seed = max(seed, 1) @@ -284,12 +286,15 @@ cdef class ArrayDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ - def __cinit__(self, cnp.ndarray[{{c_type}}, ndim=1, mode='c'] X_data, - cnp.ndarray[int, ndim=1, mode='c'] X_indptr, - cnp.ndarray[int, ndim=1, mode='c'] X_indices, - cnp.ndarray[{{c_type}}, ndim=1, mode='c'] Y, - cnp.ndarray[{{c_type}}, ndim=1, mode='c'] sample_weights, - cnp.uint32_t seed=1): + def __cinit__( + self, + const {{c_type}}[::1] X_data, + const int[::1] X_indptr, + const int[::1] X_indices, + const {{c_type}}[::1] Y, + const {{c_type}}[::1] sample_weights, + cnp.uint32_t seed=1, + ): """Dataset backed by a scipy sparse CSR matrix. The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. @@ -322,18 +327,16 @@ cdef class CSRDataset{{name_suffix}}(SequentialDataset{{name_suffix}}): self.n_samples = Y.shape[0] self.current_index = -1 - self.X_data_ptr = <{{c_type}} *>X_data.data - self.X_indptr_ptr = X_indptr.data - self.X_indices_ptr = X_indices.data + self.X_data_ptr = <{{c_type}} *> &X_data[0] + self.X_indptr_ptr = &X_indptr[0] + self.X_indices_ptr = &X_indices[0] - self.Y_data_ptr = <{{c_type}} *>Y.data - self.sample_weight_data = <{{c_type}} *>sample_weights.data + self.Y_data_ptr = <{{c_type}} *> &Y[0] + self.sample_weight_data = <{{c_type}} *> &sample_weights[0] # Use index array for fast shuffling - cdef cnp.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples, - dtype=np.intc) - self.index = idx - self.index_data_ptr = idx.data + self.index = np.arange(self.n_samples, dtype=np.intc) + self.index_data_ptr = &self.index[0] # seed should not be 0 for our_rand_r self.seed = max(seed, 1)