scikit-learn · glemaitre · Feb 28, 2019 · Dec 28, 2018 · Dec 30, 2018 · Dec 30, 2018
diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd
@@ -1,3 +1,4 @@
+# cython: language_level=3
 # Authors: Gilles Louppe <[email protected]>
 #          Peter Prettenhofer <[email protected]>
 #          Brian Holt <[email protected]>
@@ -12,20 +13,19 @@
 import numpy as np
 cimport numpy as np
 
-ctypedef np.npy_float32 DTYPE_t          # Type of X
-ctypedef np.npy_float64 DOUBLE_t         # Type of y, sample_weight
-ctypedef np.npy_intp SIZE_t              # Type for indices and counters
-ctypedef np.npy_int32 INT32_t            # Signed 32 bit integer
-ctypedef np.npy_uint32 UINT32_t          # Unsigned 32 bit integer
+from ._tree cimport DTYPE_t          # Type of X
+from ._tree cimport DOUBLE_t         # Type of y, sample_weight
+from ._tree cimport SIZE_t           # Type for indices and counters
+from ._tree cimport INT32_t          # Signed 32 bit integer
+from ._tree cimport UINT32_t         # Unsigned 32 bit integer
 
 cdef class Criterion:
     # The criterion computes the impurity of a node and the reduction of
     # impurity of a split on that node. It also computes the output statistics
     # such as the mean in regression and class probabilities in classification.
 
     # Internal structures
-    cdef DOUBLE_t* y                     # Values of y
-    cdef SIZE_t y_stride                 # Stride in y (since n_outputs >= 1)
+    cdef const DOUBLE_t[:, ::1] y        # Values of y
     cdef DOUBLE_t* sample_weight         # Sample weights
 
     cdef SIZE_t* samples                 # Sample indices in X, y
@@ -53,7 +53,7 @@ cdef class Criterion:
     # statistics correspond to samples[start:pos] and samples[pos:end].
 
     # Methods
-    cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
+    cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight,
                   double weighted_n_samples, SIZE_t* samples, SIZE_t start,
                   SIZE_t end) nogil except -1
     cdef int reset(self) nogil except -1

diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
@@ -51,7 +51,7 @@ cdef class Criterion:
     def __setstate__(self, d):
         pass
 
-    cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
+    cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight,
                   double weighted_n_samples, SIZE_t* samples, SIZE_t start,
                   SIZE_t end) nogil except -1:
         """Placeholder for a method which will initialize the criterion.
@@ -63,9 +63,6 @@ cdef class Criterion:
         ----------
         y : array-like, dtype=DOUBLE_t
             y is a buffer that can store values for n_outputs target variables
-        y_stride : SIZE_t
-            y_stride is used to index the kth output value as follows:
-            y[i, k] = y[i * y_stride + k]
         sample_weight : array-like, dtype=DOUBLE_t
             The weight of each sample
         weighted_n_samples : DOUBLE_t
@@ -224,8 +221,6 @@ cdef class ClassificationCriterion(Criterion):
             The number of unique classes in each target
         """
 
-        self.y = NULL
-        self.y_stride = 0
         self.sample_weight = NULL
 
         self.samples = NULL
@@ -281,7 +276,7 @@ cdef class ClassificationCriterion(Criterion):
                  sizet_ptr_to_ndarray(self.n_classes, self.n_outputs)),
                 self.__getstate__())
 
-    cdef int init(self, DOUBLE_t* y, SIZE_t y_stride,
+    cdef int init(self, const DOUBLE_t[:, ::1] y,
                   DOUBLE_t* sample_weight, double weighted_n_samples,
                   SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1:
         """Initialize the criterion at node samples[start:end] and
@@ -294,9 +289,6 @@ cdef class ClassificationCriterion(Criterion):
         ----------
         y : array-like, dtype=DOUBLE_t
             The target stored as a buffer for memory efficiency
-        y_stride : SIZE_t
-            The stride between elements in the buffer, important if there
-            are multiple targets (multi-output)
         sample_weight : array-like, dtype=DTYPE_t
             The weight of each sample
         weighted_n_samples : SIZE_t
@@ -310,7 +302,6 @@ cdef class ClassificationCriterion(Criterion):
         """
 
         self.y = y
-        self.y_stride = y_stride
         self.sample_weight = sample_weight
         self.samples = samples
         self.start = start
@@ -343,7 +334,7 @@ cdef class ClassificationCriterion(Criterion):
 
             # Count weighted class frequency for each target
             for k in range(self.n_outputs):
-                c = <SIZE_t> y[i * y_stride + k]
+                c = <SIZE_t> self.y[i, k]
                 sum_total[k * self.sum_stride + c] += w
 
             self.weighted_n_node_samples += w
@@ -418,7 +409,6 @@ cdef class ClassificationCriterion(Criterion):
             The new ending position for which to move samples from the right
             child to the left child.
         """
-        cdef DOUBLE_t* y = self.y
         cdef SIZE_t pos = self.pos
         cdef SIZE_t end = self.end
 
@@ -453,8 +443,7 @@ cdef class ClassificationCriterion(Criterion):
                     w = sample_weight[i]
 
                 for k in range(self.n_outputs):
-                    label_index = (k * self.sum_stride +
-                                   <SIZE_t> y[i * self.y_stride + k])
+                    label_index = k * self.sum_stride + <SIZE_t> self.y[i, k]
                     sum_left[label_index] += w
 
                 self.weighted_n_left += w
@@ -469,8 +458,7 @@ cdef class ClassificationCriterion(Criterion):
                     w = sample_weight[i]
 
                 for k in range(self.n_outputs):
-                    label_index = (k * self.sum_stride +
-                                   <SIZE_t> y[i * self.y_stride + k])
+                    label_index = k * self.sum_stride + <SIZE_t> self.y[i, k]
                     sum_left[label_index] -= w
 
                 self.weighted_n_left -= w
@@ -714,8 +702,6 @@ cdef class RegressionCriterion(Criterion):
         """
 
         # Default values
-        self.y = NULL
-        self.y_stride = 0
         self.sample_weight = NULL
 
         self.samples = NULL
@@ -751,14 +737,13 @@ cdef class RegressionCriterion(Criterion):
     def __reduce__(self):
         return (type(self), (self.n_outputs, self.n_samples), self.__getstate__())
 
-    cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
+    cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight,
                   double weighted_n_samples, SIZE_t* samples, SIZE_t start,
                   SIZE_t end) nogil except -1:
         """Initialize the criterion at node samples[start:end] and
            children samples[start:start] and samples[start:end]."""
         # Initialize fields
         self.y = y
-        self.y_stride = y_stride
         self.sample_weight = sample_weight
         self.samples = samples
         self.start = start
@@ -784,7 +769,7 @@ cdef class RegressionCriterion(Criterion):
                 w = sample_weight[i]
 
             for k in range(self.n_outputs):
-                y_ik = y[i * y_stride + k]
+                y_ik = self.y[i, k]
                 w_y_ik = w * y_ik
                 self.sum_total[k] += w_y_ik
                 self.sq_sum_total += w_y_ik * y_ik
@@ -827,14 +812,12 @@ cdef class RegressionCriterion(Criterion):
         cdef double* sample_weight = self.sample_weight
         cdef SIZE_t* samples = self.samples
 
-        cdef DOUBLE_t* y = self.y
         cdef SIZE_t pos = self.pos
         cdef SIZE_t end = self.end
         cdef SIZE_t i
         cdef SIZE_t p
         cdef SIZE_t k
         cdef DOUBLE_t w = 1.0
-        cdef DOUBLE_t y_ik
 
         # Update statistics up to new_pos
         #
@@ -852,8 +835,7 @@ cdef class RegressionCriterion(Criterion):
                     w = sample_weight[i]
 
                 for k in range(self.n_outputs):
-                    y_ik = y[i * self.y_stride + k]
-                    sum_left[k] += w * y_ik
+                    sum_left[k] += w * self.y[i, k]
 
                 self.weighted_n_left += w
         else:
@@ -866,8 +848,7 @@ cdef class RegressionCriterion(Criterion):
                     w = sample_weight[i]
 
                 for k in range(self.n_outputs):
-                    y_ik = y[i * self.y_stride + k]
-                    sum_left[k] -= w * y_ik
+                    sum_left[k] -= w * self.y[i, k]
 
                 self.weighted_n_left -= w
 
@@ -947,15 +928,14 @@ cdef class MSE(RegressionCriterion):
            left child (samples[start:pos]) and the impurity the right child
            (samples[pos:end])."""
 
-
-        cdef DOUBLE_t* y = self.y
         cdef DOUBLE_t* sample_weight = self.sample_weight
         cdef SIZE_t* samples = self.samples
         cdef SIZE_t pos = self.pos
         cdef SIZE_t start = self.start
 
         cdef double* sum_left = self.sum_left
         cdef double* sum_right = self.sum_right
+        cdef DOUBLE_t y_ik
 
         cdef double sq_sum_left = 0.0
         cdef double sq_sum_right
@@ -964,7 +944,6 @@ cdef class MSE(RegressionCriterion):
         cdef SIZE_t p
         cdef SIZE_t k
         cdef DOUBLE_t w = 1.0
-        cdef DOUBLE_t y_ik
 
         for p in range(start, pos):
             i = samples[p]
@@ -973,7 +952,7 @@ cdef class MSE(RegressionCriterion):
                 w = sample_weight[i]
 
             for k in range(self.n_outputs):
-                y_ik = y[i * self.y_stride + k]
+                y_ik = self.y[i, k]
                 sq_sum_left += w * y_ik * y_ik
 
         sq_sum_right = self.sq_sum_total - sq_sum_left
@@ -1014,8 +993,6 @@ cdef class MAE(RegressionCriterion):
         """
 
         # Default values
-        self.y = NULL
-        self.y_stride = 0
         self.sample_weight = NULL
 
         self.samples = NULL
@@ -1044,19 +1021,17 @@ cdef class MAE(RegressionCriterion):
             self.left_child[k] = WeightedMedianCalculator(n_samples)
             self.right_child[k] = WeightedMedianCalculator(n_samples)
 
-    cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
+    cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight,
                   double weighted_n_samples, SIZE_t* samples, SIZE_t start,
                   SIZE_t end) nogil except -1:
         """Initialize the criterion at node samples[start:end] and
            children samples[start:start] and samples[start:end]."""
 
         cdef SIZE_t i, p, k
-        cdef DOUBLE_t y_ik
         cdef DOUBLE_t w = 1.0
 
         # Initialize fields
         self.y = y
-        self.y_stride = y_stride
         self.sample_weight = sample_weight
         self.samples = samples
         self.start = start
@@ -1082,12 +1057,10 @@ cdef class MAE(RegressionCriterion):
                 w = sample_weight[i]
 
             for k in range(self.n_outputs):
-                y_ik = y[i * y_stride + k]
-
                 # push method ends up calling safe_realloc, hence `except -1`
                 # push all values to the right side,
                 # since pos = start initially anyway
-                (<WeightedMedianCalculator> right_child[k]).push(y_ik, w)
+                (<WeightedMedianCalculator> right_child[k]).push(self.y[i, k], w)
 
             self.weighted_n_node_samples += w
         # calculate the node medians
@@ -1172,12 +1145,10 @@ cdef class MAE(RegressionCriterion):
         cdef void** left_child = <void**> self.left_child.data
         cdef void** right_child = <void**> self.right_child.data
 
-        cdef DOUBLE_t* y = self.y
         cdef SIZE_t pos = self.pos
         cdef SIZE_t end = self.end
         cdef SIZE_t i, p, k
         cdef DOUBLE_t w = 1.0
-        cdef DOUBLE_t y_ik
 
         # Update statistics up to new_pos
         #
@@ -1193,11 +1164,10 @@ cdef class MAE(RegressionCriterion):
                     w = sample_weight[i]
 
                 for k in range(self.n_outputs):
-                    y_ik = y[i * self.y_stride + k]
                     # remove y_ik and its weight w from right and add to left
-                    (<WeightedMedianCalculator> right_child[k]).remove(y_ik, w)
+                    (<WeightedMedianCalculator> right_child[k]).remove(self.y[i, k], w)
                     # push method ends up calling safe_realloc, hence except -1
-                    (<WeightedMedianCalculator> left_child[k]).push(y_ik, w)
+                    (<WeightedMedianCalculator> left_child[k]).push(self.y[i, k], w)
 
                 self.weighted_n_left += w
         else:
@@ -1210,10 +1180,9 @@ cdef class MAE(RegressionCriterion):
                     w = sample_weight[i]
 
                 for k in range(self.n_outputs):
-                    y_ik = y[i * self.y_stride + k]
                     # remove y_ik and its weight w from left and add to right
-                    (<WeightedMedianCalculator> left_child[k]).remove(y_ik, w)
-                    (<WeightedMedianCalculator> right_child[k]).push(y_ik, w)
+                    (<WeightedMedianCalculator> left_child[k]).remove(self.y[i, k], w)
+                    (<WeightedMedianCalculator> right_child[k]).push(self.y[i, k], w)
 
                 self.weighted_n_left -= w
 
@@ -1233,24 +1202,20 @@ cdef class MAE(RegressionCriterion):
         """Evaluate the impurity of the current node, i.e. the impurity of
            samples[start:end]"""
 
-        cdef DOUBLE_t* y = self.y
         cdef DOUBLE_t* sample_weight = self.sample_weight
         cdef SIZE_t* samples = self.samples
         cdef SIZE_t i, p, k
-        cdef DOUBLE_t y_ik
         cdef DOUBLE_t w = 1.0
         cdef DOUBLE_t impurity = 0.0
 
         for k in range(self.n_outputs):
             for p in range(self.start, self.end):
                 i = samples[p]
 
-                y_ik = y[i * self.y_stride + k]
-
                 if sample_weight != NULL:
                     w = sample_weight[i]
 
-                impurity += fabs(y_ik - self.node_medians[k]) * w
+                impurity += fabs(self.y[i, k] - self.node_medians[k]) * w
 
         return impurity / (self.weighted_n_node_samples * self.n_outputs)
 
@@ -1261,7 +1226,6 @@ cdef class MAE(RegressionCriterion):
            (samples[pos:end]).
         """
 
-        cdef DOUBLE_t* y = self.y
         cdef DOUBLE_t* sample_weight = self.sample_weight
         cdef SIZE_t* samples = self.samples
 
@@ -1270,7 +1234,6 @@ cdef class MAE(RegressionCriterion):
         cdef SIZE_t end = self.end
 
         cdef SIZE_t i, p, k
-        cdef DOUBLE_t y_ik
         cdef DOUBLE_t median
         cdef DOUBLE_t w = 1.0
         cdef DOUBLE_t impurity_left = 0.0
@@ -1284,12 +1247,10 @@ cdef class MAE(RegressionCriterion):
             for p in range(start, pos):
                 i = samples[p]
 
-                y_ik = y[i * self.y_stride + k]
-
                 if sample_weight != NULL:
                     w = sample_weight[i]
 
-                impurity_left += fabs(y_ik - median) * w
+                impurity_left += fabs(self.y[i, k] - median) * w
         p_impurity_left[0] = impurity_left / (self.weighted_n_left * 
                                               self.n_outputs)
 
@@ -1298,12 +1259,10 @@ cdef class MAE(RegressionCriterion):
             for p in range(pos, end):
                 i = samples[p]
 
-                y_ik = y[i * self.y_stride + k]
-
                 if sample_weight != NULL:
                     w = sample_weight[i]
 
-                impurity_right += fabs(y_ik - median) * w
+                impurity_right += fabs(self.y[i, k] - median) * w
         p_impurity_right[0] = impurity_right / (self.weighted_n_right * 
                                                 self.n_outputs)