From 095f173d455bdacd00882768790a67fd75344a76 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 28 Dec 2018 20:01:53 +0100 Subject: [PATCH 01/13] simplify apply_dense with memoryviews --- sklearn/tree/_tree.pyx | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index ed259c98ac850..0a7698b0d684a 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -794,10 +794,7 @@ cdef class Tree: raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) # Extract input - cdef np.ndarray X_ndarray = X - cdef DTYPE_t* X_ptr = X_ndarray.data - cdef SIZE_t X_sample_stride = X.strides[0] / X.itemsize - cdef SIZE_t X_fx_stride = X.strides[1] / X.itemsize + cdef float [:, :] X_ndarray = X cdef SIZE_t n_samples = X.shape[0] # Initialize output @@ -814,8 +811,7 @@ cdef class Tree: # While node not a leaf while node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - if X_ptr[X_sample_stride * i + - X_fx_stride * node.feature] <= node.threshold: + if X_ndarray[i, node.feature] <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] From 99b946e14ac6e8356a168218021ca38d0be39088 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sun, 30 Dec 2018 16:31:39 +0100 Subject: [PATCH 02/13] change more Xs to memviews in splitter --- sklearn/tree/_splitter.pyx | 52 +++++++++++++++++++------------------- sklearn/tree/_tree.pyx | 10 +++----- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 3f5a176d9171a..04e9b16a62e24 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -240,9 +240,9 @@ cdef class Splitter: cdef class BaseDenseSplitter(Splitter): - cdef DTYPE_t* X - cdef SIZE_t X_sample_stride - cdef SIZE_t X_feature_stride + cdef DTYPE_t [:, :] X + #cdef SIZE_t X_sample_stride + #cdef SIZE_t X_feature_stride cdef np.ndarray X_idx_sorted cdef INT32_t* X_idx_sorted_ptr @@ -254,9 +254,9 @@ cdef class BaseDenseSplitter(Splitter): SIZE_t min_samples_leaf, double min_weight_leaf, object random_state, bint presort): - self.X = NULL - self.X_sample_stride = 0 - self.X_feature_stride = 0 + #self.X = NULL + #self.X_sample_stride = 0 + #self.X_feature_stride = 0 self.X_idx_sorted_ptr = NULL self.X_idx_sorted_stride = 0 self.sample_mask = NULL @@ -282,11 +282,11 @@ cdef class BaseDenseSplitter(Splitter): Splitter.init(self, X, y, sample_weight) # Initialize X - cdef np.ndarray X_ndarray = X + #cdef np.ndarray X_ndarray = X - self.X = X_ndarray.data - self.X_sample_stride = X.strides[0] / X.itemsize - self.X_feature_stride = X.strides[1] / X.itemsize + self.X = X + #self.X_sample_stride = X.strides[0] / X.itemsize + #self.X_feature_stride = X.strides[1] / X.itemsize if self.presort == 1: self.X_idx_sorted = X_idx_sorted @@ -327,10 +327,10 @@ cdef class BestSplitter(BaseDenseSplitter): cdef SIZE_t* constant_features = self.constant_features cdef SIZE_t n_features = self.n_features - cdef DTYPE_t* X = self.X + cdef DTYPE_t [:, :] X = self.X cdef DTYPE_t* Xf = self.feature_values - cdef SIZE_t X_sample_stride = self.X_sample_stride - cdef SIZE_t X_feature_stride = self.X_feature_stride + #cdef SIZE_t X_sample_stride = self.X_sample_stride + #cdef SIZE_t X_feature_stride = self.X_feature_stride cdef SIZE_t max_features = self.max_features cdef SIZE_t min_samples_leaf = self.min_samples_leaf cdef double min_weight_leaf = self.min_weight_leaf @@ -414,7 +414,7 @@ cdef class BestSplitter(BaseDenseSplitter): f_j += n_found_constants # f_j in the interval [n_total_constants, f_i[ current.feature = features[f_j] - feature_offset = self.X_feature_stride * current.feature + #feature_offset = self.X_feature_stride * current.feature # Sort samples along that feature; either by utilizing # presorting, or by copying the values into an array and @@ -428,11 +428,11 @@ cdef class BestSplitter(BaseDenseSplitter): j = X_idx_sorted[i + feature_idx_offset] if sample_mask[j] == 1: samples[p] = j - Xf[p] = X[self.X_sample_stride * j + feature_offset] + Xf[p] = X[j, current.feature] p += 1 else: for i in range(start, end): - Xf[i] = X[self.X_sample_stride * samples[i] + feature_offset] + Xf[i] = X[samples[i], current.feature] sort(Xf + start, samples + start, end - start) @@ -493,12 +493,12 @@ cdef class BestSplitter(BaseDenseSplitter): # Reorganize into samples[start:best.pos] + samples[best.pos:end] if best.pos < end: - feature_offset = X_feature_stride * best.feature + #feature_offset = X_feature_stride * best.feature partition_end = end p = start while p < partition_end: - if X[X_sample_stride * samples[p] + feature_offset] <= best.threshold: + if X[samples[p], best.feature] <= best.threshold: p += 1 else: @@ -675,10 +675,10 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef SIZE_t* constant_features = self.constant_features cdef SIZE_t n_features = self.n_features - cdef DTYPE_t* X = self.X + cdef DTYPE_t [:, :] X = self.X cdef DTYPE_t* Xf = self.feature_values - cdef SIZE_t X_sample_stride = self.X_sample_stride - cdef SIZE_t X_feature_stride = self.X_feature_stride + #cdef SIZE_t X_sample_stride = self.X_sample_stride + #cdef SIZE_t X_feature_stride = self.X_feature_stride cdef SIZE_t max_features = self.max_features cdef SIZE_t min_samples_leaf = self.min_samples_leaf cdef double min_weight_leaf = self.min_weight_leaf @@ -753,15 +753,15 @@ cdef class RandomSplitter(BaseDenseSplitter): # f_j in the interval [n_total_constants, f_i[ current.feature = features[f_j] - feature_stride = X_feature_stride * current.feature + #feature_stride = X_feature_stride * current.feature # Find min, max - min_feature_value = X[X_sample_stride * samples[start] + feature_stride] + min_feature_value = X[samples[start], current.feature] max_feature_value = min_feature_value Xf[start] = min_feature_value for p in range(start + 1, end): - current_feature_value = X[X_sample_stride * samples[p] + feature_stride] + current_feature_value = X[samples[p], current.feature] Xf[p] = current_feature_value if current_feature_value < min_feature_value: @@ -828,14 +828,14 @@ cdef class RandomSplitter(BaseDenseSplitter): best = current # copy # Reorganize into samples[start:best.pos] + samples[best.pos:end] - feature_stride = X_feature_stride * best.feature + #feature_stride = X_feature_stride * best.feature if best.pos < end: if current.feature != best.feature: partition_end = end p = start while p < partition_end: - if X[X_sample_stride * samples[p] + feature_stride] <= best.threshold: + if X[samples[p], best.feature] <= best.threshold: p += 1 else: diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 0a7698b0d684a..0ffeaa3ab8ee2 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -794,7 +794,7 @@ cdef class Tree: raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) # Extract input - cdef float [:, :] X_ndarray = X + cdef DTYPE_t [:, :] X_ndarray = X cdef SIZE_t n_samples = X.shape[0] # Initialize output @@ -914,10 +914,7 @@ cdef class Tree: raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) # Extract input - cdef np.ndarray X_ndarray = X - cdef DTYPE_t* X_ptr = X_ndarray.data - cdef SIZE_t X_sample_stride = X.strides[0] / X.itemsize - cdef SIZE_t X_fx_stride = X.strides[1] / X.itemsize + cdef float [:, :] X_ndarray = X cdef SIZE_t n_samples = X.shape[0] # Initialize output @@ -944,8 +941,7 @@ cdef class Tree: indices_ptr[indptr_ptr[i + 1]] = (node - self.nodes) indptr_ptr[i + 1] += 1 - if X_ptr[X_sample_stride * i + - X_fx_stride * node.feature] <= node.threshold: + if X_ndarray[i, node.feature] <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] From 17c3bbbdf1c240419bbe17b5fb27f89d16a08efc Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sun, 30 Dec 2018 16:40:14 +0100 Subject: [PATCH 03/13] remove commented out lines --- sklearn/tree/_splitter.pyx | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 04e9b16a62e24..2c5b8a7cbd078 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -241,8 +241,6 @@ cdef class Splitter: cdef class BaseDenseSplitter(Splitter): cdef DTYPE_t [:, :] X - #cdef SIZE_t X_sample_stride - #cdef SIZE_t X_feature_stride cdef np.ndarray X_idx_sorted cdef INT32_t* X_idx_sorted_ptr @@ -254,9 +252,6 @@ cdef class BaseDenseSplitter(Splitter): SIZE_t min_samples_leaf, double min_weight_leaf, object random_state, bint presort): - #self.X = NULL - #self.X_sample_stride = 0 - #self.X_feature_stride = 0 self.X_idx_sorted_ptr = NULL self.X_idx_sorted_stride = 0 self.sample_mask = NULL @@ -281,12 +276,7 @@ cdef class BaseDenseSplitter(Splitter): # Call parent init Splitter.init(self, X, y, sample_weight) - # Initialize X - #cdef np.ndarray X_ndarray = X - self.X = X - #self.X_sample_stride = X.strides[0] / X.itemsize - #self.X_feature_stride = X.strides[1] / X.itemsize if self.presort == 1: self.X_idx_sorted = X_idx_sorted @@ -329,8 +319,6 @@ cdef class BestSplitter(BaseDenseSplitter): cdef DTYPE_t [:, :] X = self.X cdef DTYPE_t* Xf = self.feature_values - #cdef SIZE_t X_sample_stride = self.X_sample_stride - #cdef SIZE_t X_feature_stride = self.X_feature_stride cdef SIZE_t max_features = self.max_features cdef SIZE_t min_samples_leaf = self.min_samples_leaf cdef double min_weight_leaf = self.min_weight_leaf @@ -414,7 +402,6 @@ cdef class BestSplitter(BaseDenseSplitter): f_j += n_found_constants # f_j in the interval [n_total_constants, f_i[ current.feature = features[f_j] - #feature_offset = self.X_feature_stride * current.feature # Sort samples along that feature; either by utilizing # presorting, or by copying the values into an array and @@ -493,7 +480,6 @@ cdef class BestSplitter(BaseDenseSplitter): # Reorganize into samples[start:best.pos] + samples[best.pos:end] if best.pos < end: - #feature_offset = X_feature_stride * best.feature partition_end = end p = start @@ -677,8 +663,6 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef DTYPE_t [:, :] X = self.X cdef DTYPE_t* Xf = self.feature_values - #cdef SIZE_t X_sample_stride = self.X_sample_stride - #cdef SIZE_t X_feature_stride = self.X_feature_stride cdef SIZE_t max_features = self.max_features cdef SIZE_t min_samples_leaf = self.min_samples_leaf cdef double min_weight_leaf = self.min_weight_leaf @@ -753,7 +737,6 @@ cdef class RandomSplitter(BaseDenseSplitter): # f_j in the interval [n_total_constants, f_i[ current.feature = features[f_j] - #feature_stride = X_feature_stride * current.feature # Find min, max min_feature_value = X[samples[start], current.feature] @@ -828,7 +811,6 @@ cdef class RandomSplitter(BaseDenseSplitter): best = current # copy # Reorganize into samples[start:best.pos] + samples[best.pos:end] - #feature_stride = X_feature_stride * best.feature if best.pos < end: if current.feature != best.feature: partition_end = end From 096276e2e3971db4523e38d2434c479870ef15fd Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 31 Dec 2018 11:58:05 +0100 Subject: [PATCH 04/13] trying to remove y_stride --- sklearn/tree/_criterion.pxd | 5 ++- sklearn/tree/_criterion.pyx | 63 ++++++++++++++----------------------- sklearn/tree/_splitter.pxd | 5 ++- sklearn/tree/_splitter.pyx | 12 +++---- 4 files changed, 32 insertions(+), 53 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 1cbd395af8e37..b39b69a70f8a8 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -24,8 +24,7 @@ cdef class Criterion: # such as the mean in regression and class probabilities in classification. # Internal structures - cdef DOUBLE_t* y # Values of y - cdef SIZE_t y_stride # Stride in y (since n_outputs >= 1) + cdef DOUBLE_t[:, :] y # Values of y cdef DOUBLE_t* sample_weight # Sample weights cdef SIZE_t* samples # Sample indices in X, y @@ -53,7 +52,7 @@ cdef class Criterion: # statistics correspond to samples[start:pos] and samples[pos:end]. # Methods - cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, + cdef int init(self, DOUBLE_t[:, :] y, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1 cdef int reset(self) nogil except -1 diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index a2b362334de54..d18baaf5da76c 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -51,7 +51,7 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, + cdef int init(self, DOUBLE_t[:, :] y, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1: """Placeholder for a method which will initialize the criterion. @@ -63,9 +63,6 @@ cdef class Criterion: ---------- y : array-like, dtype=DOUBLE_t y is a buffer that can store values for n_outputs target variables - y_stride : SIZE_t - y_stride is used to index the kth output value as follows: - y[i, k] = y[i * y_stride + k] sample_weight : array-like, dtype=DOUBLE_t The weight of each sample weighted_n_samples : DOUBLE_t @@ -224,8 +221,6 @@ cdef class ClassificationCriterion(Criterion): The number of unique classes in each target """ - self.y = NULL - self.y_stride = 0 self.sample_weight = NULL self.samples = NULL @@ -281,7 +276,7 @@ cdef class ClassificationCriterion(Criterion): sizet_ptr_to_ndarray(self.n_classes, self.n_outputs)), self.__getstate__()) - cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, + cdef int init(self, DOUBLE_t[:, :] y, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1: """Initialize the criterion at node samples[start:end] and @@ -292,11 +287,8 @@ cdef class ClassificationCriterion(Criterion): Parameters ---------- - y : array-like, dtype=DOUBLE_t + y : array-like, dtype=DOUBLE_t[:, :] The target stored as a buffer for memory efficiency - y_stride : SIZE_t - The stride between elements in the buffer, important if there - are multiple targets (multi-output) sample_weight : array-like, dtype=DTYPE_t The weight of each sample weighted_n_samples : SIZE_t @@ -310,7 +302,6 @@ cdef class ClassificationCriterion(Criterion): """ self.y = y - self.y_stride = y_stride self.sample_weight = sample_weight self.samples = samples self.start = start @@ -343,7 +334,7 @@ cdef class ClassificationCriterion(Criterion): # Count weighted class frequency for each target for k in range(self.n_outputs): - c = y[i * y_stride + k] + c = y[i, k] sum_total[k * self.sum_stride + c] += w self.weighted_n_node_samples += w @@ -418,7 +409,7 @@ cdef class ClassificationCriterion(Criterion): The new ending position for which to move samples from the right child to the left child. """ - cdef DOUBLE_t* y = self.y + cdef DOUBLE_t[:, :] y = self.y cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end @@ -454,7 +445,7 @@ cdef class ClassificationCriterion(Criterion): for k in range(self.n_outputs): label_index = (k * self.sum_stride + - y[i * self.y_stride + k]) + y[i, k]) sum_left[label_index] += w self.weighted_n_left += w @@ -470,7 +461,7 @@ cdef class ClassificationCriterion(Criterion): for k in range(self.n_outputs): label_index = (k * self.sum_stride + - y[i * self.y_stride + k]) + y[i, k]) sum_left[label_index] -= w self.weighted_n_left -= w @@ -714,8 +705,6 @@ cdef class RegressionCriterion(Criterion): """ # Default values - self.y = NULL - self.y_stride = 0 self.sample_weight = NULL self.samples = NULL @@ -751,14 +740,13 @@ cdef class RegressionCriterion(Criterion): def __reduce__(self): return (type(self), (self.n_outputs, self.n_samples), self.__getstate__()) - cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, + cdef int init(self, DOUBLE_t[:, :] y, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1: """Initialize the criterion at node samples[start:end] and children samples[start:start] and samples[start:end].""" # Initialize fields self.y = y - self.y_stride = y_stride self.sample_weight = sample_weight self.samples = samples self.start = start @@ -784,7 +772,7 @@ cdef class RegressionCriterion(Criterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = y[i * y_stride + k] + y_ik = y[i, k] w_y_ik = w * y_ik self.sum_total[k] += w_y_ik self.sq_sum_total += w_y_ik * y_ik @@ -827,7 +815,7 @@ cdef class RegressionCriterion(Criterion): cdef double* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples - cdef DOUBLE_t* y = self.y + cdef DOUBLE_t[:, :] y = self.y cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end cdef SIZE_t i @@ -852,7 +840,7 @@ cdef class RegressionCriterion(Criterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = y[i * self.y_stride + k] + y_ik = y[i, k] sum_left[k] += w * y_ik self.weighted_n_left += w @@ -866,7 +854,7 @@ cdef class RegressionCriterion(Criterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = y[i * self.y_stride + k] + y_ik = y[i, k] sum_left[k] -= w * y_ik self.weighted_n_left -= w @@ -948,7 +936,7 @@ cdef class MSE(RegressionCriterion): (samples[pos:end]).""" - cdef DOUBLE_t* y = self.y + cdef DOUBLE_t[:, :] y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t pos = self.pos @@ -973,7 +961,7 @@ cdef class MSE(RegressionCriterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = y[i * self.y_stride + k] + y_ik = y[i, k] sq_sum_left += w * y_ik * y_ik sq_sum_right = self.sq_sum_total - sq_sum_left @@ -1014,8 +1002,6 @@ cdef class MAE(RegressionCriterion): """ # Default values - self.y = NULL - self.y_stride = 0 self.sample_weight = NULL self.samples = NULL @@ -1044,7 +1030,7 @@ cdef class MAE(RegressionCriterion): self.left_child[k] = WeightedMedianCalculator(n_samples) self.right_child[k] = WeightedMedianCalculator(n_samples) - cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight, + cdef int init(self, DOUBLE_t[:, :] y, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1: """Initialize the criterion at node samples[start:end] and @@ -1056,7 +1042,6 @@ cdef class MAE(RegressionCriterion): # Initialize fields self.y = y - self.y_stride = y_stride self.sample_weight = sample_weight self.samples = samples self.start = start @@ -1082,7 +1067,7 @@ cdef class MAE(RegressionCriterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = y[i * y_stride + k] + y_ik = y[i, k] # push method ends up calling safe_realloc, hence `except -1` # push all values to the right side, @@ -1172,7 +1157,7 @@ cdef class MAE(RegressionCriterion): cdef void** left_child = self.left_child.data cdef void** right_child = self.right_child.data - cdef DOUBLE_t* y = self.y + cdef DOUBLE_t[:, :] y = self.y cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end cdef SIZE_t i, p, k @@ -1193,7 +1178,7 @@ cdef class MAE(RegressionCriterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = y[i * self.y_stride + k] + y_ik = y[i, k] # remove y_ik and its weight w from right and add to left ( right_child[k]).remove(y_ik, w) # push method ends up calling safe_realloc, hence except -1 @@ -1210,7 +1195,7 @@ cdef class MAE(RegressionCriterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = y[i * self.y_stride + k] + y_ik = y[i, k] # remove y_ik and its weight w from left and add to right ( left_child[k]).remove(y_ik, w) ( right_child[k]).push(y_ik, w) @@ -1233,7 +1218,7 @@ cdef class MAE(RegressionCriterion): """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end]""" - cdef DOUBLE_t* y = self.y + cdef DOUBLE_t[:, :] y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t i, p, k @@ -1245,7 +1230,7 @@ cdef class MAE(RegressionCriterion): for p in range(self.start, self.end): i = samples[p] - y_ik = y[i * self.y_stride + k] + y_ik = y[i, k] if sample_weight != NULL: w = sample_weight[i] @@ -1261,7 +1246,7 @@ cdef class MAE(RegressionCriterion): (samples[pos:end]). """ - cdef DOUBLE_t* y = self.y + cdef DOUBLE_t[:, :] y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -1284,7 +1269,7 @@ cdef class MAE(RegressionCriterion): for p in range(start, pos): i = samples[p] - y_ik = y[i * self.y_stride + k] + y_ik = y[i, k] if sample_weight != NULL: w = sample_weight[i] @@ -1298,7 +1283,7 @@ cdef class MAE(RegressionCriterion): for p in range(pos, end): i = samples[p] - y_ik = y[i * self.y_stride + k] + y_ik = y[i, k] if sample_weight != NULL: w = sample_weight[i] diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 4d5c5ae46bceb..37255d1cd20fd 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -60,8 +60,7 @@ cdef class Splitter: cdef bint presort # Whether to use presorting, only # allowed on dense data - cdef DOUBLE_t* y - cdef SIZE_t y_stride + cdef DOUBLE_t[:, :] y cdef DOUBLE_t* sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -81,7 +80,7 @@ cdef class Splitter: # This allows optimization with depth-based tree building. # Methods - cdef int init(self, object X, np.ndarray y, + cdef int init(self, object X, DOUBLE_t[:, :] y, DOUBLE_t* sample_weight, np.ndarray X_idx_sorted=*) except -1 diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 2c5b8a7cbd078..8a4f61c691f16 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -92,8 +92,6 @@ cdef class Splitter: self.n_features = 0 self.feature_values = NULL - self.y = NULL - self.y_stride = 0 self.sample_weight = NULL self.max_features = max_features @@ -118,7 +116,7 @@ cdef class Splitter: cdef int init(self, object X, - np.ndarray[DOUBLE_t, ndim=2, mode="c"] y, + DOUBLE_t[:, :] y, DOUBLE_t* sample_weight, np.ndarray X_idx_sorted=None) except -1: """Initialize the splitter. @@ -179,8 +177,7 @@ cdef class Splitter: safe_realloc(&self.feature_values, n_samples) safe_realloc(&self.constant_features, n_features) - self.y = y.data - self.y_stride = y.strides[0] / y.itemsize + self.y = y self.sample_weight = sample_weight return 0 @@ -206,7 +203,6 @@ cdef class Splitter: self.end = end self.criterion.init(self.y, - self.y_stride, self.sample_weight, self.weighted_n_samples, self.samples, @@ -264,7 +260,7 @@ cdef class BaseDenseSplitter(Splitter): cdef int init(self, object X, - np.ndarray[DOUBLE_t, ndim=2, mode="c"] y, + DOUBLE_t[:, :] y, DOUBLE_t* sample_weight, np.ndarray X_idx_sorted=None) except -1: """Initialize the splitter @@ -882,7 +878,7 @@ cdef class BaseSparseSplitter(Splitter): cdef int init(self, object X, - np.ndarray[DOUBLE_t, ndim=2, mode="c"] y, + DOUBLE_t[:, :] y, DOUBLE_t* sample_weight, np.ndarray X_idx_sorted=None) except -1: """Initialize the splitter From 7248a8c29a0c9b046adf1d8e43dc483366fc217e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Mon, 31 Dec 2018 12:02:29 +0100 Subject: [PATCH 05/13] float->dtype_t --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 0ffeaa3ab8ee2..ff1a30ced30ae 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -914,7 +914,7 @@ cdef class Tree: raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) # Extract input - cdef float [:, :] X_ndarray = X + cdef DTYPE_t[:, :] X_ndarray = X cdef SIZE_t n_samples = X.shape[0] # Initialize output From a416c70f819337d41effb2e1853b61de630e058c Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 2 Jan 2019 13:41:08 +0100 Subject: [PATCH 06/13] criterion has no more y_stride --- sklearn/tree/_criterion.pxd | 11 ++++++----- sklearn/tree/_criterion.pyx | 7 ++++--- sklearn/tree/_splitter.pxd | 12 +++++++----- sklearn/tree/_utils.pxd | 4 +++- 4 files changed, 20 insertions(+), 14 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index b39b69a70f8a8..60c6bac9f6205 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -1,3 +1,4 @@ +# cython: language_level=3 # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt @@ -12,11 +13,11 @@ import numpy as np cimport numpy as np -ctypedef np.npy_float32 DTYPE_t # Type of X -ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight -ctypedef np.npy_intp SIZE_t # Type for indices and counters -ctypedef np.npy_int32 INT32_t # Signed 32 bit integer -ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer +from ._tree cimport DTYPE_t # Type of X +from ._tree cimport DOUBLE_t # Type of y, sample_weight +from ._tree cimport SIZE_t # Type for indices and counters +from ._tree cimport INT32_t # Signed 32 bit integer +from ._tree cimport UINT32_t # Unsigned 32 bit integer cdef class Criterion: # The criterion computes the impurity of a node and the reduction of diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index d18baaf5da76c..1474d0f5cc1e9 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -302,6 +302,7 @@ cdef class ClassificationCriterion(Criterion): """ self.y = y + cdef DOUBLE_t[:, :] y_memview = y self.sample_weight = sample_weight self.samples = samples self.start = start @@ -334,7 +335,7 @@ cdef class ClassificationCriterion(Criterion): # Count weighted class frequency for each target for k in range(self.n_outputs): - c = y[i, k] + c = y_memview[i, k] sum_total[k * self.sum_stride + c] += w self.weighted_n_node_samples += w @@ -772,7 +773,7 @@ cdef class RegressionCriterion(Criterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = y[i, k] + y_ik = self.y[i, k] w_y_ik = w * y_ik self.sum_total[k] += w_y_ik self.sq_sum_total += w_y_ik * y_ik @@ -1067,7 +1068,7 @@ cdef class MAE(RegressionCriterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = y[i, k] + y_ik = self.y[i, k] # push method ends up calling safe_realloc, hence `except -1` # push all values to the right side, diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 37255d1cd20fd..fe27a45e4823b 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -1,3 +1,5 @@ +# cython: language_level=3 + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt @@ -14,11 +16,11 @@ cimport numpy as np from ._criterion cimport Criterion -ctypedef np.npy_float32 DTYPE_t # Type of X -ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight -ctypedef np.npy_intp SIZE_t # Type for indices and counters -ctypedef np.npy_int32 INT32_t # Signed 32 bit integer -ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer +from ._tree cimport DTYPE_t # Type of X +from ._tree cimport DOUBLE_t # Type of y, sample_weight +from ._tree cimport SIZE_t # Type for indices and counters +from ._tree cimport INT32_t # Signed 32 bit integer +from ._tree cimport UINT32_t # Unsigned 32 bit integer cdef struct SplitRecord: # Data to track sample split diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 04806ade180c2..60a4f552a9527 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -1,3 +1,5 @@ +# cython: language_level=3 + # Authors: Gilles Louppe # Peter Prettenhofer # Arnaud Joly @@ -10,7 +12,7 @@ import numpy as np cimport numpy as np -from _tree cimport Node +from ._tree cimport Node from sklearn.neighbors.quad_tree cimport Cell ctypedef np.npy_float32 DTYPE_t # Type of X From 554b79e289d18e0873b7e7b424751a65931af934 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 2 Jan 2019 17:02:41 +0100 Subject: [PATCH 07/13] remove redundant constructs and handle const y input --- sklearn/tree/_criterion.pxd | 4 ++-- sklearn/tree/_criterion.pyx | 38 +++++++++++++++---------------------- sklearn/tree/_splitter.pxd | 4 ++-- sklearn/tree/_splitter.pyx | 6 +++--- 4 files changed, 22 insertions(+), 30 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 60c6bac9f6205..4c7412e0bccf6 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -25,7 +25,7 @@ cdef class Criterion: # such as the mean in regression and class probabilities in classification. # Internal structures - cdef DOUBLE_t[:, :] y # Values of y + cdef const DOUBLE_t[:, ::1] y # Values of y cdef DOUBLE_t* sample_weight # Sample weights cdef SIZE_t* samples # Sample indices in X, y @@ -53,7 +53,7 @@ cdef class Criterion: # statistics correspond to samples[start:pos] and samples[pos:end]. # Methods - cdef int init(self, DOUBLE_t[:, :] y, DOUBLE_t* sample_weight, + cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1 cdef int reset(self) nogil except -1 diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 1474d0f5cc1e9..909db7e077040 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -51,7 +51,7 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef int init(self, DOUBLE_t[:, :] y, DOUBLE_t* sample_weight, + cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1: """Placeholder for a method which will initialize the criterion. @@ -276,7 +276,7 @@ cdef class ClassificationCriterion(Criterion): sizet_ptr_to_ndarray(self.n_classes, self.n_outputs)), self.__getstate__()) - cdef int init(self, DOUBLE_t[:, :] y, + cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1: """Initialize the criterion at node samples[start:end] and @@ -302,7 +302,6 @@ cdef class ClassificationCriterion(Criterion): """ self.y = y - cdef DOUBLE_t[:, :] y_memview = y self.sample_weight = sample_weight self.samples = samples self.start = start @@ -335,7 +334,7 @@ cdef class ClassificationCriterion(Criterion): # Count weighted class frequency for each target for k in range(self.n_outputs): - c = y_memview[i, k] + c = self.y[i, k] sum_total[k * self.sum_stride + c] += w self.weighted_n_node_samples += w @@ -410,7 +409,6 @@ cdef class ClassificationCriterion(Criterion): The new ending position for which to move samples from the right child to the left child. """ - cdef DOUBLE_t[:, :] y = self.y cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end @@ -446,7 +444,7 @@ cdef class ClassificationCriterion(Criterion): for k in range(self.n_outputs): label_index = (k * self.sum_stride + - y[i, k]) + self.y[i, k]) sum_left[label_index] += w self.weighted_n_left += w @@ -462,7 +460,7 @@ cdef class ClassificationCriterion(Criterion): for k in range(self.n_outputs): label_index = (k * self.sum_stride + - y[i, k]) + self.y[i, k]) sum_left[label_index] -= w self.weighted_n_left -= w @@ -741,7 +739,7 @@ cdef class RegressionCriterion(Criterion): def __reduce__(self): return (type(self), (self.n_outputs, self.n_samples), self.__getstate__()) - cdef int init(self, DOUBLE_t[:, :] y, DOUBLE_t* sample_weight, + cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1: """Initialize the criterion at node samples[start:end] and @@ -816,7 +814,6 @@ cdef class RegressionCriterion(Criterion): cdef double* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples - cdef DOUBLE_t[:, :] y = self.y cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end cdef SIZE_t i @@ -841,7 +838,7 @@ cdef class RegressionCriterion(Criterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = y[i, k] + y_ik = self.y[i, k] sum_left[k] += w * y_ik self.weighted_n_left += w @@ -855,7 +852,7 @@ cdef class RegressionCriterion(Criterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = y[i, k] + y_ik = self.y[i, k] sum_left[k] -= w * y_ik self.weighted_n_left -= w @@ -936,8 +933,6 @@ cdef class MSE(RegressionCriterion): left child (samples[start:pos]) and the impurity the right child (samples[pos:end]).""" - - cdef DOUBLE_t[:, :] y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t pos = self.pos @@ -962,7 +957,7 @@ cdef class MSE(RegressionCriterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = y[i, k] + y_ik = self.y[i, k] sq_sum_left += w * y_ik * y_ik sq_sum_right = self.sq_sum_total - sq_sum_left @@ -1031,7 +1026,7 @@ cdef class MAE(RegressionCriterion): self.left_child[k] = WeightedMedianCalculator(n_samples) self.right_child[k] = WeightedMedianCalculator(n_samples) - cdef int init(self, DOUBLE_t[:, :] y, DOUBLE_t* sample_weight, + cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, double weighted_n_samples, SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1: """Initialize the criterion at node samples[start:end] and @@ -1158,7 +1153,6 @@ cdef class MAE(RegressionCriterion): cdef void** left_child = self.left_child.data cdef void** right_child = self.right_child.data - cdef DOUBLE_t[:, :] y = self.y cdef SIZE_t pos = self.pos cdef SIZE_t end = self.end cdef SIZE_t i, p, k @@ -1179,7 +1173,7 @@ cdef class MAE(RegressionCriterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = y[i, k] + y_ik = self.y[i, k] # remove y_ik and its weight w from right and add to left ( right_child[k]).remove(y_ik, w) # push method ends up calling safe_realloc, hence except -1 @@ -1196,7 +1190,7 @@ cdef class MAE(RegressionCriterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = y[i, k] + y_ik = self.y[i, k] # remove y_ik and its weight w from left and add to right ( left_child[k]).remove(y_ik, w) ( right_child[k]).push(y_ik, w) @@ -1219,7 +1213,6 @@ cdef class MAE(RegressionCriterion): """Evaluate the impurity of the current node, i.e. the impurity of samples[start:end]""" - cdef DOUBLE_t[:, :] y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t i, p, k @@ -1231,7 +1224,7 @@ cdef class MAE(RegressionCriterion): for p in range(self.start, self.end): i = samples[p] - y_ik = y[i, k] + y_ik = self.y[i, k] if sample_weight != NULL: w = sample_weight[i] @@ -1247,7 +1240,6 @@ cdef class MAE(RegressionCriterion): (samples[pos:end]). """ - cdef DOUBLE_t[:, :] y = self.y cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples @@ -1270,7 +1262,7 @@ cdef class MAE(RegressionCriterion): for p in range(start, pos): i = samples[p] - y_ik = y[i, k] + y_ik = self.y[i, k] if sample_weight != NULL: w = sample_weight[i] @@ -1284,7 +1276,7 @@ cdef class MAE(RegressionCriterion): for p in range(pos, end): i = samples[p] - y_ik = y[i, k] + y_ik = self.y[i, k] if sample_weight != NULL: w = sample_weight[i] diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index fe27a45e4823b..5ca7303659a68 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -62,7 +62,7 @@ cdef class Splitter: cdef bint presort # Whether to use presorting, only # allowed on dense data - cdef DOUBLE_t[:, :] y + cdef const DOUBLE_t[:, ::1] y cdef DOUBLE_t* sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -82,7 +82,7 @@ cdef class Splitter: # This allows optimization with depth-based tree building. # Methods - cdef int init(self, object X, DOUBLE_t[:, :] y, + cdef int init(self, object X, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, np.ndarray X_idx_sorted=*) except -1 diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 8a4f61c691f16..eca0a06a60b05 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -116,7 +116,7 @@ cdef class Splitter: cdef int init(self, object X, - DOUBLE_t[:, :] y, + DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, np.ndarray X_idx_sorted=None) except -1: """Initialize the splitter. @@ -260,7 +260,7 @@ cdef class BaseDenseSplitter(Splitter): cdef int init(self, object X, - DOUBLE_t[:, :] y, + DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, np.ndarray X_idx_sorted=None) except -1: """Initialize the splitter @@ -878,7 +878,7 @@ cdef class BaseSparseSplitter(Splitter): cdef int init(self, object X, - DOUBLE_t[:, :] y, + DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight, np.ndarray X_idx_sorted=None) except -1: """Initialize the splitter From 3ae6c5a8ca7b2581d329a894a2827b0cb9fdeab9 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 2 Jan 2019 17:04:47 +0100 Subject: [PATCH 08/13] fix criterion docstring for y's dtype --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 909db7e077040..2e7609eb01342 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -287,7 +287,7 @@ cdef class ClassificationCriterion(Criterion): Parameters ---------- - y : array-like, dtype=DOUBLE_t[:, :] + y : array-like, dtype=const DOUBLE_t[:, ::1] The target stored as a buffer for memory efficiency sample_weight : array-like, dtype=DTYPE_t The weight of each sample From b136c8bf21f39497b4425bd58cfe931691b6fa49 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Wed, 2 Jan 2019 23:53:13 +0100 Subject: [PATCH 09/13] revert docstring for y --- sklearn/tree/_criterion.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 2e7609eb01342..23b1edfa0f599 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -287,7 +287,7 @@ cdef class ClassificationCriterion(Criterion): Parameters ---------- - y : array-like, dtype=const DOUBLE_t[:, ::1] + y : array-like, dtype=DOUBLE_t The target stored as a buffer for memory efficiency sample_weight : array-like, dtype=DTYPE_t The weight of each sample From 5899d08ddd4d9e776bee8b646740bb94734d119e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Thu, 3 Jan 2019 00:18:50 +0100 Subject: [PATCH 10/13] formatting: no space between type name and [:, :] --- sklearn/tree/_splitter.pyx | 6 +++--- sklearn/tree/_tree.pyx | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index eca0a06a60b05..a3e95fad87550 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -236,7 +236,7 @@ cdef class Splitter: cdef class BaseDenseSplitter(Splitter): - cdef DTYPE_t [:, :] X + cdef DTYPE_t[:, :] X cdef np.ndarray X_idx_sorted cdef INT32_t* X_idx_sorted_ptr @@ -313,7 +313,7 @@ cdef class BestSplitter(BaseDenseSplitter): cdef SIZE_t* constant_features = self.constant_features cdef SIZE_t n_features = self.n_features - cdef DTYPE_t [:, :] X = self.X + cdef DTYPE_t[:, :] X = self.X cdef DTYPE_t* Xf = self.feature_values cdef SIZE_t max_features = self.max_features cdef SIZE_t min_samples_leaf = self.min_samples_leaf @@ -657,7 +657,7 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef SIZE_t* constant_features = self.constant_features cdef SIZE_t n_features = self.n_features - cdef DTYPE_t [:, :] X = self.X + cdef DTYPE_t[:, :] X = self.X cdef DTYPE_t* Xf = self.feature_values cdef SIZE_t max_features = self.max_features cdef SIZE_t min_samples_leaf = self.min_samples_leaf diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index ff1a30ced30ae..c7aee769ae213 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -794,7 +794,7 @@ cdef class Tree: raise ValueError("X.dtype should be np.float32, got %s" % X.dtype) # Extract input - cdef DTYPE_t [:, :] X_ndarray = X + cdef DTYPE_t[:, :] X_ndarray = X cdef SIZE_t n_samples = X.shape[0] # Initialize output From 95c11fb60519e5ca1d667e97ec387c0d5dfcc970 Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Fri, 4 Jan 2019 18:06:47 +0100 Subject: [PATCH 11/13] remove redundant X --- sklearn/tree/_splitter.pyx | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index a3e95fad87550..fab9aabb03e28 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -313,7 +313,6 @@ cdef class BestSplitter(BaseDenseSplitter): cdef SIZE_t* constant_features = self.constant_features cdef SIZE_t n_features = self.n_features - cdef DTYPE_t[:, :] X = self.X cdef DTYPE_t* Xf = self.feature_values cdef SIZE_t max_features = self.max_features cdef SIZE_t min_samples_leaf = self.min_samples_leaf @@ -411,11 +410,11 @@ cdef class BestSplitter(BaseDenseSplitter): j = X_idx_sorted[i + feature_idx_offset] if sample_mask[j] == 1: samples[p] = j - Xf[p] = X[j, current.feature] + Xf[p] = self.X[j, current.feature] p += 1 else: for i in range(start, end): - Xf[i] = X[samples[i], current.feature] + Xf[i] = self.X[samples[i], current.feature] sort(Xf + start, samples + start, end - start) @@ -480,7 +479,7 @@ cdef class BestSplitter(BaseDenseSplitter): p = start while p < partition_end: - if X[samples[p], best.feature] <= best.threshold: + if self.X[samples[p], best.feature] <= best.threshold: p += 1 else: @@ -657,7 +656,6 @@ cdef class RandomSplitter(BaseDenseSplitter): cdef SIZE_t* constant_features = self.constant_features cdef SIZE_t n_features = self.n_features - cdef DTYPE_t[:, :] X = self.X cdef DTYPE_t* Xf = self.feature_values cdef SIZE_t max_features = self.max_features cdef SIZE_t min_samples_leaf = self.min_samples_leaf @@ -735,12 +733,12 @@ cdef class RandomSplitter(BaseDenseSplitter): current.feature = features[f_j] # Find min, max - min_feature_value = X[samples[start], current.feature] + min_feature_value = self.X[samples[start], current.feature] max_feature_value = min_feature_value Xf[start] = min_feature_value for p in range(start + 1, end): - current_feature_value = X[samples[p], current.feature] + current_feature_value = self.X[samples[p], current.feature] Xf[p] = current_feature_value if current_feature_value < min_feature_value: @@ -813,7 +811,7 @@ cdef class RandomSplitter(BaseDenseSplitter): p = start while p < partition_end: - if X[samples[p], best.feature] <= best.threshold: + if self.X[samples[p], best.feature] <= best.threshold: p += 1 else: From a5d2457b9c6d89ff608e6ecc05b5b24c964e1d2e Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Sat, 9 Feb 2019 14:32:02 +0100 Subject: [PATCH 12/13] more cleanup on criterion --- sklearn/tree/_criterion.pyx | 45 +++++++++++-------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 23b1edfa0f599..e6c3d628baf53 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -443,8 +443,7 @@ cdef class ClassificationCriterion(Criterion): w = sample_weight[i] for k in range(self.n_outputs): - label_index = (k * self.sum_stride + - self.y[i, k]) + label_index = k * self.sum_stride + self.y[i, k] sum_left[label_index] += w self.weighted_n_left += w @@ -459,8 +458,7 @@ cdef class ClassificationCriterion(Criterion): w = sample_weight[i] for k in range(self.n_outputs): - label_index = (k * self.sum_stride + - self.y[i, k]) + label_index = k * self.sum_stride + self.y[i, k] sum_left[label_index] -= w self.weighted_n_left -= w @@ -820,7 +818,6 @@ cdef class RegressionCriterion(Criterion): cdef SIZE_t p cdef SIZE_t k cdef DOUBLE_t w = 1.0 - cdef DOUBLE_t y_ik # Update statistics up to new_pos # @@ -838,8 +835,7 @@ cdef class RegressionCriterion(Criterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = self.y[i, k] - sum_left[k] += w * y_ik + sum_left[k] += w * self.y[i, k] self.weighted_n_left += w else: @@ -852,8 +848,7 @@ cdef class RegressionCriterion(Criterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = self.y[i, k] - sum_left[k] -= w * y_ik + sum_left[k] -= w * self.y[i, k] self.weighted_n_left -= w @@ -940,6 +935,7 @@ cdef class MSE(RegressionCriterion): cdef double* sum_left = self.sum_left cdef double* sum_right = self.sum_right + cdef DOUBLE_t y_ik cdef double sq_sum_left = 0.0 cdef double sq_sum_right @@ -948,7 +944,6 @@ cdef class MSE(RegressionCriterion): cdef SIZE_t p cdef SIZE_t k cdef DOUBLE_t w = 1.0 - cdef DOUBLE_t y_ik for p in range(start, pos): i = samples[p] @@ -1033,7 +1028,6 @@ cdef class MAE(RegressionCriterion): children samples[start:start] and samples[start:end].""" cdef SIZE_t i, p, k - cdef DOUBLE_t y_ik cdef DOUBLE_t w = 1.0 # Initialize fields @@ -1063,12 +1057,10 @@ cdef class MAE(RegressionCriterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = self.y[i, k] - # push method ends up calling safe_realloc, hence `except -1` # push all values to the right side, # since pos = start initially anyway - ( right_child[k]).push(y_ik, w) + ( right_child[k]).push(self.y[i, k], w) self.weighted_n_node_samples += w # calculate the node medians @@ -1157,7 +1149,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t end = self.end cdef SIZE_t i, p, k cdef DOUBLE_t w = 1.0 - cdef DOUBLE_t y_ik # Update statistics up to new_pos # @@ -1173,11 +1164,10 @@ cdef class MAE(RegressionCriterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = self.y[i, k] # remove y_ik and its weight w from right and add to left - ( right_child[k]).remove(y_ik, w) + ( right_child[k]).remove(self.y[i, k], w) # push method ends up calling safe_realloc, hence except -1 - ( left_child[k]).push(y_ik, w) + ( left_child[k]).push(self.y[i, k], w) self.weighted_n_left += w else: @@ -1190,10 +1180,9 @@ cdef class MAE(RegressionCriterion): w = sample_weight[i] for k in range(self.n_outputs): - y_ik = self.y[i, k] # remove y_ik and its weight w from left and add to right - ( left_child[k]).remove(y_ik, w) - ( right_child[k]).push(y_ik, w) + ( left_child[k]).remove(self.y[i, k], w) + ( right_child[k]).push(self.y[i, k], w) self.weighted_n_left -= w @@ -1216,7 +1205,6 @@ cdef class MAE(RegressionCriterion): cdef DOUBLE_t* sample_weight = self.sample_weight cdef SIZE_t* samples = self.samples cdef SIZE_t i, p, k - cdef DOUBLE_t y_ik cdef DOUBLE_t w = 1.0 cdef DOUBLE_t impurity = 0.0 @@ -1224,12 +1212,10 @@ cdef class MAE(RegressionCriterion): for p in range(self.start, self.end): i = samples[p] - y_ik = self.y[i, k] - if sample_weight != NULL: w = sample_weight[i] - impurity += fabs(y_ik - self.node_medians[k]) * w + impurity += fabs(self.y[i, k] - self.node_medians[k]) * w return impurity / (self.weighted_n_node_samples * self.n_outputs) @@ -1248,7 +1234,6 @@ cdef class MAE(RegressionCriterion): cdef SIZE_t end = self.end cdef SIZE_t i, p, k - cdef DOUBLE_t y_ik cdef DOUBLE_t median cdef DOUBLE_t w = 1.0 cdef DOUBLE_t impurity_left = 0.0 @@ -1262,12 +1247,10 @@ cdef class MAE(RegressionCriterion): for p in range(start, pos): i = samples[p] - y_ik = self.y[i, k] - if sample_weight != NULL: w = sample_weight[i] - impurity_left += fabs(y_ik - median) * w + impurity_left += fabs(self.y[i, k] - median) * w p_impurity_left[0] = impurity_left / (self.weighted_n_left * self.n_outputs) @@ -1276,12 +1259,10 @@ cdef class MAE(RegressionCriterion): for p in range(pos, end): i = samples[p] - y_ik = self.y[i, k] - if sample_weight != NULL: w = sample_weight[i] - impurity_right += fabs(y_ik - median) * w + impurity_right += fabs(self.y[i, k] - median) * w p_impurity_right[0] = impurity_right / (self.weighted_n_right * self.n_outputs) From fd7b5febf924c6a22c58898cacd0d1ffa4b253ea Mon Sep 17 00:00:00 2001 From: adrinjalali Date: Tue, 26 Feb 2019 16:30:38 +0100 Subject: [PATCH 13/13] address comment --- sklearn/tree/_criterion.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 4c7412e0bccf6..e4a09cd6b3394 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -25,7 +25,7 @@ cdef class Criterion: # such as the mean in regression and class probabilities in classification. # Internal structures - cdef const DOUBLE_t[:, ::1] y # Values of y + cdef const DOUBLE_t[:, ::1] y # Values of y cdef DOUBLE_t* sample_weight # Sample weights cdef SIZE_t* samples # Sample indices in X, y