Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

MNT simplify some tree code with memoryviews #12886

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Feb 28, 2019
Merged
16 changes: 8 additions & 8 deletions sklearn/tree/_criterion.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# cython: language_level=3
# Authors: Gilles Louppe <[email protected]>
# Peter Prettenhofer <[email protected]>
# Brian Holt <[email protected]>
Expand All @@ -12,20 +13,19 @@
import numpy as np
cimport numpy as np

ctypedef np.npy_float32 DTYPE_t # Type of X
ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight
ctypedef np.npy_intp SIZE_t # Type for indices and counters
ctypedef np.npy_int32 INT32_t # Signed 32 bit integer
ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer
from ._tree cimport DTYPE_t # Type of X
from ._tree cimport DOUBLE_t # Type of y, sample_weight
from ._tree cimport SIZE_t # Type for indices and counters
from ._tree cimport INT32_t # Signed 32 bit integer
from ._tree cimport UINT32_t # Unsigned 32 bit integer

cdef class Criterion:
# The criterion computes the impurity of a node and the reduction of
# impurity of a split on that node. It also computes the output statistics
# such as the mean in regression and class probabilities in classification.

# Internal structures
cdef DOUBLE_t* y # Values of y
cdef SIZE_t y_stride # Stride in y (since n_outputs >= 1)
cdef const DOUBLE_t[:, ::1] y # Values of y
cdef DOUBLE_t* sample_weight # Sample weights

cdef SIZE_t* samples # Sample indices in X, y
Expand Down Expand Up @@ -53,7 +53,7 @@ cdef class Criterion:
# statistics correspond to samples[start:pos] and samples[pos:end].

# Methods
cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight,
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
SIZE_t end) nogil except -1
cdef int reset(self) nogil except -1
Expand Down
81 changes: 20 additions & 61 deletions sklearn/tree/_criterion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ cdef class Criterion:
# No-op __setstate__: the state argument `d` is accepted and ignored.
def __setstate__(self, d):
pass

cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight,
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
SIZE_t end) nogil except -1:
"""Placeholder for a method which will initialize the criterion.
Expand All @@ -63,9 +63,6 @@ cdef class Criterion:
----------
y : array-like, dtype=DOUBLE_t
y is a buffer that can store values for n_outputs target variables
y_stride : SIZE_t
y_stride is used to index the kth output value as follows:
y[i, k] = y[i * y_stride + k]
sample_weight : array-like, dtype=DOUBLE_t
The weight of each sample
weighted_n_samples : DOUBLE_t
Expand Down Expand Up @@ -224,8 +221,6 @@ cdef class ClassificationCriterion(Criterion):
The number of unique classes in each target
"""

self.y = NULL
self.y_stride = 0
self.sample_weight = NULL

self.samples = NULL
Expand Down Expand Up @@ -281,7 +276,7 @@ cdef class ClassificationCriterion(Criterion):
sizet_ptr_to_ndarray(self.n_classes, self.n_outputs)),
self.__getstate__())

cdef int init(self, DOUBLE_t* y, SIZE_t y_stride,
cdef int init(self, const DOUBLE_t[:, ::1] y,
DOUBLE_t* sample_weight, double weighted_n_samples,
SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1:
"""Initialize the criterion at node samples[start:end] and
Expand All @@ -294,9 +289,6 @@ cdef class ClassificationCriterion(Criterion):
----------
y : array-like, dtype=DOUBLE_t
The target stored as a buffer for memory efficiency
y_stride : SIZE_t
The stride between elements in the buffer, important if there
are multiple targets (multi-output)
sample_weight : array-like, dtype=DOUBLE_t
The weight of each sample
weighted_n_samples : double
Expand All @@ -310,7 +302,6 @@ cdef class ClassificationCriterion(Criterion):
"""

self.y = y
self.y_stride = y_stride
self.sample_weight = sample_weight
self.samples = samples
self.start = start
Expand Down Expand Up @@ -343,7 +334,7 @@ cdef class ClassificationCriterion(Criterion):

# Count weighted class frequency for each target
for k in range(self.n_outputs):
c = <SIZE_t> y[i * y_stride + k]
c = <SIZE_t> self.y[i, k]
sum_total[k * self.sum_stride + c] += w

self.weighted_n_node_samples += w
Expand Down Expand Up @@ -418,7 +409,6 @@ cdef class ClassificationCriterion(Criterion):
The new ending position for which to move samples from the right
child to the left child.
"""
cdef DOUBLE_t* y = self.y
cdef SIZE_t pos = self.pos
cdef SIZE_t end = self.end

Expand Down Expand Up @@ -453,8 +443,7 @@ cdef class ClassificationCriterion(Criterion):
w = sample_weight[i]

for k in range(self.n_outputs):
label_index = (k * self.sum_stride +
<SIZE_t> y[i * self.y_stride + k])
label_index = k * self.sum_stride + <SIZE_t> self.y[i, k]
sum_left[label_index] += w

self.weighted_n_left += w
Expand All @@ -469,8 +458,7 @@ cdef class ClassificationCriterion(Criterion):
w = sample_weight[i]

for k in range(self.n_outputs):
label_index = (k * self.sum_stride +
<SIZE_t> y[i * self.y_stride + k])
label_index = k * self.sum_stride + <SIZE_t> self.y[i, k]
sum_left[label_index] -= w

self.weighted_n_left -= w
Expand Down Expand Up @@ -714,8 +702,6 @@ cdef class RegressionCriterion(Criterion):
"""

# Default values
self.y = NULL
self.y_stride = 0
self.sample_weight = NULL

self.samples = NULL
Expand Down Expand Up @@ -751,14 +737,13 @@ cdef class RegressionCriterion(Criterion):
def __reduce__(self):
return (type(self), (self.n_outputs, self.n_samples), self.__getstate__())

cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight,
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
SIZE_t end) nogil except -1:
"""Initialize the criterion at node samples[start:end] and
children samples[start:start] and samples[start:end]."""
# Initialize fields
self.y = y
self.y_stride = y_stride
self.sample_weight = sample_weight
self.samples = samples
self.start = start
Expand All @@ -784,7 +769,7 @@ cdef class RegressionCriterion(Criterion):
w = sample_weight[i]

for k in range(self.n_outputs):
y_ik = y[i * y_stride + k]
y_ik = self.y[i, k]
w_y_ik = w * y_ik
self.sum_total[k] += w_y_ik
self.sq_sum_total += w_y_ik * y_ik
Expand Down Expand Up @@ -827,14 +812,12 @@ cdef class RegressionCriterion(Criterion):
cdef double* sample_weight = self.sample_weight
cdef SIZE_t* samples = self.samples

cdef DOUBLE_t* y = self.y
cdef SIZE_t pos = self.pos
cdef SIZE_t end = self.end
cdef SIZE_t i
cdef SIZE_t p
cdef SIZE_t k
cdef DOUBLE_t w = 1.0
cdef DOUBLE_t y_ik

# Update statistics up to new_pos
#
Expand All @@ -852,8 +835,7 @@ cdef class RegressionCriterion(Criterion):
w = sample_weight[i]

for k in range(self.n_outputs):
y_ik = y[i * self.y_stride + k]
sum_left[k] += w * y_ik
sum_left[k] += w * self.y[i, k]

self.weighted_n_left += w
else:
Expand All @@ -866,8 +848,7 @@ cdef class RegressionCriterion(Criterion):
w = sample_weight[i]

for k in range(self.n_outputs):
y_ik = y[i * self.y_stride + k]
sum_left[k] -= w * y_ik
sum_left[k] -= w * self.y[i, k]

self.weighted_n_left -= w

Expand Down Expand Up @@ -947,15 +928,14 @@ cdef class MSE(RegressionCriterion):
left child (samples[start:pos]) and the impurity of the right child
(samples[pos:end])."""


cdef DOUBLE_t* y = self.y
cdef DOUBLE_t* sample_weight = self.sample_weight
cdef SIZE_t* samples = self.samples
cdef SIZE_t pos = self.pos
cdef SIZE_t start = self.start

cdef double* sum_left = self.sum_left
cdef double* sum_right = self.sum_right
cdef DOUBLE_t y_ik

cdef double sq_sum_left = 0.0
cdef double sq_sum_right
Expand All @@ -964,7 +944,6 @@ cdef class MSE(RegressionCriterion):
cdef SIZE_t p
cdef SIZE_t k
cdef DOUBLE_t w = 1.0
cdef DOUBLE_t y_ik

for p in range(start, pos):
i = samples[p]
Expand All @@ -973,7 +952,7 @@ cdef class MSE(RegressionCriterion):
w = sample_weight[i]

for k in range(self.n_outputs):
y_ik = y[i * self.y_stride + k]
y_ik = self.y[i, k]
sq_sum_left += w * y_ik * y_ik

sq_sum_right = self.sq_sum_total - sq_sum_left
Expand Down Expand Up @@ -1014,8 +993,6 @@ cdef class MAE(RegressionCriterion):
"""

# Default values
self.y = NULL
self.y_stride = 0
self.sample_weight = NULL

self.samples = NULL
Expand Down Expand Up @@ -1044,19 +1021,17 @@ cdef class MAE(RegressionCriterion):
self.left_child[k] = WeightedMedianCalculator(n_samples)
self.right_child[k] = WeightedMedianCalculator(n_samples)

cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight,
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
SIZE_t end) nogil except -1:
"""Initialize the criterion at node samples[start:end] and
children samples[start:start] and samples[start:end]."""

cdef SIZE_t i, p, k
cdef DOUBLE_t y_ik
cdef DOUBLE_t w = 1.0

# Initialize fields
self.y = y
self.y_stride = y_stride
self.sample_weight = sample_weight
self.samples = samples
self.start = start
Expand All @@ -1082,12 +1057,10 @@ cdef class MAE(RegressionCriterion):
w = sample_weight[i]

for k in range(self.n_outputs):
y_ik = y[i * y_stride + k]

# push method ends up calling safe_realloc, hence `except -1`
# push all values to the right side,
# since pos = start initially anyway
(<WeightedMedianCalculator> right_child[k]).push(y_ik, w)
(<WeightedMedianCalculator> right_child[k]).push(self.y[i, k], w)

self.weighted_n_node_samples += w
# calculate the node medians
Expand Down Expand Up @@ -1172,12 +1145,10 @@ cdef class MAE(RegressionCriterion):
cdef void** left_child = <void**> self.left_child.data
cdef void** right_child = <void**> self.right_child.data

cdef DOUBLE_t* y = self.y
cdef SIZE_t pos = self.pos
cdef SIZE_t end = self.end
cdef SIZE_t i, p, k
cdef DOUBLE_t w = 1.0
cdef DOUBLE_t y_ik

# Update statistics up to new_pos
#
Expand All @@ -1193,11 +1164,10 @@ cdef class MAE(RegressionCriterion):
w = sample_weight[i]

for k in range(self.n_outputs):
y_ik = y[i * self.y_stride + k]
# remove y[i, k] and its weight w from right and add to left
(<WeightedMedianCalculator> right_child[k]).remove(y_ik, w)
(<WeightedMedianCalculator> right_child[k]).remove(self.y[i, k], w)
# push method ends up calling safe_realloc, hence except -1
(<WeightedMedianCalculator> left_child[k]).push(y_ik, w)
(<WeightedMedianCalculator> left_child[k]).push(self.y[i, k], w)

self.weighted_n_left += w
else:
Expand All @@ -1210,10 +1180,9 @@ cdef class MAE(RegressionCriterion):
w = sample_weight[i]

for k in range(self.n_outputs):
y_ik = y[i * self.y_stride + k]
# remove y[i, k] and its weight w from left and add to right
(<WeightedMedianCalculator> left_child[k]).remove(y_ik, w)
(<WeightedMedianCalculator> right_child[k]).push(y_ik, w)
(<WeightedMedianCalculator> left_child[k]).remove(self.y[i, k], w)
(<WeightedMedianCalculator> right_child[k]).push(self.y[i, k], w)

self.weighted_n_left -= w

Expand All @@ -1233,24 +1202,20 @@ cdef class MAE(RegressionCriterion):
"""Evaluate the impurity of the current node, i.e. the impurity of
samples[start:end]"""

cdef DOUBLE_t* y = self.y
cdef DOUBLE_t* sample_weight = self.sample_weight
cdef SIZE_t* samples = self.samples
cdef SIZE_t i, p, k
cdef DOUBLE_t y_ik
cdef DOUBLE_t w = 1.0
cdef DOUBLE_t impurity = 0.0

for k in range(self.n_outputs):
for p in range(self.start, self.end):
i = samples[p]

y_ik = y[i * self.y_stride + k]

if sample_weight != NULL:
w = sample_weight[i]

impurity += fabs(y_ik - self.node_medians[k]) * w
impurity += fabs(self.y[i, k] - self.node_medians[k]) * w

return impurity / (self.weighted_n_node_samples * self.n_outputs)

Expand All @@ -1261,7 +1226,6 @@ cdef class MAE(RegressionCriterion):
(samples[pos:end]).
"""

cdef DOUBLE_t* y = self.y
cdef DOUBLE_t* sample_weight = self.sample_weight
cdef SIZE_t* samples = self.samples

Expand All @@ -1270,7 +1234,6 @@ cdef class MAE(RegressionCriterion):
cdef SIZE_t end = self.end

cdef SIZE_t i, p, k
cdef DOUBLE_t y_ik
cdef DOUBLE_t median
cdef DOUBLE_t w = 1.0
cdef DOUBLE_t impurity_left = 0.0
Expand All @@ -1284,12 +1247,10 @@ cdef class MAE(RegressionCriterion):
for p in range(start, pos):
i = samples[p]

y_ik = y[i * self.y_stride + k]

if sample_weight != NULL:
w = sample_weight[i]

impurity_left += fabs(y_ik - median) * w
impurity_left += fabs(self.y[i, k] - median) * w
p_impurity_left[0] = impurity_left / (self.weighted_n_left *
self.n_outputs)

Expand All @@ -1298,12 +1259,10 @@ cdef class MAE(RegressionCriterion):
for p in range(pos, end):
i = samples[p]

y_ik = y[i * self.y_stride + k]

if sample_weight != NULL:
w = sample_weight[i]

impurity_right += fabs(y_ik - median) * w
impurity_right += fabs(self.y[i, k] - median) * w
p_impurity_right[0] = impurity_right / (self.weighted_n_right *
self.n_outputs)

Expand Down
Loading