Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Merged
16 changes: 8 additions & 8 deletions sklearn/tree/_criterion.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# cython: language_level=3
# Authors: Gilles Louppe <[email protected]>
# Peter Prettenhofer <[email protected]>
# Brian Holt <[email protected]>
Expand All @@ -12,20 +13,19 @@
import numpy as np
cimport numpy as np

ctypedef np.npy_float32 DTYPE_t # Type of X
ctypedef np.npy_float64 DOUBLE_t # Type of y, sample_weight
ctypedef np.npy_intp SIZE_t # Type for indices and counters
ctypedef np.npy_int32 INT32_t # Signed 32 bit integer
ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer
from ._tree cimport DTYPE_t # Type of X
from ._tree cimport DOUBLE_t # Type of y, sample_weight
from ._tree cimport SIZE_t # Type for indices and counters
from ._tree cimport INT32_t # Signed 32 bit integer
from ._tree cimport UINT32_t # Unsigned 32 bit integer

cdef class Criterion:
# The criterion computes the impurity of a node and the reduction of
# impurity of a split on that node. It also computes the output statistics
# such as the mean in regression and class probabilities in classification.

# Internal structures
cdef DOUBLE_t* y # Values of y
cdef SIZE_t y_stride # Stride in y (since n_outputs >= 1)
cdef const DOUBLE_t[:, ::1] y # Values of y
cdef DOUBLE_t* sample_weight # Sample weights

cdef SIZE_t* samples # Sample indices in X, y
Expand Down Expand Up @@ -53,7 +53,7 @@ cdef class Criterion:
# statistics correspond to samples[start:pos] and samples[pos:end].

# Methods
cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight,
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
SIZE_t end) nogil except -1
cdef int reset(self) nogil except -1
Expand Down
81 changes: 20 additions & 61 deletions sklearn/tree/_criterion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ cdef class Criterion:
def __setstate__(self, d):
pass

cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight,
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
SIZE_t end) nogil except -1:
"""Placeholder for a method which will initialize the criterion.
Expand All @@ -63,9 +63,6 @@ cdef class Criterion:
----------
y : array-like, dtype=DOUBLE_t
y is a buffer that can store values for n_outputs target variables;
the kth output value of sample i is read as y[i, k]
y_stride : SIZE_t
y_stride is used to index the kth output value as follows:
y[i, k] = y[i * y_stride + k]
sample_weight : array-like, dtype=DOUBLE_t
The weight of each sample
weighted_n_samples : DOUBLE_t
Expand Down Expand Up @@ -224,8 +221,6 @@ cdef class ClassificationCriterion(Criterion):
The number of unique classes in each target
"""

self.y = NULL
self.y_stride = 0
self.sample_weight = NULL

self.samples = NULL
Expand Down Expand Up @@ -281,7 +276,7 @@ cdef class ClassificationCriterion(Criterion):
sizet_ptr_to_ndarray(self.n_classes, self.n_outputs)),
self.__getstate__())

cdef int init(self, DOUBLE_t* y, SIZE_t y_stride,
cdef int init(self, const DOUBLE_t[:, ::1] y,
DOUBLE_t* sample_weight, double weighted_n_samples,
SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1:
"""Initialize the criterion at node samples[start:end] and
Expand All @@ -294,9 +289,6 @@ cdef class ClassificationCriterion(Criterion):
----------
y : array-like, dtype=DOUBLE_t
The target stored as a buffer for memory efficiency
y_stride : SIZE_t
The stride between elements in the buffer, important if there
are multiple targets (multi-output)
sample_weight : array-like, dtype=DOUBLE_t
The weight of each sample
weighted_n_samples : double
Expand All @@ -310,7 +302,6 @@ cdef class ClassificationCriterion(Criterion):
"""

self.y = y
self.y_stride = y_stride
self.sample_weight = sample_weight
self.samples = samples
self.start = start
Expand Down Expand Up @@ -343,7 +334,7 @@ cdef class ClassificationCriterion(Criterion):

# Count weighted class frequency for each target
for k in range(self.n_outputs):
c = <SIZE_t> y[i * y_stride + k]
c = <SIZE_t> self.y[i, k]
sum_total[k * self.sum_stride + c] += w

self.weighted_n_node_samples += w
Expand Down Expand Up @@ -418,7 +409,6 @@ cdef class ClassificationCriterion(Criterion):
The new ending position for which to move samples from the right
child to the left child.
"""
cdef DOUBLE_t* y = self.y
cdef SIZE_t pos = self.pos
cdef SIZE_t end = self.end

Expand Down Expand Up @@ -453,8 +443,7 @@ cdef class ClassificationCriterion(Criterion):
w = sample_weight[i]

for k in range(self.n_outputs):
label_index = (k * self.sum_stride +
<SIZE_t> y[i * self.y_stride + k])
label_index = k * self.sum_stride + <SIZE_t> self.y[i, k]
sum_left[label_index] += w

self.weighted_n_left += w
Expand All @@ -469,8 +458,7 @@ cdef class ClassificationCriterion(Criterion):
w = sample_weight[i]

for k in range(self.n_outputs):
label_index = (k * self.sum_stride +
<SIZE_t> y[i * self.y_stride + k])
label_index = k * self.sum_stride + <SIZE_t> self.y[i, k]
sum_left[label_index] -= w

self.weighted_n_left -= w
Expand Down Expand Up @@ -714,8 +702,6 @@ cdef class RegressionCriterion(Criterion):
"""

# Default values
self.y = NULL
self.y_stride = 0
self.sample_weight = NULL

self.samples = NULL
Expand Down Expand Up @@ -751,14 +737,13 @@ cdef class RegressionCriterion(Criterion):
def __reduce__(self):
return (type(self), (self.n_outputs, self.n_samples), self.__getstate__())

cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight,
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
SIZE_t end) nogil except -1:
"""Initialize the criterion at node samples[start:end] and
children samples[start:start] and samples[start:end]."""
# Initialize fields
self.y = y
self.y_stride = y_stride
self.sample_weight = sample_weight
self.samples = samples
self.start = start
Expand All @@ -784,7 +769,7 @@ cdef class RegressionCriterion(Criterion):
w = sample_weight[i]

for k in range(self.n_outputs):
y_ik = y[i * y_stride + k]
y_ik = self.y[i, k]
w_y_ik = w * y_ik
self.sum_total[k] += w_y_ik
self.sq_sum_total += w_y_ik * y_ik
Expand Down Expand Up @@ -827,14 +812,12 @@ cdef class RegressionCriterion(Criterion):
cdef double* sample_weight = self.sample_weight
cdef SIZE_t* samples = self.samples

cdef DOUBLE_t* y = self.y
cdef SIZE_t pos = self.pos
cdef SIZE_t end = self.end
cdef SIZE_t i
cdef SIZE_t p
cdef SIZE_t k
cdef DOUBLE_t w = 1.0
cdef DOUBLE_t y_ik

# Update statistics up to new_pos
#
Expand All @@ -852,8 +835,7 @@ cdef class RegressionCriterion(Criterion):
w = sample_weight[i]

for k in range(self.n_outputs):
y_ik = y[i * self.y_stride + k]
sum_left[k] += w * y_ik
sum_left[k] += w * self.y[i, k]

self.weighted_n_left += w
else:
Expand All @@ -866,8 +848,7 @@ cdef class RegressionCriterion(Criterion):
w = sample_weight[i]

for k in range(self.n_outputs):
y_ik = y[i * self.y_stride + k]
sum_left[k] -= w * y_ik
sum_left[k] -= w * self.y[i, k]

self.weighted_n_left -= w

Expand Down Expand Up @@ -947,15 +928,14 @@ cdef class MSE(RegressionCriterion):
left child (samples[start:pos]) and the impurity of the right child
(samples[pos:end])."""


cdef DOUBLE_t* y = self.y
cdef DOUBLE_t* sample_weight = self.sample_weight
cdef SIZE_t* samples = self.samples
cdef SIZE_t pos = self.pos
cdef SIZE_t start = self.start

cdef double* sum_left = self.sum_left
cdef double* sum_right = self.sum_right
cdef DOUBLE_t y_ik

cdef double sq_sum_left = 0.0
cdef double sq_sum_right
Expand All @@ -964,7 +944,6 @@ cdef class MSE(RegressionCriterion):
cdef SIZE_t p
cdef SIZE_t k
cdef DOUBLE_t w = 1.0
cdef DOUBLE_t y_ik

for p in range(start, pos):
i = samples[p]
Expand All @@ -973,7 +952,7 @@ cdef class MSE(RegressionCriterion):
w = sample_weight[i]

for k in range(self.n_outputs):
y_ik = y[i * self.y_stride + k]
y_ik = self.y[i, k]
sq_sum_left += w * y_ik * y_ik

sq_sum_right = self.sq_sum_total - sq_sum_left
Expand Down Expand Up @@ -1014,8 +993,6 @@ cdef class MAE(RegressionCriterion):
"""

# Default values
self.y = NULL
self.y_stride = 0
self.sample_weight = NULL

self.samples = NULL
Expand Down Expand Up @@ -1044,19 +1021,17 @@ cdef class MAE(RegressionCriterion):
self.left_child[k] = WeightedMedianCalculator(n_samples)
self.right_child[k] = WeightedMedianCalculator(n_samples)

cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, DOUBLE_t* sample_weight,
cdef int init(self, const DOUBLE_t[:, ::1] y, DOUBLE_t* sample_weight,
double weighted_n_samples, SIZE_t* samples, SIZE_t start,
SIZE_t end) nogil except -1:
"""Initialize the criterion at node samples[start:end] and
children samples[start:start] and samples[start:end]."""

cdef SIZE_t i, p, k
cdef DOUBLE_t y_ik
cdef DOUBLE_t w = 1.0

# Initialize fields
self.y = y
self.y_stride = y_stride
self.sample_weight = sample_weight
self.samples = samples
self.start = start
Expand All @@ -1082,12 +1057,10 @@ cdef class MAE(RegressionCriterion):
w = sample_weight[i]

for k in range(self.n_outputs):
y_ik = y[i * y_stride + k]

# push method ends up calling safe_realloc, hence `except -1`
# push all values to the right side,
# since pos = start initially anyway
(<WeightedMedianCalculator> right_child[k]).push(y_ik, w)
(<WeightedMedianCalculator> right_child[k]).push(self.y[i, k], w)

self.weighted_n_node_samples += w
# calculate the node medians
Expand Down Expand Up @@ -1172,12 +1145,10 @@ cdef class MAE(RegressionCriterion):
cdef void** left_child = <void**> self.left_child.data
cdef void** right_child = <void**> self.right_child.data

cdef DOUBLE_t* y = self.y
cdef SIZE_t pos = self.pos
cdef SIZE_t end = self.end
cdef SIZE_t i, p, k
cdef DOUBLE_t w = 1.0
cdef DOUBLE_t y_ik

# Update statistics up to new_pos
#
Expand All @@ -1193,11 +1164,10 @@ cdef class MAE(RegressionCriterion):
w = sample_weight[i]

for k in range(self.n_outputs):
y_ik = y[i * self.y_stride + k]
# remove y_ik and its weight w from right and add to left
(<WeightedMedianCalculator> right_child[k]).remove(y_ik, w)
(<WeightedMedianCalculator> right_child[k]).remove(self.y[i, k], w)
# push method ends up calling safe_realloc, hence except -1
(<WeightedMedianCalculator> left_child[k]).push(y_ik, w)
(<WeightedMedianCalculator> left_child[k]).push(self.y[i, k], w)

self.weighted_n_left += w
else:
Expand All @@ -1210,10 +1180,9 @@ cdef class MAE(RegressionCriterion):
w = sample_weight[i]

for k in range(self.n_outputs):
y_ik = y[i * self.y_stride + k]
# remove y_ik and its weight w from left and add to right
(<WeightedMedianCalculator> left_child[k]).remove(y_ik, w)
(<WeightedMedianCalculator> right_child[k]).push(y_ik, w)
(<WeightedMedianCalculator> left_child[k]).remove(self.y[i, k], w)
(<WeightedMedianCalculator> right_child[k]).push(self.y[i, k], w)

self.weighted_n_left -= w

Expand All @@ -1233,24 +1202,20 @@ cdef class MAE(RegressionCriterion):
"""Evaluate the impurity of the current node, i.e. the impurity of
samples[start:end]"""

cdef DOUBLE_t* y = self.y
cdef DOUBLE_t* sample_weight = self.sample_weight
cdef SIZE_t* samples = self.samples
cdef SIZE_t i, p, k
cdef DOUBLE_t y_ik
cdef DOUBLE_t w = 1.0
cdef DOUBLE_t impurity = 0.0

for k in range(self.n_outputs):
for p in range(self.start, self.end):
i = samples[p]

y_ik = y[i * self.y_stride + k]

if sample_weight != NULL:
w = sample_weight[i]

impurity += fabs(y_ik - self.node_medians[k]) * w
impurity += fabs(self.y[i, k] - self.node_medians[k]) * w

return impurity / (self.weighted_n_node_samples * self.n_outputs)

Expand All @@ -1261,7 +1226,6 @@ cdef class MAE(RegressionCriterion):
(samples[pos:end]).
"""

cdef DOUBLE_t* y = self.y
cdef DOUBLE_t* sample_weight = self.sample_weight
cdef SIZE_t* samples = self.samples

Expand All @@ -1270,7 +1234,6 @@ cdef class MAE(RegressionCriterion):
cdef SIZE_t end = self.end

cdef SIZE_t i, p, k
cdef DOUBLE_t y_ik
cdef DOUBLE_t median
cdef DOUBLE_t w = 1.0
cdef DOUBLE_t impurity_left = 0.0
Expand All @@ -1284,12 +1247,10 @@ cdef class MAE(RegressionCriterion):
for p in range(start, pos):
i = samples[p]

y_ik = y[i * self.y_stride + k]

if sample_weight != NULL:
w = sample_weight[i]

impurity_left += fabs(y_ik - median) * w
impurity_left += fabs(self.y[i, k] - median) * w
p_impurity_left[0] = impurity_left / (self.weighted_n_left *
self.n_outputs)

Expand All @@ -1298,12 +1259,10 @@ cdef class MAE(RegressionCriterion):
for p in range(pos, end):
i = samples[p]

y_ik = y[i * self.y_stride + k]

if sample_weight != NULL:
w = sample_weight[i]

impurity_right += fabs(y_ik - median) * w
impurity_right += fabs(self.y[i, k] - median) * w
p_impurity_right[0] = impurity_right / (self.weighted_n_right *
self.n_outputs)

Expand Down
Loading