
CLN Renames missing_values_in_feature_mask #26580

Merged · 2 commits · Jun 14, 2023
23 changes: 15 additions & 8 deletions sklearn/tree/_classes.py
@@ -180,7 +180,7 @@ def get_n_leaves(self):
     def _support_missing_values(self, X):
         return not issparse(X) and self._get_tags()["allow_nan"]
 
-    def _compute_feature_has_missing(self, X):
+    def _compute_missing_values_in_feature_mask(self, X):
         """Return boolean mask denoting if there are missing values for each feature.
 
         This method also ensures that X is finite.
@@ -192,7 +192,7 @@ def _compute_feature_has_missing(self, X):
 
         Returns
         -------
-        feature_has_missing : ndarray of shape (n_features,), or None
+        missing_values_in_feature_mask : ndarray of shape (n_features,), or None
             Missing value mask. If missing values are not supported or there
             are no missing values, return None.
         """
@@ -213,11 +213,16 @@ def _compute_feature_has_missing(self, X):
         if not np.isnan(overall_sum):
             return None
 
-        feature_has_missing = _any_isnan_axis0(X)
-        return feature_has_missing
+        missing_values_in_feature_mask = _any_isnan_axis0(X)
+        return missing_values_in_feature_mask
 
     def _fit(
-        self, X, y, sample_weight=None, check_input=True, feature_has_missing=None
+        self,
+        X,
+        y,
+        sample_weight=None,
+        check_input=True,
+        missing_values_in_feature_mask=None,
     ):
         self._validate_params()
         random_state = check_random_state(self.random_state)
@@ -227,7 +232,7 @@ def _fit(
             # We can't pass multi_output=True because that would allow y to be
             # csr.
 
-            # _compute_feature_has_missing will check for finite values and
+            # _compute_missing_values_in_feature_mask will check for finite values and
             # compute the missing mask if the tree supports missing values
             check_X_params = dict(
                 dtype=DTYPE, accept_sparse="csc", force_all_finite=False
@@ -237,7 +242,9 @@ def _fit(
                 X, y, validate_separately=(check_X_params, check_y_params)
             )
 
-            feature_has_missing = self._compute_feature_has_missing(X)
+            missing_values_in_feature_mask = (
+                self._compute_missing_values_in_feature_mask(X)
+            )
             if issparse(X):
                 X.sort_indices()
 
@@ -432,7 +439,7 @@ def _fit(
             self.min_impurity_decrease,
         )
 
-        builder.build(self.tree_, X, y, sample_weight, feature_has_missing)
+        builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask)
 
         if self.n_outputs_ == 1 and is_classifier(self):
             self.n_classes_ = self.n_classes_[0]
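The renamed helper's logic is compact enough to restate outside the diff. Below is a minimal NumPy sketch of what the hunks above compute, assuming dense float input; the standalone function name is illustrative, and np.isnan(X).any(axis=0) stands in for the private _any_isnan_axis0 helper the diff references. The real method additionally rejects infinite values (the "ensures that X is finite" part of the docstring), which this sketch omits.

import numpy as np

def compute_missing_values_in_feature_mask(X):
    """Boolean mask of features that contain NaN, or None if there are none."""
    # One reduction over the whole array: NaN propagates through the sum,
    # so a finite result proves there are no missing values anywhere.
    overall_sum = np.sum(X)
    if not np.isnan(overall_sum):
        return None
    # Pay for the per-feature scan only when at least one NaN exists.
    return np.isnan(X).any(axis=0)  # shape (n_features,)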
2 changes: 1 addition & 1 deletion sklearn/tree/_splitter.pxd
@@ -81,7 +81,7 @@ cdef class Splitter:
         object X,
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
-        const unsigned char[::1] feature_has_missing,
+        const unsigned char[::1] missing_values_in_feature_mask,
     ) except -1
 
     cdef int node_reset(
44 changes: 22 additions & 22 deletions sklearn/tree/_splitter.pyx
@@ -107,7 +107,7 @@ cdef class Splitter:
         object X,
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
-        const unsigned char[::1] feature_has_missing,
+        const unsigned char[::1] missing_values_in_feature_mask,
     ) except -1:
         """Initialize the splitter.
 
@@ -172,7 +172,7 @@ cdef class Splitter:
         self.y = y
 
         self.sample_weight = sample_weight
-        if feature_has_missing is not None:
+        if missing_values_in_feature_mask is not None:
             self.criterion.init_sum_missing()
         return 0
 
@@ -808,19 +808,19 @@ cdef class DensePartitioner:
     cdef SIZE_t start
     cdef SIZE_t end
     cdef SIZE_t n_missing
-    cdef const unsigned char[::1] feature_has_missing
+    cdef const unsigned char[::1] missing_values_in_feature_mask
 
     def __init__(
         self,
         const DTYPE_t[:, :] X,
         SIZE_t[::1] samples,
         DTYPE_t[::1] feature_values,
-        const unsigned char[::1] feature_has_missing,
+        const unsigned char[::1] missing_values_in_feature_mask,
     ):
         self.X = X
         self.samples = samples
         self.feature_values = feature_values
-        self.feature_has_missing = feature_has_missing
+        self.missing_values_in_feature_mask = missing_values_in_feature_mask
 
     cdef inline void init_node_split(self, SIZE_t start, SIZE_t end) noexcept nogil:
         """Initialize splitter at the beginning of node_split."""
@@ -843,13 +843,13 @@ cdef class DensePartitioner:
             const DTYPE_t[:, :] X = self.X
             SIZE_t[::1] samples = self.samples
             SIZE_t n_missing = 0
-            const unsigned char[::1] feature_has_missing = self.feature_has_missing
+            const unsigned char[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask
 
         # Sort samples along that feature; by
         # copying the values into an array and
         # sorting the array in a manner which utilizes the cache more
         # effectively.
-        if feature_has_missing is not None and feature_has_missing[current_feature]:
+        if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]:
             i, current_end = self.start, self.end - 1
             # Missing values are placed at the end and do not participate in the sorting.
             while i <= current_end:
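This hunk touches the one place where the mask changes behavior rather than just a name: when the bit for the current feature is set, samples with NaN are swapped to the tail of the active range so the sort only touches finite values. A rough Python rendering of that two-pointer pass follows; the standalone function and the swap idiom are illustrative, while the real code works in place on Cython memoryviews.

import math

def move_missing_to_end(X, samples, start, end, current_feature):
    """Swap rows with NaN in current_feature to the tail of samples[start:end]."""
    n_missing = 0
    i, current_end = start, end - 1
    while i <= current_end:
        if math.isnan(X[samples[i]][current_feature]):
            # Swap the missing sample to the end, then recheck the value that
            # just arrived at position i before advancing.
            samples[i], samples[current_end] = samples[current_end], samples[i]
            current_end -= 1
            n_missing += 1
        else:
            i += 1
    return n_missing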
@@ -1018,7 +1018,7 @@ cdef class SparsePartitioner:
     cdef SIZE_t start
     cdef SIZE_t end
     cdef SIZE_t n_missing
-    cdef const unsigned char[::1] feature_has_missing
+    cdef const unsigned char[::1] missing_values_in_feature_mask
 
     cdef const DTYPE_t[::1] X_data
     cdef const INT32_t[::1] X_indices
@@ -1039,7 +1039,7 @@ cdef class SparsePartitioner:
         SIZE_t[::1] samples,
         SIZE_t n_samples,
         DTYPE_t[::1] feature_values,
-        const unsigned char[::1] feature_has_missing,
+        const unsigned char[::1] missing_values_in_feature_mask,
     ):
         if not isspmatrix_csc(X):
             raise ValueError("X should be in csc format")
@@ -1063,7 +1063,7 @@ cdef class SparsePartitioner:
         for p in range(n_samples):
             self.index_to_samples[samples[p]] = p
 
-        self.feature_has_missing = feature_has_missing
+        self.missing_values_in_feature_mask = missing_values_in_feature_mask
 
     cdef inline void init_node_split(self, SIZE_t start, SIZE_t end) noexcept nogil:
         """Initialize splitter at the beginning of node_split."""
@@ -1434,11 +1434,11 @@ cdef class BestSplitter(Splitter):
         object X,
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
-        const unsigned char[::1] feature_has_missing,
+        const unsigned char[::1] missing_values_in_feature_mask,
     ) except -1:
-        Splitter.init(self, X, y, sample_weight, feature_has_missing)
+        Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask)
         self.partitioner = DensePartitioner(
-            X, self.samples, self.feature_values, feature_has_missing
+            X, self.samples, self.feature_values, missing_values_in_feature_mask
         )
 
     cdef int node_split(self, double impurity, SplitRecord* split,
@@ -1460,11 +1460,11 @@ cdef class BestSparseSplitter(Splitter):
         object X,
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
-        const unsigned char[::1] feature_has_missing,
+        const unsigned char[::1] missing_values_in_feature_mask,
     ) except -1:
-        Splitter.init(self, X, y, sample_weight, feature_has_missing)
+        Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask)
         self.partitioner = SparsePartitioner(
-            X, self.samples, self.n_samples, self.feature_values, feature_has_missing
+            X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask
         )
 
     cdef int node_split(self, double impurity, SplitRecord* split,
@@ -1486,11 +1486,11 @@ cdef class RandomSplitter(Splitter):
         object X,
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
-        const unsigned char[::1] feature_has_missing,
+        const unsigned char[::1] missing_values_in_feature_mask,
     ) except -1:
-        Splitter.init(self, X, y, sample_weight, feature_has_missing)
+        Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask)
         self.partitioner = DensePartitioner(
-            X, self.samples, self.feature_values, feature_has_missing
+            X, self.samples, self.feature_values, missing_values_in_feature_mask
         )
 
     cdef int node_split(self, double impurity, SplitRecord* split,
@@ -1512,11 +1512,11 @@ cdef class RandomSparseSplitter(Splitter):
         object X,
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight,
-        const unsigned char[::1] feature_has_missing,
+        const unsigned char[::1] missing_values_in_feature_mask,
     ) except -1:
-        Splitter.init(self, X, y, sample_weight, feature_has_missing)
+        Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask)
         self.partitioner = SparsePartitioner(
-            X, self.samples, self.n_samples, self.feature_values, feature_has_missing
+            X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask
         )
 
     cdef int node_split(self, double impurity, SplitRecord* split,
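All four concrete splitters repeat the same two-step pattern, so the rename has to land in both steps: the shared Splitter.init arms the criterion's missing-value accumulators when a mask is present, and each subclass hands the same mask to its partitioner. A schematic Python reduction of that pattern, with stubs standing in for the Cython criterion and the attributes not shown in the diff:

class _CriterionStub:
    def init_sum_missing(self):
        # Placeholder: the real criterion allocates running statistics for
        # the missing-value block here.
        pass

class DensePartitioner:
    def __init__(self, X, samples, feature_values, missing_values_in_feature_mask):
        self.X = X
        self.samples = samples
        self.feature_values = feature_values
        self.missing_values_in_feature_mask = missing_values_in_feature_mask

class Splitter:
    def __init__(self):
        self.criterion = _CriterionStub()
        self.samples = []
        self.feature_values = []

    def init(self, X, y, sample_weight, missing_values_in_feature_mask):
        self.y = y
        self.sample_weight = sample_weight
        if missing_values_in_feature_mask is not None:
            self.criterion.init_sum_missing()
        return 0

class BestSplitter(Splitter):
    def init(self, X, y, sample_weight, missing_values_in_feature_mask):
        # The same mask flows to the base class (criterion setup) and to the
        # partitioner (per-feature NaN handling during sorting).
        Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask)
        self.partitioner = DensePartitioner(
            X, self.samples, self.feature_values, missing_values_in_feature_mask
        )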
2 changes: 1 addition & 1 deletion sklearn/tree/_tree.pxd
@@ -107,7 +107,7 @@ cdef class TreeBuilder:
         object X,
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight=*,
-        const unsigned char[::1] feature_has_missing=*,
+        const unsigned char[::1] missing_values_in_feature_mask=*,
     )
 
     cdef _check_input(
10 changes: 5 additions & 5 deletions sklearn/tree/_tree.pyx
@@ -94,7 +94,7 @@ cdef class TreeBuilder:
         object X,
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight=None,
-        const unsigned char[::1] feature_has_missing=None,
+        const unsigned char[::1] missing_values_in_feature_mask=None,
     ):
         """Build a decision tree from the training set (X, y)."""
         pass
@@ -168,7 +168,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
         object X,
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight=None,
-        const unsigned char[::1] feature_has_missing=None,
+        const unsigned char[::1] missing_values_in_feature_mask=None,
     ):
         """Build a decision tree from the training set (X, y)."""
 
@@ -194,7 +194,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder):
         cdef double min_impurity_decrease = self.min_impurity_decrease
 
         # Recursive partition (without actual recursion)
-        splitter.init(X, y, sample_weight, feature_has_missing)
+        splitter.init(X, y, sample_weight, missing_values_in_feature_mask)
 
         cdef SIZE_t start
         cdef SIZE_t end
@@ -366,7 +366,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
         object X,
         const DOUBLE_t[:, ::1] y,
         const DOUBLE_t[:] sample_weight=None,
-        const unsigned char[::1] feature_has_missing=None,
+        const unsigned char[::1] missing_values_in_feature_mask=None,
     ):
         """Build a decision tree from the training set (X, y)."""
 
@@ -378,7 +378,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder):
         cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes
 
         # Recursive partition (without actual recursion)
-        splitter.init(X, y, sample_weight, feature_has_missing)
+        splitter.init(X, y, sample_weight, missing_values_in_feature_mask)
 
         cdef vector[FrontierRecord] frontier
         cdef FrontierRecord record
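End to end, this plumbing is what lets the public tree estimators accept NaN in dense input. A short usage sketch, assuming scikit-learn 1.3 or later (the release cycle this PR belongs to) and made-up data:

import numpy as np
from sklearn.tree import DecisionTreeClassifier

X = np.array([[1.0, 2.0],
              [np.nan, 3.0],
              [4.0, np.nan],
              [5.0, 6.0]])
y = np.array([0, 0, 1, 1])

# _fit computes missing_values_in_feature_mask from X and threads it through
# builder.build and splitter.init exactly as shown in the diffs above.
clf = DecisionTreeClassifier(random_state=0).fit(X, y)
print(clf.predict([[np.nan, 2.5]]))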