diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
index 7ba9c8a119836..9cab0db995c5d 100644
--- a/doc/whats_new/v1.3.rst
+++ b/doc/whats_new/v1.3.rst
@@ -516,7 +516,7 @@ Changelog
   :class:`tree.DecisionTreeClassifier` support missing values when
   `splitter='best'` and criterion is `gini`, `entropy`, or `log_loss`,
   for classification or `squared_error`, `friedman_mse`, or `poisson`
-  for regression. :pr:`23595` by `Thomas Fan`_.
+  for regression. :pr:`23595`, :pr:`26376` by `Thomas Fan`_.

 - |Enhancement| Adds a `class_names` parameter to
   :func:`tree.export_text`. This allows specifying the parameter `class_names`
diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx
index 2f8a99fe7a26e..ef3b3e119d3aa 100644
--- a/sklearn/tree/_criterion.pyx
+++ b/sklearn/tree/_criterion.pyx
@@ -838,7 +838,9 @@ cdef class RegressionCriterion(Criterion):
         self.sample_indices[-n_missing:]
         """
         cdef SIZE_t i, p, k
-        cdef DOUBLE_t w = 0.0
+        cdef DOUBLE_t y_ik
+        cdef DOUBLE_t w_y_ik
+        cdef DOUBLE_t w = 1.0

         self.n_missing = n_missing
         if n_missing == 0:
@@ -855,7 +857,9 @@ cdef class RegressionCriterion(Criterion):
                 w = self.sample_weight[i]

             for k in range(self.n_outputs):
-                self.sum_missing[k] += w
+                y_ik = self.y[i, k]
+                w_y_ik = w * y_ik
+                self.sum_missing[k] += w_y_ik

             self.weighted_n_missing += w
diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py
index ea3e40fddb7a5..eefae6cdaa3f6 100644
--- a/sklearn/tree/tests/test_tree.py
+++ b/sklearn/tree/tests/test_tree.py
@@ -2549,7 +2549,8 @@ def test_missing_values_poisson():
         (datasets.make_classification, DecisionTreeClassifier),
     ],
 )
-def test_missing_values_is_resilience(make_data, Tree):
+@pytest.mark.parametrize("sample_weight_train", [None, "ones"])
+def test_missing_values_is_resilience(make_data, Tree, sample_weight_train):
     """Check that trees can deal with missing values and have decent performance."""
     rng = np.random.RandomState(0)
@@ -2563,15 +2564,18 @@ def test_missing_values_is_resilience(make_data, Tree):
         X_missing, y, random_state=0
     )

+    if sample_weight_train == "ones":
+        sample_weight_train = np.ones(X_missing_train.shape[0])
+
     # Train tree with missing values
     tree_with_missing = Tree(random_state=rng)
-    tree_with_missing.fit(X_missing_train, y_train)
+    tree_with_missing.fit(X_missing_train, y_train, sample_weight=sample_weight_train)
     score_with_missing = tree_with_missing.score(X_missing_test, y_test)

     # Train tree without missing values
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
     tree = Tree(random_state=rng)
-    tree.fit(X_train, y_train)
+    tree.fit(X_train, y_train, sample_weight=sample_weight_train)
     score_without_missing = tree.score(X_test, y_test)

     # Score is still 90 percent of the tree's score that had no missing values
@@ -2601,3 +2605,32 @@ def test_missing_value_is_predictive():

     assert tree.score(X_train, y_train) >= 0.85
     assert tree.score(X_test, y_test) >= 0.85
+
+
+@pytest.mark.parametrize(
+    "make_data, Tree",
+    [
+        (datasets.make_regression, DecisionTreeRegressor),
+        (datasets.make_classification, DecisionTreeClassifier),
+    ],
+)
+def test_sample_weight_non_uniform(make_data, Tree):
+    """Check sample weight is correctly handled with missing values."""
+    rng = np.random.RandomState(0)
+    n_samples, n_features = 1000, 10
+    X, y = make_data(n_samples=n_samples, n_features=n_features, random_state=rng)
+
+    # Create dataset with missing values
+    X[rng.choice([False, True], size=X.shape, p=[0.9, 0.1])] = np.nan
+
+    # Zero sample weight is the same as removing the sample
+    sample_weight = np.ones(X.shape[0])
+    sample_weight[::2] = 0.0
+
+    tree_with_sw = Tree(random_state=0)
+    tree_with_sw.fit(X, y, sample_weight=sample_weight)
+
+    tree_samples_removed = Tree(random_state=0)
+    tree_samples_removed.fit(X[1::2, :], y[1::2])
+
+    assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X))
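For readers following along: the `_criterion.pyx` hunks above change `RegressionCriterion.init_missing` so that `sum_missing[k]` accumulates the weighted target `w * y[i, k]` rather than only the weight `w`, with `w` defaulting to `1.0` when no `sample_weight` is passed. The standalone NumPy sketch below only illustrates that accumulation; the helper name `init_missing_sums` and its signature are hypothetical and not part of scikit-learn.

import numpy as np


def init_missing_sums(y, sample_weight, missing_indices):
    # Illustrative helper, NOT scikit-learn API: mirrors the corrected
    # RegressionCriterion.init_missing accumulation from the diff above.
    n_outputs = y.shape[1]
    sum_missing = np.zeros(n_outputs, dtype=np.float64)
    weighted_n_missing = 0.0
    for i in missing_indices:
        # Default weight is 1.0 when no sample_weight is given, which is why
        # the Cython declaration changes from `w = 0.0` to `w = 1.0`.
        w = 1.0 if sample_weight is None else sample_weight[i]
        for k in range(n_outputs):
            # The fix: accumulate the weighted target w * y[i, k] instead of
            # just the weight w.
            sum_missing[k] += w * y[i, k]
        weighted_n_missing += w
    return sum_missing, weighted_n_missing


# Sanity check: with unit weights, sum_missing is just the per-output column
# sum of y over the rows with missing values.
rng = np.random.RandomState(0)
y = rng.normal(size=(10, 2))
missing = np.array([1, 4, 7])
sums, total_w = init_missing_sums(y, None, missing)
assert np.allclose(sums, y[missing].sum(axis=0))
assert total_w == len(missing)

With this accumulation, a sample whose weight is zero contributes nothing to `sum_missing` or `weighted_n_missing`, which is the removal-equivalence exercised by the new `test_sample_weight_non_uniform` test.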