diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst
index 1d203049d94e1..b856cad3b98c6 100644
--- a/doc/whats_new/v1.6.rst
+++ b/doc/whats_new/v1.6.rst
@@ -96,6 +96,13 @@ Changelog
   can be silenced using the `reg_param` attribute.
   :pr:`19731` by :user:`Alihan Zihna <azihna>`.
 
+:mod:`sklearn.impute`
+.....................
+
+- |Fix| :class:`impute.KNNImputer` excludes samples with nan distances when
+  computing the mean value for uniform weights.
+  :pr:`29135` by :user:`Xuefeng Xu <xuefeng-xu>`.
+
 :mod:`sklearn.linear_model`
 ...........................
 
diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py
index 64f55693356d6..025e4ce76c134 100644
--- a/sklearn/impute/_knn.py
+++ b/sklearn/impute/_knn.py
@@ -195,6 +195,9 @@ def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col):
         # fill nans with zeros
         if weight_matrix is not None:
             weight_matrix[np.isnan(weight_matrix)] = 0.0
+        else:
+            weight_matrix = np.ones_like(donors_dist)
+            weight_matrix[np.isnan(donors_dist)] = 0.0
 
         # Retrieve donor values and calculate kNN average
         donors = fit_X_col.take(donors_idx)
diff --git a/sklearn/impute/tests/test_knn.py b/sklearn/impute/tests/test_knn.py
index 141c2ea90dbd9..dc516e04b9402 100644
--- a/sklearn/impute/tests/test_knn.py
+++ b/sklearn/impute/tests/test_knn.py
@@ -239,7 +239,9 @@ def test_knn_imputer_one_n_neighbors(na):
 def test_knn_imputer_all_samples_are_neighbors(na):
     X = np.array([[0, 0], [na, 2], [4, 3], [5, na], [7, 7], [na, 8], [14, 13]])
 
-    X_imputed = np.array([[0, 0], [6, 2], [4, 3], [5, 5.5], [7, 7], [6, 8], [14, 13]])
+    X_imputed = np.array(
+        [[0, 0], [6.25, 2], [4, 3], [5, 5.75], [7, 7], [6.25, 8], [14, 13]]
+    )
 
     n_neighbors = X.shape[0] - 1
     imputer = KNNImputer(n_neighbors=n_neighbors, missing_values=na)
@@ -505,6 +507,27 @@ def test_knn_imputer_not_enough_valid_distances(na, weights):
     assert_allclose(knn.transform(X2), X2_imputed)
 
 
+@pytest.mark.parametrize("na", [-1, np.nan])
+@pytest.mark.parametrize("weights", ["uniform", "distance"])
+def test_knn_imputer_nan_distance(na, weights):
+    # Samples with nan distance should be excluded from the mean computation
+    X1_train = np.array([[1, 1], [na, 2]])
+    X1_test = np.array([[0, na]])
+    X1_test_expected = np.array([[0, 1]])
+
+    knn1 = KNNImputer(n_neighbors=2, missing_values=na, weights=weights)
+    knn1.fit(X1_train)
+    assert_allclose(knn1.transform(X1_test), X1_test_expected)
+
+    X2_train = np.array([[na, 1, 1], [2, na, 2], [3, 3, na]])
+    X2_test = np.array([[na, 0, na], [0, na, na], [na, na, 0]])
+    X2_test_expected = np.array([[3, 0, 1], [0, 3, 2], [2, 1, 0]])
+
+    knn2 = KNNImputer(n_neighbors=2, missing_values=na, weights=weights)
+    knn2.fit(X2_train)
+    assert_allclose(knn2.transform(X2_test), X2_test_expected)
+
+
 @pytest.mark.parametrize("na", [-1, np.nan])
 def test_knn_imputer_drops_all_nan_features(na):
     X1 = np.array([[na, 1], [na, 2]])