From 29257947bee9a15d6188ef931bb6b4bd878da7c5 Mon Sep 17 00:00:00 2001 From: Xuefeng Xu Date: Wed, 22 May 2024 17:36:36 +0800 Subject: [PATCH 1/4] FIX exclude samples with nan distance in KNNImputer for uniform weights --- sklearn/impute/_knn.py | 5 +++++ sklearn/impute/tests/test_knn.py | 17 ++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index 64f55693356d6..f8af9327dc919 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -196,6 +196,11 @@ def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col): if weight_matrix is not None: weight_matrix[np.isnan(weight_matrix)] = 0.0 + # For uniform weights, also fill nans with zeros + if weight_matrix is None: + weight_matrix = np.ones_like(donors_dist) + weight_matrix[np.isnan(donors_dist)] = 0.0 + # Retrieve donor values and calculate kNN average donors = fit_X_col.take(donors_idx) donors_mask = mask_fit_X_col.take(donors_idx) diff --git a/sklearn/impute/tests/test_knn.py b/sklearn/impute/tests/test_knn.py index 141c2ea90dbd9..8163a2bff32b0 100644 --- a/sklearn/impute/tests/test_knn.py +++ b/sklearn/impute/tests/test_knn.py @@ -239,7 +239,9 @@ def test_knn_imputer_one_n_neighbors(na): def test_knn_imputer_all_samples_are_neighbors(na): X = np.array([[0, 0], [na, 2], [4, 3], [5, na], [7, 7], [na, 8], [14, 13]]) - X_imputed = np.array([[0, 0], [6, 2], [4, 3], [5, 5.5], [7, 7], [6, 8], [14, 13]]) + X_imputed = np.array( + [[0, 0], [6.25, 2], [4, 3], [5, 5.75], [7, 7], [6.25, 8], [14, 13]] + ) n_neighbors = X.shape[0] - 1 imputer = KNNImputer(n_neighbors=n_neighbors, missing_values=na) @@ -505,6 +507,19 @@ def test_knn_imputer_not_enough_valid_distances(na, weights): assert_allclose(knn.transform(X2), X2_imputed) +@pytest.mark.parametrize("na", [-1, np.nan]) +@pytest.mark.parametrize("weights", ["uniform", "distance"]) +def test_knn_imputer_nan_distance(na, weights): + # Samples with nan distance should 
excluded from mean computation + X_train = np.array([[1, 1], [na, 2]]) + X_test = np.array([[0, na]]) + X_test_expected = np.array([[0, 1]]) + + knn = KNNImputer(missing_values=na, n_neighbors=2, weights=weights) + knn.fit(X_train) + assert_allclose(knn.transform(X_test), X_test_expected) + + @pytest.mark.parametrize("na", [-1, np.nan]) def test_knn_imputer_drops_all_nan_features(na): X1 = np.array([[na, 1], [na, 2]]) From 1d114f6d2b008fd3203f648e2ea927dc74c611ca Mon Sep 17 00:00:00 2001 From: Xuefeng Xu Date: Thu, 30 May 2024 14:55:22 +0800 Subject: [PATCH 2/4] update weight_matrix --- sklearn/impute/_knn.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index f8af9327dc919..025e4ce76c134 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -195,9 +195,7 @@ def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col): # fill nans with zeros if weight_matrix is not None: weight_matrix[np.isnan(weight_matrix)] = 0.0 - - # For uniform weights, also fill nans with zeros - if weight_matrix is None: + else: weight_matrix = np.ones_like(donors_dist) weight_matrix[np.isnan(donors_dist)] = 0.0 From 87610012cf9e693cafd2fee2a9f75cf7acc39e80 Mon Sep 17 00:00:00 2001 From: Xuefeng Xu Date: Thu, 6 Jun 2024 16:12:04 +0800 Subject: [PATCH 3/4] add changelog --- doc/whats_new/v1.6.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 3fd07fd51578e..f4d1465256c89 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -89,6 +89,13 @@ Changelog whether a given estimator is of category clusterer. :pr:`28936` by :user:`Christian Veenhuis <ChVeen>`. +:mod:`sklearn.impute` +..................... + +- |Fix| :class:`impute.KNNImputer` excludes samples with nan distances when + computing the mean value for uniform weights. + :pr:`29135` by :user:`Xuefeng Xu <xuefeng-xu>`. + :mod:`sklearn.linear_model` ........................... 
From be7dab84e0c711d48484683db03991e39d237ab2 Mon Sep 17 00:00:00 2001 From: Xuefeng Xu Date: Thu, 6 Jun 2024 23:11:05 +0800 Subject: [PATCH 4/4] add another test data with more rows and columns --- sklearn/impute/tests/test_knn.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/sklearn/impute/tests/test_knn.py b/sklearn/impute/tests/test_knn.py index 8163a2bff32b0..dc516e04b9402 100644 --- a/sklearn/impute/tests/test_knn.py +++ b/sklearn/impute/tests/test_knn.py @@ -510,14 +510,22 @@ def test_knn_imputer_not_enough_valid_distances(na, weights): @pytest.mark.parametrize("na", [-1, np.nan]) @pytest.mark.parametrize("weights", ["uniform", "distance"]) def test_knn_imputer_nan_distance(na, weights): - # Samples with nan distance should excluded from mean computation - X_train = np.array([[1, 1], [na, 2]]) - X_test = np.array([[0, na]]) - X_test_expected = np.array([[0, 1]]) - - knn = KNNImputer(missing_values=na, n_neighbors=2, weights=weights) - knn.fit(X_train) - assert_allclose(knn.transform(X_test), X_test_expected) + # Samples with nan distance should be excluded from the mean computation + X1_train = np.array([[1, 1], [na, 2]]) + X1_test = np.array([[0, na]]) + X1_test_expected = np.array([[0, 1]]) + + knn1 = KNNImputer(n_neighbors=2, missing_values=na, weights=weights) + knn1.fit(X1_train) + assert_allclose(knn1.transform(X1_test), X1_test_expected) + + X2_train = np.array([[na, 1, 1], [2, na, 2], [3, 3, na]]) + X2_test = np.array([[na, 0, na], [0, na, na], [na, na, 0]]) + X2_test_expected = np.array([[3, 0, 1], [0, 3, 2], [2, 1, 0]]) + + knn2 = KNNImputer(n_neighbors=2, missing_values=na, weights=weights) + knn2.fit(X2_train) + assert_allclose(knn2.transform(X2_test), X2_test_expected) @pytest.mark.parametrize("na", [-1, np.nan])