From 29257947bee9a15d6188ef931bb6b4bd878da7c5 Mon Sep 17 00:00:00 2001
From: Xuefeng Xu <xuxf100@qq.com>
Date: Wed, 22 May 2024 17:36:36 +0800
Subject: [PATCH 1/4] FIX exclude samples with nan distance in KNNImputer for
 uniform weights

---
 sklearn/impute/_knn.py           |  5 +++++
 sklearn/impute/tests/test_knn.py | 17 ++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py
index 64f55693356d6..f8af9327dc919 100644
--- a/sklearn/impute/_knn.py
+++ b/sklearn/impute/_knn.py
@@ -196,6 +196,11 @@ def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col):
         if weight_matrix is not None:
             weight_matrix[np.isnan(weight_matrix)] = 0.0
 
+        # For uniform weights, also fill nans with zeros
+        if weight_matrix is None:
+            weight_matrix = np.ones_like(donors_dist)
+            weight_matrix[np.isnan(donors_dist)] = 0.0
+
         # Retrieve donor values and calculate kNN average
         donors = fit_X_col.take(donors_idx)
         donors_mask = mask_fit_X_col.take(donors_idx)
diff --git a/sklearn/impute/tests/test_knn.py b/sklearn/impute/tests/test_knn.py
index 141c2ea90dbd9..8163a2bff32b0 100644
--- a/sklearn/impute/tests/test_knn.py
+++ b/sklearn/impute/tests/test_knn.py
@@ -239,7 +239,9 @@ def test_knn_imputer_one_n_neighbors(na):
 def test_knn_imputer_all_samples_are_neighbors(na):
     X = np.array([[0, 0], [na, 2], [4, 3], [5, na], [7, 7], [na, 8], [14, 13]])
 
-    X_imputed = np.array([[0, 0], [6, 2], [4, 3], [5, 5.5], [7, 7], [6, 8], [14, 13]])
+    X_imputed = np.array(
+        [[0, 0], [6.25, 2], [4, 3], [5, 5.75], [7, 7], [6.25, 8], [14, 13]]
+    )
 
     n_neighbors = X.shape[0] - 1
     imputer = KNNImputer(n_neighbors=n_neighbors, missing_values=na)
@@ -505,6 +507,19 @@ def test_knn_imputer_not_enough_valid_distances(na, weights):
     assert_allclose(knn.transform(X2), X2_imputed)
 
 
+@pytest.mark.parametrize("na", [-1, np.nan])
+@pytest.mark.parametrize("weights", ["uniform", "distance"])
+def test_knn_imputer_nan_distance(na, weights):
+    # Samples with nan distance should excluded from mean computation
+    X_train = np.array([[1, 1], [na, 2]])
+    X_test = np.array([[0, na]])
+    X_test_expected = np.array([[0, 1]])
+
+    knn = KNNImputer(missing_values=na, n_neighbors=2, weights=weights)
+    knn.fit(X_train)
+    assert_allclose(knn.transform(X_test), X_test_expected)
+
+
 @pytest.mark.parametrize("na", [-1, np.nan])
 def test_knn_imputer_drops_all_nan_features(na):
     X1 = np.array([[na, 1], [na, 2]])

From 1d114f6d2b008fd3203f648e2ea927dc74c611ca Mon Sep 17 00:00:00 2001
From: Xuefeng Xu <xuxf100@qq.com>
Date: Thu, 30 May 2024 14:55:22 +0800
Subject: [PATCH 2/4] simplify weight_matrix nan handling with an else branch

---
 sklearn/impute/_knn.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py
index f8af9327dc919..025e4ce76c134 100644
--- a/sklearn/impute/_knn.py
+++ b/sklearn/impute/_knn.py
@@ -195,9 +195,7 @@ def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col):
         # fill nans with zeros
         if weight_matrix is not None:
             weight_matrix[np.isnan(weight_matrix)] = 0.0
-
-        # For uniform weights, also fill nans with zeros
-        if weight_matrix is None:
+        else:
             weight_matrix = np.ones_like(donors_dist)
             weight_matrix[np.isnan(donors_dist)] = 0.0
 

From 87610012cf9e693cafd2fee2a9f75cf7acc39e80 Mon Sep 17 00:00:00 2001
From: Xuefeng Xu <xuxf100@qq.com>
Date: Thu, 6 Jun 2024 16:12:04 +0800
Subject: [PATCH 3/4] add changelog

---
 doc/whats_new/v1.6.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst
index 3fd07fd51578e..f4d1465256c89 100644
--- a/doc/whats_new/v1.6.rst
+++ b/doc/whats_new/v1.6.rst
@@ -89,6 +89,13 @@ Changelog
   whether a given estimator is of category clusterer.
   :pr:`28936` by :user:`Christian Veenhuis <ChVeen>`.
 
+:mod:`sklearn.impute`
+.....................
+
+- |Fix| :class:`impute.KNNImputer` excludes samples with nan distances when
+  computing the mean value for uniform weights.
+  :pr:`29135` by :user:`Xuefeng Xu <xuefeng-xu>`.
+
 :mod:`sklearn.linear_model`
 ...........................
 

From be7dab84e0c711d48484683db03991e39d237ab2 Mon Sep 17 00:00:00 2001
From: Xuefeng Xu <xuxf100@qq.com>
Date: Thu, 6 Jun 2024 23:11:05 +0800
Subject: [PATCH 4/4] add a second test case with more rows and columns

---
 sklearn/impute/tests/test_knn.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/sklearn/impute/tests/test_knn.py b/sklearn/impute/tests/test_knn.py
index 8163a2bff32b0..dc516e04b9402 100644
--- a/sklearn/impute/tests/test_knn.py
+++ b/sklearn/impute/tests/test_knn.py
@@ -510,14 +510,22 @@ def test_knn_imputer_not_enough_valid_distances(na, weights):
 @pytest.mark.parametrize("na", [-1, np.nan])
 @pytest.mark.parametrize("weights", ["uniform", "distance"])
 def test_knn_imputer_nan_distance(na, weights):
-    # Samples with nan distance should excluded from mean computation
-    X_train = np.array([[1, 1], [na, 2]])
-    X_test = np.array([[0, na]])
-    X_test_expected = np.array([[0, 1]])
-
-    knn = KNNImputer(missing_values=na, n_neighbors=2, weights=weights)
-    knn.fit(X_train)
-    assert_allclose(knn.transform(X_test), X_test_expected)
+    # Samples with nan distance should be excluded from the mean computation
+    X1_train = np.array([[1, 1], [na, 2]])
+    X1_test = np.array([[0, na]])
+    X1_test_expected = np.array([[0, 1]])
+
+    knn1 = KNNImputer(n_neighbors=2, missing_values=na, weights=weights)
+    knn1.fit(X1_train)
+    assert_allclose(knn1.transform(X1_test), X1_test_expected)
+
+    X2_train = np.array([[na, 1, 1], [2, na, 2], [3, 3, na]])
+    X2_test = np.array([[na, 0, na], [0, na, na], [na, na, 0]])
+    X2_test_expected = np.array([[3, 0, 1], [0, 3, 2], [2, 1, 0]])
+
+    knn2 = KNNImputer(n_neighbors=2, missing_values=na, weights=weights)
+    knn2.fit(X2_train)
+    assert_allclose(knn2.transform(X2_test), X2_test_expected)
 
 
 @pytest.mark.parametrize("na", [-1, np.nan])