scikit-learn · OmarManzoor · Jun 6, 2024 · May 22, 2024 · May 30, 2024 · Jun 6, 2024
diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst
@@ -96,6 +96,13 @@ Changelog
   can be silenced using the `reg_param` attribute.
   :pr:`19731` by :user:`Alihan Zihna <azihna>`.
 
+:mod:`sklearn.impute`
+.....................
+
+- |Fix| :class:`impute.KNNImputer` now excludes samples whose distance is NaN
+  when computing the mean value for uniform weights.
+  :pr:`29135` by :user:`Xuefeng Xu <xuefeng-xu>`.
+
 :mod:`sklearn.linear_model`
 ...........................
 

diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py
@@ -195,6 +195,9 @@ def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col):
         # fill nans with zeros
         if weight_matrix is not None:
             weight_matrix[np.isnan(weight_matrix)] = 0.0
+        else:
+            weight_matrix = np.ones_like(donors_dist)
+            weight_matrix[np.isnan(donors_dist)] = 0.0
 
         # Retrieve donor values and calculate kNN average
         donors = fit_X_col.take(donors_idx)

diff --git a/sklearn/impute/tests/test_knn.py b/sklearn/impute/tests/test_knn.py
@@ -239,7 +239,9 @@ def test_knn_imputer_one_n_neighbors(na):
 def test_knn_imputer_all_samples_are_neighbors(na):
     X = np.array([[0, 0], [na, 2], [4, 3], [5, na], [7, 7], [na, 8], [14, 13]])
 
-    X_imputed = np.array([[0, 0], [6, 2], [4, 3], [5, 5.5], [7, 7], [6, 8], [14, 13]])
+    X_imputed = np.array(
+        [[0, 0], [6.25, 2], [4, 3], [5, 5.75], [7, 7], [6.25, 8], [14, 13]]
+    )
 
     n_neighbors = X.shape[0] - 1
     imputer = KNNImputer(n_neighbors=n_neighbors, missing_values=na)
@@ -505,6 +507,27 @@ def test_knn_imputer_not_enough_valid_distances(na, weights):
     assert_allclose(knn.transform(X2), X2_imputed)
 
 
+@pytest.mark.parametrize("na", [-1, np.nan])
+@pytest.mark.parametrize("weights", ["uniform", "distance"])
+def test_knn_imputer_nan_distance(na, weights):
+    # Samples with nan distance should be excluded from the mean computation
+    X1_train = np.array([[1, 1], [na, 2]])
+    X1_test = np.array([[0, na]])
+    X1_test_expected = np.array([[0, 1]])
+
+    knn1 = KNNImputer(n_neighbors=2, missing_values=na, weights=weights)
+    knn1.fit(X1_train)
+    assert_allclose(knn1.transform(X1_test), X1_test_expected)
+
+    X2_train = np.array([[na, 1, 1], [2, na, 2], [3, 3, na]])
+    X2_test = np.array([[na, 0, na], [0, na, na], [na, na, 0]])
+    X2_test_expected = np.array([[3, 0, 1], [0, 3, 2], [2, 1, 0]])
+
+    knn2 = KNNImputer(n_neighbors=2, missing_values=na, weights=weights)
+    knn2.fit(X2_train)
+    assert_allclose(knn2.transform(X2_test), X2_test_expected)
+
+
 @pytest.mark.parametrize("na", [-1, np.nan])
 def test_knn_imputer_drops_all_nan_features(na):
     X1 = np.array([[na, 1], [na, 2]])