FIX Improves nan support in LabelEncoder (#22629)

thomasjpfan · web-flow · commit 9c4f023528ca · 2022-10-28T08:32:09.000+02:00
diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
@@ -539,6 +539,9 @@ Changelog
   is now deprecated and will be removed in version 1.4. Use `sparse_output` instead.
   :pr:`24412` by :user:`Rushil Desai <rusdes>`.
 
+- |Fix| :class:`preprocessing.LabelEncoder` correctly encodes NaNs in `transform`.
+  :pr:`22629` by `Thomas Fan`_.
+
 :mod:`sklearn.svm`
 ..................
 
@@ -560,6 +563,9 @@ Changelog
   deterministic SVD used by the randomized SVD algorithm.
   :pr:`20617` by :user:`Srinath Kailasa <skailasa>`
 
+- |Enhancement| :func:`utils.validation.column_or_1d` now accepts a `dtype`
+  parameter to specify `y`'s dtype. :pr:`22629` by `Thomas Fan`_.
+
 - |FIX| :func:`utils.multiclass.type_of_target` now properly handles sparse matrices.
   :pr:`14862` by :user:`Léonard Binet <leonardbinet>`.
 
diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py
@@ -131,7 +131,7 @@ def transform(self, y):
             Labels as normalized encodings.
         """
         check_is_fitted(self)
-        y = column_or_1d(y, warn=True)
+        y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
         # transform of empty array is empty array
         if _num_samples(y) == 0:
             return np.array([])
diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py
@@ -643,3 +643,15 @@ def test_inverse_binarize_multiclass():
         csr_matrix([[0, 1, 0], [-1, 0, -1], [0, 0, 0]]), np.arange(3)
     )
     assert_array_equal(got, np.array([1, 1, 0]))
+
+
+def test_nan_label_encoder():
+    """Check that label encoder encodes nans in transform.
+
+    Non-regression test for #22628.
+    """
+    le = LabelEncoder()
+    le.fit(["a", "a", "b", np.nan])
+
+    y_trans = le.transform([np.nan])
+    assert_array_equal(y_trans, [2])
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
@@ -1140,14 +1140,19 @@ def _check_y(y, multi_output=False, y_numeric=False, estimator=None):
     return y
 
 
-def column_or_1d(y, *, warn=False):
+def column_or_1d(y, *, dtype=None, warn=False):
     """Ravel column or 1d numpy array, else raises an error.
 
     Parameters
     ----------
     y : array-like
        Input data.
 
+    dtype : data-type, default=None
+        Data type for `y`.
+
+        .. versionadded:: 1.2
+
     warn : bool, default=False
        To control display of warnings.
 
@@ -1162,7 +1167,7 @@ def column_or_1d(y, *, warn=False):
         If `y` is not a 1D array or a 2D array with a single row or column.
     """
     xp, _ = get_namespace(y)
-    y = xp.asarray(y)
+    y = xp.asarray(y, dtype=dtype)
     shape = y.shape
     if len(shape) == 1:
         return _asarray_with_order(xp.reshape(y, -1), order="C", xp=xp)