diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py
index 43b6ac642284c..1d95ec8e97eb5 100644
--- a/sklearn/preprocessing/_label.py
+++ b/sklearn/preprocessing/_label.py
@@ -109,13 +109,23 @@ def _encode(values, uniques=None, encode=False, check_unknown=True):
     """
     if values.dtype == object:
         try:
-            res = _encode_python(values, uniques, encode)
+            return _encode_python(values, uniques, encode)
         except TypeError:
-            types = sorted(t.__qualname__
-                           for t in set(type(v) for v in values))
-            raise TypeError("Encoders require their input to be uniformly "
-                            f"strings or numbers. Got {types}")
-        return res
+            unique_types = set(type(v) for v in values)
+            type_names = sorted(t.__qualname__ for t in unique_types)
+            error_msg = ("Encoders require their input to be uniformly "
+                         f"strings or numbers. Got: {type_names}.")
+            if unique_types == {str, type(None)}:
+                # Be extra helpful to the user for this specific yet common
+                # case:
+                error_msg += (
+                    '\nPreprocess the features with sklearn.impute.'
+                    'SimpleImputer(strategy="constant", missing_values=None, '
+                    'fill_value="missing") to deal with None-encoded '
+                    'missing values in otherwise string encoded categorical '
+                    'data.'
+                )
+            raise TypeError(error_msg)
     else:
         return _encode_numpy(values, uniques, encode,
                              check_unknown=check_unknown)
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index 126a923a952dd..8ebe3547630a9 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -692,7 +692,19 @@ def test_encoders_has_categorical_tags(Encoder):
 
 @pytest.mark.parametrize('Encoder', [OneHotEncoder, OrdinalEncoder])
 def test_encoders_does_not_support_none_values(Encoder):
+    values = [["a"], [1]]
+    expected_msg = (
+        r"Encoders require their input to be uniformly strings or numbers\. "
+        r"Got: \['int', 'str'\]\."
+    )
+    with pytest.raises(TypeError, match=expected_msg):
+        Encoder().fit(values)
+
+    # Special case with extra missing value imputation suggestion:
     values = [["a"], [None]]
-    with pytest.raises(TypeError, match="Encoders require their input to be "
-                                        "uniformly strings or numbers."):
+    expected_msg = (
+        r'SimpleImputer\(strategy="constant", missing_values=None, '
+        r'fill_value="missing"\)'
+    )
+    with pytest.raises(TypeError, match=expected_msg):
         Encoder().fit(values)