From 74384c634aa5b1ef9acb360d6616987fd64b7697 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 6 Jun 2018 13:22:01 +0200 Subject: [PATCH 01/31] added tests for constant impute strategy in simpleImputer --- sklearn/tests/test_impute.py | 133 ++++++++++++++++++++++++++++++++--- 1 file changed, 125 insertions(+), 8 deletions(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 954a016a835bb..6457b9ec83a27 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -24,16 +24,14 @@ def _check_statistics(X, X_true, strategy, statistics, missing_values): """Utility function for testing imputation for a given strategy. - Test: - - along the two axes - - with dense and sparse arrays + Test with dense and sparse arrays Check that: - the statistics (mean, median, mode) are correct - the missing values are imputed correctly""" err_msg = "Parameters: strategy = %s, missing_values = %s, " \ - "axis = {0}, sparse = {1}" % (strategy, missing_values) + "sparse = {0}" % (strategy, missing_values) assert_ae = assert_array_equal if X.dtype.kind == 'f' or X_true.dtype.kind == 'f': @@ -43,8 +41,8 @@ def _check_statistics(X, X_true, imputer = SimpleImputer(missing_values, strategy=strategy) X_trans = imputer.fit(X).transform(X.copy()) assert_ae(imputer.statistics_, statistics, - err_msg=err_msg.format(0, False)) - assert_ae(X_trans, X_true, err_msg=err_msg.format(0, False)) + err_msg=err_msg.format(False)) + assert_ae(X_trans, X_true, err_msg=err_msg.format(False)) # Sparse matrix imputer = SimpleImputer(missing_values, strategy=strategy) @@ -55,8 +53,8 @@ def _check_statistics(X, X_true, X_trans = X_trans.toarray() assert_ae(imputer.statistics_, statistics, - err_msg=err_msg.format(0, True)) - assert_ae(X_trans, X_true, err_msg=err_msg.format(0, True)) + err_msg=err_msg.format(True)) + assert_ae(X_trans, X_true, err_msg=err_msg.format(True)) def test_imputation_shape(): @@ -210,6 +208,125 @@ def test_imputation_most_frequent(): _check_statistics(X, X_true, "most_frequent", [np.nan, 2, 3, 3], -1) +def test_imputation_constant_integer(): + # Test imputation using the constant strategy + # on integers + X = np.array([ + [-1, 2, 3, -1], + [4, -1, 5, -1], + [6, 7, -1, -1], + [8, 9, 0, -1] + ]) + + X_true = np.array([ + [0, 2, 3], + [4, 0, 5], + [6, 7, 0], + [8, 9, 0] + ]) + + imputer = SimpleImputer(missing_value=-1, strategy="constant", + fill_value=0) + X_trans = imputer.fit(X).transform(X) + + assert_array_equal(X_trans, X_true) + + +def test_imputation_constant_float(): + # Test imputation using the constant strategy + # on floats + X = np.array([ + [np.nan, 1.1, 2.2, np.nan], + [3.3, np.nan, 4.4, np.nan], + [5.5, 6.6, np.nan, np.nan], + [7.7, 8.8, 9.9, np.nan] + ]) + + X_true = np.array([ + [0, 1.1, 2.2], + [3.3, 0, 4.4], + [5.5, 6.6, 0], + [7.7, 8.8, 9.9] + ]) + + imputer = SimpleImputer(strategy="constant", fill_value=0) + X_trans = imputer.fit(X).transform(X) + + assert_allclose(X_trans, X_true) + + +def test_imputation_constant_object(): + # Test imputation using the constant strategy + # on objects + X = np.array([ + [None, "a", "b", None], + ["c", None, "d", None], + ["e", "f", None, None], + ["g", "h", "i", None] + ], dtype=object) + + X_true = np.array([ + ["Z", "a", "b"], + ["c", "Z", "d"], + ["e", "f", "Z"], + ["g", "h", "i"] + ]) + + imputer = SimpleImputer(None, strategy="constant", fill_value="Z") + X_trans = imputer.fit(X).transform(X) + + assert_array_equal(X_trans, X_true) + + +def test_imputation_constant_object_nan(): + # Test 
imputation using the constant strategy + # on objects + X = np.array([ + [np.nan, "a", "b", np.nan], + ["c", np.nan, "d", np.nan], + ["e", "f", np.nan, np.nan], + ["g", "h", "i", np.nan] + ], dtype=object) + + X_true = np.array([ + ["missing", "a", "b"], + ["c", "missing", "d"], + ["e", "f", "missing"], + ["g", "h", "i"] + ], dtype=object) + + imputer = SimpleImputer(None, strategy="constant", fill_value="missing") + X_trans = imputer.fit(X).transform(X) + + assert_array_equal(X_trans, X_true) + + +def test_imputation_constant_pandas(): + # Test imputation using the constant strategy + # on pandas df + pd = pytest.importskip("pandas") + + for dtype in [object, "category"]: + df = pd.DataFrame([ + [np.nan, "a", "b", np.nan], + ["c", np.nan, "d", np.nan], + ["e", "f", np.nan, np.nan], + ["g", "h", "i", np.nan] + ], dtype=dtype) + + X_true = np.array([ + ["missing", "a", "b"], + ["c", "missing", "d"], + ["e", "f", "missing"], + ["g", "h", "i"] + ], dtype=object) + + imputer = SimpleImputer(strategy="constant", fill_value="missing") + X_trans = imputer.fit(df).transform(df) + + assert_array_equal(X_trans, X_true) + + def test_imputation_pipeline_grid_search(): # Test imputation within a pipeline + gridsearch. pipeline = Pipeline([('imputer', SimpleImputer(missing_values=0)), From 6dd6a5ef3da6d0ee1343ae4823f0ce4dd81f6fa8 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 6 Jun 2018 13:30:57 +0200 Subject: [PATCH 02/31] typos --- sklearn/tests/test_impute.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 6457b9ec83a27..93859a0a9e0c2 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -225,7 +225,7 @@ def test_imputation_constant_integer(): [8, 9, 0] ]) - imputer = SimpleImputer(missing_value=-1, strategy="constant", + imputer = SimpleImputer(missing_values=-1, strategy="constant", fill_value=0) X_trans = imputer.fit(X).transform(X) @@ -266,13 +266,13 @@ def test_imputation_constant_object(): ], dtype=object) X_true = np.array([ - ["Z", "a", "b"], - ["c", "Z", "d"], - ["e", "f", "Z"], + ["missing", "a", "b"], + ["c", "missing", "d"], + ["e", "f", "missing"], ["g", "h", "i"] ]) - imputer = SimpleImputer(None, strategy="constant", fill_value="Z") + imputer = SimpleImputer(missing_values=None, strategy="constant", fill_value="missing") X_trans = imputer.fit(X).transform(X) assert_array_equal(X_trans, X_true) @@ -295,7 +295,7 @@ def test_imputation_constant_object_nan(): ["g", "h", "i"] ], dtype=object) - imputer = SimpleImputer(None, strategy="constant", fill_value="missing") + imputer = SimpleImputer(missing_values=None, strategy="constant", fill_value="missing") X_trans = imputer.fit(X).transform(X) assert_array_equal(X_trans, X_true) @@ -323,7 +323,7 @@ def test_imputation_constant_pandas(): imputer = SimpleImputer(strategy="constant", fill_value="missing") X_trans = imputer.fit(df).transform(df) - + assert_array_equal(X_trans, X_true) From 2b101fbbee1175da653f793c2a47eb28c11fb18e Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 6 Jun 2018 13:37:53 +0200 Subject: [PATCH 03/31] typos --- sklearn/tests/test_impute.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 93859a0a9e0c2..e46ca7e9837df 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -250,7 +250,7 @@ def test_imputation_constant_float(): ]) imputer = 
SimpleImputer(strategy="constant", fill_value=0) - X_trans = imputer.fit(X).transform(X) + X_trans = imputer.fit(X).transform(X) assert_allclose(X_trans, X_true) @@ -272,8 +272,9 @@ def test_imputation_constant_object(): ["g", "h", "i"] ]) - imputer = SimpleImputer(missing_values=None, strategy="constant", fill_value="missing") - X_trans = imputer.fit(X).transform(X) + imputer = SimpleImputer(missing_values=None, strategy="constant", + fill_value="missing") + X_trans = imputer.fit(X).transform(X) assert_array_equal(X_trans, X_true) @@ -295,8 +296,9 @@ def test_imputation_constant_object_nan(): ["g", "h", "i"] ], dtype=object) - imputer = SimpleImputer(missing_values=None, strategy="constant", fill_value="missing") - X_trans = imputer.fit(X).transform(X) + imputer = SimpleImputer(missing_values=None, strategy="constant", + fill_value="missing") + X_trans = imputer.fit(X).transform(X) assert_array_equal(X_trans, X_true) From 6e13e68eef49eec22d1116ce896f591c69cb481e Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 6 Jun 2018 13:40:39 +0200 Subject: [PATCH 04/31] typos --- sklearn/tests/test_impute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index e46ca7e9837df..22e53a56f73c8 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -306,7 +306,7 @@ def test_imputation_constant_object_nan(): def test_imputation_constant_pandas(): # Test imputation using the constant strategy # on pandas df - pd = pytest.importskip("pandas") + pd = pytest.importorskip("pandas") for dtype in [object, "category"]: df = pd.DataFrame([ From f300fe24e959c07f9a72a35007d06d905544c0d7 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 6 Jun 2018 18:05:42 +0200 Subject: [PATCH 05/31] added constant strategy to the SimpleImputer. --- sklearn/impute.py | 139 +++++++++++++++++++++++++---------- sklearn/tests/test_impute.py | 77 ++++++++++--------- 2 files changed, 143 insertions(+), 73 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index fe772d6a3a0cb..5711eb55c3196 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -7,6 +7,7 @@ import warnings from time import time +import numbers import numpy as np import numpy.ma as ma @@ -36,11 +37,20 @@ 'MICEImputer', ] +def _is_scalar_nan(x): + """Work around limitations of numpy ufuncs""" + return False if x is None else np.isnan(x) + def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" - if value_to_mask == "NaN" or np.isnan(value_to_mask): - return np.isnan(X) + if value_to_mask == "NaN" or _is_scalar_nan(value_to_mask): + if X.dtype.kind == "O": + # np.isnan does not work for dtype objects. We use the trick that + # nan values are never equal to themselves. + return np.logical_not(X == X) + else: + return np.isnan(X) else: return X == value_to_mask @@ -94,6 +104,13 @@ class SimpleImputer(BaseEstimator, TransformerMixin): each column. - If "most_frequent", then replace missing using the most frequent value along each column. + - If "constant", then replace missing values with fill_value + + fill_value : string or numerical value, optional (default=None) + When strategy == "constant", fill_value is used to replace all + occurrences of missing_values. + If left to the default, fill_value will be 0 when imputing numerical + data and "missing_value" for strings or object data types. verbose : integer, optional (default=0) Controls the verbosity of the imputer. 
@@ -115,16 +132,41 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Notes ----- Columns which only contained missing values at `fit` are discarded upon - `transform`. + `transform` is strategy is not "constant" """ def __init__(self, missing_values="NaN", strategy="mean", - verbose=0, copy=True): + fill_value=None, verbose=0, copy=True): self.missing_values = missing_values self.strategy = strategy + self.fill_value = fill_value self.verbose = verbose self.copy = copy + def _validate_input(self, X): + allowed_strategies = ["mean", "median", "most_frequent", "constant"] + if self.strategy not in allowed_strategies: + raise ValueError("Can only use these strategies: {0} " + " got strategy={1}".format(allowed_strategies, + self.strategy)) + + if self.strategy in ("most_frequent", "constant"): + dtype = None + else: + dtype = FLOAT_DTYPES + + if self.missing_values is None: + force_all_finite = "allow-nan" + else: + if self.missing_values == "NaN" or np.isnan(self.missing_values): + force_all_finite = "allow-nan" + else: + force_all_finite = True + + return check_array(X, accept_sparse='csc', dtype=dtype, + force_all_finite=force_all_finite) + + def fit(self, X, y=None): """Fit the imputer on X. @@ -138,30 +180,37 @@ def fit(self, X, y=None): ------- self : SimpleImputer """ - # Check parameters - allowed_strategies = ["mean", "median", "most_frequent"] - if self.strategy not in allowed_strategies: - raise ValueError("Can only use these strategies: {0} " - " got strategy={1}".format(allowed_strategies, - self.strategy)) - - X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, - force_all_finite='allow-nan' - if self.missing_values == 'NaN' - or np.isnan(self.missing_values) else True) + X = self._validate_input(X) + + if self.strategy == "constant": + if (X.dtype.kind in ("i", "f") + and not isinstance(self.fill_value, numbers.Real)): + raise ValueError( + "fill_value={0} is invalid. Expected a numerical value " + "to numerical data".format(self.fill_value)) + + if self.fill_value is None: + if X.dtype.kind in ("i", "f"): + fill_value = 0 + else: + fill_value = "missing_value" + else: + fill_value = self.fill_value if sparse.issparse(X): self.statistics_ = self._sparse_fit(X, self.strategy, - self.missing_values) + self.missing_values, + fill_value) else: self.statistics_ = self._dense_fit(X, self.strategy, - self.missing_values) + self.missing_values, + fill_value) return self - def _sparse_fit(self, X, strategy, missing_values): + def _sparse_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on sparse data.""" # Count the zeros if missing_values == 0: @@ -233,12 +282,14 @@ def _sparse_fit(self, X, strategy, missing_values): n_zeros_axis[i]) return most_frequent + + # Constant + elif strategy == "constant": + + return np.full(X.shape[0], fill_value) - def _dense_fit(self, X, strategy, missing_values): + def _dense_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on dense data.""" - X = check_array(X, force_all_finite='allow-nan' - if self.missing_values == 'NaN' - or np.isnan(self.missing_values) else True) mask = _get_mask(X, missing_values) masked_X = ma.masked_array(X, mask=mask) @@ -280,6 +331,16 @@ def _dense_fit(self, X, strategy, missing_values): return most_frequent + # Constant + elif strategy == "constant": + if isinstance(fill_value, numbers.Real): + dtype = None + else: + dtype = object + + return np.full(X.shape[0], fill_value, dtype=dtype) + + def transform(self, X): """Impute all missing values in X. 
@@ -289,27 +350,29 @@ def transform(self, X): The input data to complete. """ check_is_fitted(self, 'statistics_') - X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, - force_all_finite='allow-nan' - if self.missing_values == 'NaN' - or np.isnan(self.missing_values) else True, - copy=self.copy) + + X = self._validate_input(X) + statistics = self.statistics_ if X.shape[1] != statistics.shape[0]: raise ValueError("X has %d features per sample, expected %d" % (X.shape[1], self.statistics_.shape[0])) - # Delete the invalid columns - invalid_mask = np.isnan(statistics) - valid_mask = np.logical_not(invalid_mask) - valid_statistics = statistics[valid_mask] - valid_statistics_indexes = np.flatnonzero(valid_mask) - missing = np.arange(X.shape[1])[invalid_mask] - - if invalid_mask.any(): - if self.verbose: - warnings.warn("Deleting features without " - "observed values: %s" % missing) + # Delete the invalid columns if strategy is not constant + if self.strategy == "constant": + valid_statistics = statistics + else: + invalid_mask = np.isnan(statistics) + valid_mask = np.logical_not(invalid_mask) + + if invalid_mask.any(): + missing = np.arange(X.shape[1])[invalid_mask] + if self.verbose: + warnings.warn("Deleting features without " + "observed values: %s" % missing) + + valid_statistics = statistics[valid_mask] + valid_statistics_indexes = np.flatnonzero(valid_mask) X = X[:, valid_statistics_indexes] # Do actual imputation diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 22e53a56f73c8..9e788012b85c2 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -219,10 +219,10 @@ def test_imputation_constant_integer(): ]) X_true = np.array([ - [0, 2, 3], - [4, 0, 5], - [6, 7, 0], - [8, 9, 0] + [0, 2, 3, 0], + [4, 0, 5, 0], + [6, 7, 0, 0], + [8, 9, 0, 0] ]) imputer = SimpleImputer(missing_values=-1, strategy="constant", @@ -235,24 +235,32 @@ def test_imputation_constant_integer(): def test_imputation_constant_float(): # Test imputation using the constant strategy # on floats - X = np.array([ - [np.nan, 1.1, 2.2, np.nan], - [3.3, np.nan, 4.4, np.nan], - [5.5, 6.6, np.nan, np.nan], - [7.7, 8.8, 9.9, np.nan] - ]) - - X_true = np.array([ - [0, 1.1, 2.2], - [3.3, 0, 4.4], - [5.5, 6.6, 0], - [7.7, 8.8, 9.9] - ]) + for format in ["csr", "array"]: + X = np.array([ + [np.nan, 1.1, 2.2, np.nan], + [3.3, np.nan, 4.4, np.nan], + [5.5, 6.6, np.nan, np.nan], + [7.7, 8.8, 9.9, np.nan] + ]) - imputer = SimpleImputer(strategy="constant", fill_value=0) - X_trans = imputer.fit(X).transform(X) + X = sparse.csr_matrix(X) if format == "csr" else X - assert_allclose(X_trans, X_true) + X_true = np.array([ + [0, 1.1, 2.2, 0], + [3.3, 0, 4.4, 0], + [5.5, 6.6, 0, 0], + [7.7, 8.8, 9.9, 0] + ]) + + X_true = sparse.csr_matrix(X_true) if format == "csr" else X_true + + imputer = SimpleImputer(strategy="constant", fill_value=0) + X_trans = imputer.fit(X).transform(X) + + if format == "csr": + assert_allclose(X_trans.toarray(), X_true.toarray()) + else: + assert_allclose(X_trans, X_true) def test_imputation_constant_object(): @@ -266,11 +274,11 @@ def test_imputation_constant_object(): ], dtype=object) X_true = np.array([ - ["missing", "a", "b"], - ["c", "missing", "d"], - ["e", "f", "missing"], - ["g", "h", "i"] - ]) + ["missing", "a", "b", "missing"], + ["c", "missing", "d", "missing"], + ["e", "f", "missing", "missing"], + ["g", "h", "i", "missing"] + ], dtype=object) imputer = SimpleImputer(missing_values=None, strategy="constant", fill_value="missing") @@ -290,14 +298,13 
@@ def test_imputation_constant_object_nan(): ], dtype=object) X_true = np.array([ - ["missing", "a", "b"], - ["c", "missing", "d"], - ["e", "f", "missing"], - ["g", "h", "i"] + ["missing_value", "a", "b", "missing_value"], + ["c", "missing_value", "d", "missing_value"], + ["e", "f", "missing_value", "missing_value"], + ["g", "h", "i", "missing_value"] ], dtype=object) - imputer = SimpleImputer(missing_values=None, strategy="constant", - fill_value="missing") + imputer = SimpleImputer(strategy="constant") X_trans = imputer.fit(X).transform(X) assert_array_equal(X_trans, X_true) @@ -317,10 +324,10 @@ def test_imputation_constant_pandas(): ], dtype=dtype) X_true = np.array([ - ["missing", "a", "b"], - ["c", "missing", "d"], - ["e", "f", "missing"], - ["g", "h", "i"] + ["missing", "a", "b", "missing"], + ["c", "missing", "d", "missing"], + ["e", "f", "missing", "missing"], + ["g", "h", "i", "missing"] ], dtype=object) imputer = SimpleImputer(strategy="constant", fill_value="missing") From 35e30ac4f26d66fcc5cd9ca398c333531990be27 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 7 Jun 2018 15:29:54 +0200 Subject: [PATCH 06/31] bug fixes on the SimpleImputer and change for default value to np.nan on MICEImputer --- sklearn/impute.py | 87 +++++++++++++++--------------------- sklearn/tests/test_impute.py | 71 +++++++++++------------------ 2 files changed, 62 insertions(+), 96 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 5711eb55c3196..390b2abcb9879 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -37,20 +37,12 @@ 'MICEImputer', ] -def _is_scalar_nan(x): - """Work around limitations of numpy ufuncs""" - return False if x is None else np.isnan(x) - def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" - if value_to_mask == "NaN" or _is_scalar_nan(value_to_mask): - if X.dtype.kind == "O": - # np.isnan does not work for dtype objects. We use the trick that - # nan values are never equal to themselves. - return np.logical_not(X == X) - else: - return np.isnan(X) + if value_to_mask is np.nan: + # nan values are never equal to themselves + return X != X else: return X == value_to_mask @@ -90,10 +82,10 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : integer or "NaN", optional (default="NaN") + missing_values : real number, string, np.nan or None, + optional (default=np.nan). The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. For missing values encoded as np.nan, - use the string value "NaN". + `missing_values` will be imputed. strategy : string, optional (default="mean") The imputation strategy. @@ -104,7 +96,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): each column. - If "most_frequent", then replace missing using the most frequent value along each column. - - If "constant", then replace missing values with fill_value + - If "constant", then replace missing values with fill_value. 
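A short aside (not part of the patch) on why _get_mask above switches to the X != X comparison: np.isnan rejects object arrays, while NaN is the only value that compares unequal to itself on recent NumPy (the series later adds a fixes.py workaround for NumPy < 1.13):

    import numpy as np

    X = np.array([["a", np.nan], [np.nan, "b"]], dtype=object)
    # np.isnan(X) would raise TypeError on this object array, but:
    print(X != X)
    # [[False  True]
    #  [ True False]]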
fill_value : string or numerical value, optional (default=None) When strategy == "constant", fill_value is used to replace all @@ -132,10 +124,10 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Notes ----- Columns which only contained missing values at `fit` are discarded upon - `transform` is strategy is not "constant" + `transform` if strategy is not "constant" """ - def __init__(self, missing_values="NaN", strategy="mean", + def __init__(self, missing_values=np.nan, strategy="mean", fill_value=None, verbose=0, copy=True): self.missing_values = missing_values self.strategy = strategy @@ -151,21 +143,17 @@ def _validate_input(self, X): self.strategy)) if self.strategy in ("most_frequent", "constant"): - dtype = None + dtype = None else: dtype = FLOAT_DTYPES - if self.missing_values is None: - force_all_finite = "allow-nan" + if self.missing_values is not np.nan: + force_all_finite = True else: - if self.missing_values == "NaN" or np.isnan(self.missing_values): - force_all_finite = "allow-nan" - else: - force_all_finite = True + force_all_finite = "allow-nan" return check_array(X, accept_sparse='csc', dtype=dtype, - force_all_finite=force_all_finite) - + force_all_finite=force_all_finite, copy=self.copy) def fit(self, X, y=None): """Fit the imputer on X. @@ -182,13 +170,8 @@ def fit(self, X, y=None): """ X = self._validate_input(X) - if self.strategy == "constant": - if (X.dtype.kind in ("i", "f") - and not isinstance(self.fill_value, numbers.Real)): - raise ValueError( - "fill_value={0} is invalid. Expected a numerical value " - "to numerical data".format(self.fill_value)) - + # default missing_values is 0 for numerical input and "missing_value" + # otherwise if self.fill_value is None: if X.dtype.kind in ("i", "f"): fill_value = 0 @@ -197,6 +180,14 @@ def fit(self, X, y=None): else: fill_value = self.fill_value + # fill_value should be numerical in case of numerical input + if self.strategy == "constant": + if (X.dtype.kind in ("i", "f") + and not isinstance(fill_value, numbers.Real)): + raise ValueError( + "fill_value={0} is invalid. Expected a numerical value " + "to numerical data".format(fill_value)) + if sparse.issparse(X): self.statistics_ = self._sparse_fit(X, self.strategy, @@ -282,11 +273,11 @@ def _sparse_fit(self, X, strategy, missing_values, fill_value): n_zeros_axis[i]) return most_frequent - + # Constant elif strategy == "constant": - return np.full(X.shape[0], fill_value) + return np.full(X.shape[1], fill_value) def _dense_fit(self, X, strategy, missing_values, fill_value): """Fit the transformer on dense data.""" @@ -338,8 +329,7 @@ def _dense_fit(self, X, strategy, missing_values, fill_value): else: dtype = object - return np.full(X.shape[0], fill_value, dtype=dtype) - + return np.full(X.shape[1], fill_value, dtype=dtype) def transform(self, X): """Impute all missing values in X. 
@@ -364,16 +354,15 @@ def transform(self, X): else: invalid_mask = np.isnan(statistics) valid_mask = np.logical_not(invalid_mask) + valid_statistics = statistics[valid_mask] + valid_statistics_indexes = np.flatnonzero(valid_mask) if invalid_mask.any(): missing = np.arange(X.shape[1])[invalid_mask] if self.verbose: warnings.warn("Deleting features without " - "observed values: %s" % missing) - - valid_statistics = statistics[valid_mask] - valid_statistics_indexes = np.flatnonzero(valid_mask) - X = X[:, valid_statistics_indexes] + "observed values: %s" % missing) + X = X[:, valid_statistics_indexes] # Do actual imputation if sparse.issparse(X) and self.missing_values != 0: @@ -390,7 +379,6 @@ def transform(self, X): mask = _get_mask(X, self.missing_values) n_missing = np.sum(mask, axis=0) values = np.repeat(valid_statistics, n_missing) - coordinates = np.where(mask.transpose())[::-1] X[coordinates] = values @@ -409,10 +397,9 @@ class MICEImputer(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : int or "NaN", optional (default="NaN") + missing_values : int, np.nan, optional (default=np.nan) The placeholder for the missing values. All occurrences of - ``missing_values`` will be imputed. For missing values encoded as - np.nan, use the string value "NaN". + ``missing_values`` will be imputed. imputation_order : str, optional (default="ascending") The order in which the features will be imputed. Possible values: @@ -507,7 +494,7 @@ class MICEImputer(BaseEstimator, TransformerMixin): """ def __init__(self, - missing_values='NaN', + missing_values=np.nan, imputation_order='ascending', n_imputations=100, n_burn_in=10, @@ -757,10 +744,10 @@ def _initial_imputation(self, X): Input data's missing indicator matrix, where "n_samples" is the number of samples and "n_features" is the number of features. 
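An illustrative sketch (not part of the patch) of the force_all_finite="allow-nan" option relied on above: check_array then accepts NaN entries but still rejects infinite values:

    import numpy as np
    from sklearn.utils import check_array

    check_array(np.array([[1.0, np.nan]]), force_all_finite="allow-nan")  # accepted
    # check_array(np.array([[1.0, np.inf]]), force_all_finite="allow-nan")  # would raise ValueError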
""" + X = check_array(X, dtype=FLOAT_DTYPES, order="F", - force_all_finite='allow-nan' - if self.missing_values == 'NaN' - or np.isnan(self.missing_values) else True) + force_all_finite="allow-nan" + if self.missing_values is np.nan else True) mask_missing_values = _get_mask(X, self.missing_values) if self.initial_imputer_ is None: diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 9e788012b85c2..a9d2e189bf179 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -62,7 +62,7 @@ def test_imputation_shape(): X = np.random.randn(10, 2) X[::2] = np.nan - for strategy in ['mean', 'median', 'most_frequent']: + for strategy in ['mean', 'median', 'most_frequent', "constant"]: imputer = SimpleImputer(strategy=strategy) X_imputed = imputer.fit_transform(sparse.csr_matrix(X)) assert X_imputed.shape == (10, 2) @@ -99,9 +99,10 @@ def test_imputation_mean_median(): values = np.arange(1, shape[0] + 1) values[4::2] = - values[4::2] - tests = [("mean", "NaN", lambda z, v, p: safe_mean(np.hstack((z, v)))), + tests = [("mean", np.nan, lambda z, v, p: safe_mean(np.hstack((z, v)))), ("mean", 0, lambda z, v, p: np.mean(v)), - ("median", "NaN", lambda z, v, p: safe_median(np.hstack((z, v)))), + ("median", np.nan, + lambda z, v, p: safe_median(np.hstack((z, v)))), ("median", 0, lambda z, v, p: np.median(v))] for strategy, test_missing_values, true_value_fun in tests: @@ -182,7 +183,7 @@ def test_imputation_median_special_cases(): statistics_median = [0, 5, 0, -2.5, 2.5, 4.5, -4.5, .5] _check_statistics(X, X_imputed_median, "median", - statistics_median, 'NaN') + statistics_median, np.nan) def test_imputation_most_frequent(): @@ -225,7 +226,7 @@ def test_imputation_constant_integer(): [8, 9, 0, 0] ]) - imputer = SimpleImputer(missing_values=-1, strategy="constant", + imputer = SimpleImputer(missing_values=-1, strategy="constant", fill_value=0) X_trans = imputer.fit(X).transform(X) @@ -251,7 +252,7 @@ def test_imputation_constant_float(): [5.5, 6.6, 0, 0], [7.7, 8.8, 9.9, 0] ]) - + X_true = sparse.csr_matrix(X_true) if format == "csr" else X_true imputer = SimpleImputer(strategy="constant", fill_value=0) @@ -266,48 +267,26 @@ def test_imputation_constant_float(): def test_imputation_constant_object(): # Test imputation using the constant strategy # on objects - X = np.array([ - [None, "a", "b", None], - ["c", None, "d", None], - ["e", "f", None, None], - ["g", "h", "i", None] - ], dtype=object) - - X_true = np.array([ - ["missing", "a", "b", "missing"], - ["c", "missing", "d", "missing"], - ["e", "f", "missing", "missing"], - ["g", "h", "i", "missing"] - ], dtype=object) - - imputer = SimpleImputer(missing_values=None, strategy="constant", - fill_value="missing") - X_trans = imputer.fit(X).transform(X) - - assert_array_equal(X_trans, X_true) - - -def test_imputation_constant_object_nan(): - # Test imputation using the constant strategy - # on objects - X = np.array([ - [np.nan, "a", "b", np.nan], - ["c", np.nan, "d", np.nan], - ["e", "f", np.nan, np.nan], - ["g", "h", "i", np.nan] - ], dtype=object) + for marker in (None, np.nan, "NAN", 0): + X = np.array([ + [marker, "a", "b", marker], + ["c", marker, "d", marker], + ["e", "f", marker, marker], + ["g", "h", "i", marker] + ], dtype=object) - X_true = np.array([ - ["missing_value", "a", "b", "missing_value"], - ["c", "missing_value", "d", "missing_value"], - ["e", "f", "missing_value", "missing_value"], - ["g", "h", "i", "missing_value"] - ], dtype=object) + X_true = np.array([ + ["missing", "a", "b", "missing"], 
+ ["c", "missing", "d", "missing"], + ["e", "f", "missing", "missing"], + ["g", "h", "i", "missing"] + ], dtype=object) - imputer = SimpleImputer(strategy="constant") - X_trans = imputer.fit(X).transform(X) + imputer = SimpleImputer(missing_values=marker, strategy="constant", + fill_value="missing") + X_trans = imputer.fit(X).transform(X) - assert_array_equal(X_trans, X_true) + assert_array_equal(X_trans, X_true) def test_imputation_constant_pandas(): @@ -331,7 +310,7 @@ def test_imputation_constant_pandas(): ], dtype=object) imputer = SimpleImputer(strategy="constant", fill_value="missing") - X_trans = imputer.fit(df).transform(df) + X_trans = imputer.fit(df).transform(df) assert_array_equal(X_trans, X_true) From ea4a929358f1dd309cbd535520061421dc09b2e6 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 7 Jun 2018 17:35:54 +0200 Subject: [PATCH 07/31] object dtypes support for "most_frequent" strategy in SimpleImputer --- sklearn/impute.py | 13 +++++++++---- sklearn/tests/test_impute.py | 27 ++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 390b2abcb9879..31f1e652d2425 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -41,7 +41,8 @@ def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" if value_to_mask is np.nan: - # nan values are never equal to themselves + # nan values are never equal to themselves. We use this trick because + # np.isnan does not work on object dtypes. return X != X else: return X == value_to_mask @@ -82,7 +83,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : real number, string, np.nan or None, + missing_values : real number, string, np.nan or None, \ optional (default=np.nan). The placeholder for the missing values. All occurrences of `missing_values` will be imputed. @@ -313,7 +314,10 @@ def _dense_fit(self, X, strategy, missing_values, fill_value): X = X.transpose() mask = mask.transpose() - most_frequent = np.empty(X.shape[0]) + if X.dtype.kind == "O": + most_frequent = np.empty(X.shape[0], dtype=object) + else: + most_frequent = np.empty(X.shape[0]) for i, (row, row_mask) in enumerate(zip(X[:], mask[:])): row_mask = np.logical_not(row_mask).astype(np.bool) @@ -352,7 +356,8 @@ def transform(self, X): if self.strategy == "constant": valid_statistics = statistics else: - invalid_mask = np.isnan(statistics) + # same as np.isnan but also works for object dtypes + invalid_mask = statistics != statistics valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.flatnonzero(valid_mask) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index a9d2e189bf179..035bb3c2923ba 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -34,6 +34,7 @@ def _check_statistics(X, X_true, "sparse = {0}" % (strategy, missing_values) assert_ae = assert_array_equal + if X.dtype.kind == 'f' or X_true.dtype.kind == 'f': assert_ae = assert_array_almost_equal @@ -209,6 +210,30 @@ def test_imputation_most_frequent(): _check_statistics(X, X_true, "most_frequent", [np.nan, 2, 3, 3], -1) +def test_imputation_most_frequent_objects(): + # Test imputation using the most-frequent strategy. 
+ for marker in (None, np.nan, "NAN", "", 0): + X = np.array([ + [marker, marker, "a", "f"], + [marker, "c", marker, "d"], + [marker, "b", "d", marker], + [marker, "c", "d", "h"], + ], dtype=object) + + X_true = np.array([ + ["c", "a", "f"], + ["c", "d", "d"], + ["b", "d", "d"], + ["c", "d", "h"], + ], dtype=object) + + imputer = SimpleImputer(missing_values=marker, + strategy="most_frequent") + X_trans = imputer.fit(X).transform(X) + + assert_array_equal(X_trans, X_true) + + def test_imputation_constant_integer(): # Test imputation using the constant strategy # on integers @@ -267,7 +292,7 @@ def test_imputation_constant_float(): def test_imputation_constant_object(): # Test imputation using the constant strategy # on objects - for marker in (None, np.nan, "NAN", 0): + for marker in (None, np.nan, "NAN", "", 0): X = np.array([ [marker, "a", "b", marker], ["c", marker, "d", marker], From 10f165b6d64da70ad0447c7506c549f999ac28ab Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 11 Jun 2018 09:49:39 +0200 Subject: [PATCH 08/31] minor fixes regarding the change of default missing_values="NaN" to np.nan --- doc/modules/impute.rst | 14 +++++++++----- sklearn/impute.py | 16 ++++++++-------- sklearn/model_selection/tests/test_search.py | 2 +- sklearn/model_selection/tests/test_validation.py | 4 ++-- sklearn/tests/test_impute.py | 12 ++++-------- sklearn/utils/estimator_checks.py | 1 + 6 files changed, 25 insertions(+), 24 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index f161825105975..a28e0d4b47e38 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -20,9 +20,10 @@ Univariate feature imputation ============================= The :class:`SimpleImputer` class provides basic strategies for imputing missing -values, either using the mean, the median or the most frequent value of -the row or column in which the missing values are located. This class -also allows for different missing values encodings. +values. Missing values can be imputed with a provided value, or using the +statistics (mean, median or most frequent) of each column in which the missing +values are located. This class also allows for different missing values +encodings. The following snippet demonstrates how to replace missing values, encoded as ``np.nan``, using the mean value of the columns (axis 0) @@ -30,9 +31,9 @@ that contain the missing values:: >>> import numpy as np >>> from sklearn.impute import SimpleImputer - >>> imp = SimpleImputer(missing_values='NaN', strategy='mean') + >>> imp = SimpleImputer(missing_values=np.nan, strategy='mean') >>> imp.fit([[1, 2], [np.nan, 3], [7, 6]]) # doctest: +NORMALIZE_WHITESPACE - SimpleImputer(copy=True, missing_values='NaN', strategy='mean', verbose=0) + SimpleImputer(copy=True, missing_values=nan, strategy='mean', verbose=0) >>> X = [[np.nan, 2], [6, np.nan], [7, 6]] >>> print(imp.transform(X)) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS [[4. 2. ] @@ -52,6 +53,9 @@ The :class:`SimpleImputer` class also supports sparse matrices:: [6. 3.666...] [7. 6. ]] +Object + + Note that, here, missing values are encoded by 0 and are thus implicitly stored in the matrix. This format is thus suitable when there are many more missing values than observed values. 
diff --git a/sklearn/impute.py b/sklearn/impute.py index 31f1e652d2425..c2832a35c0c48 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -185,7 +185,7 @@ def fit(self, X, y=None): if self.strategy == "constant": if (X.dtype.kind in ("i", "f") and not isinstance(fill_value, numbers.Real)): - raise ValueError( + raise TypeError( "fill_value={0} is invalid. Expected a numerical value " "to numerical data".format(fill_value)) @@ -244,7 +244,7 @@ def _sparse_fit(self, X, strategy, missing_values, fill_value): with np.errstate(all="ignore"): return np.ravel(sums) / np.ravel(n_non_missing) - # Median + Most frequent + # Median + Most frequent + Constant else: # Remove the missing values, for each column columns_all = np.hsplit(X.data, X.indptr[1:-1]) @@ -277,7 +277,6 @@ def _sparse_fit(self, X, strategy, missing_values, fill_value): # Constant elif strategy == "constant": - return np.full(X.shape[1], fill_value) def _dense_fit(self, X, strategy, missing_values, fill_value): @@ -328,12 +327,12 @@ def _dense_fit(self, X, strategy, missing_values, fill_value): # Constant elif strategy == "constant": - if isinstance(fill_value, numbers.Real): + """if isinstance(fill_value, numbers.Real): dtype = None else: dtype = object - - return np.full(X.shape[1], fill_value, dtype=dtype) + """ + return np.full(X.shape[1], fill_value, dtype=X.dtype) def transform(self, X): """Impute all missing values in X. @@ -749,10 +748,11 @@ def _initial_imputation(self, X): Input data's missing indicator matrix, where "n_samples" is the number of samples and "n_features" is the number of features. """ + force_all_finite = "allow-nan" if self.missing_values is np.nan \ + else True X = check_array(X, dtype=FLOAT_DTYPES, order="F", - force_all_finite="allow-nan" - if self.missing_values is np.nan else True) + force_all_finite=force_all_finite) mask_missing_values = _get_mask(X, self.missing_values) if self.initial_imputer_ is None: diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index f436c7b55cf36..876a5af11fe3e 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -1313,7 +1313,7 @@ def test_grid_search_allows_nans(): X[2, :] = np.nan y = [0, 0, 1, 1, 1] p = Pipeline([ - ('imputer', SimpleImputer(strategy='mean', missing_values='NaN')), + ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)), ('classifier', MockClassifier()), ]) GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y) diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 2929916619769..92d3b5988629c 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -744,7 +744,7 @@ def test_permutation_test_score_allow_nans(): X[2, :] = np.nan y = np.repeat([0, 1], X.shape[0] / 2) p = Pipeline([ - ('imputer', SimpleImputer(strategy='mean', missing_values='NaN')), + ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)), ('classifier', MockClassifier()), ]) permutation_test_score(p, X, y, cv=5) @@ -756,7 +756,7 @@ def test_cross_val_score_allow_nans(): X[2, :] = np.nan y = np.repeat([0, 1], X.shape[0] / 2) p = Pipeline([ - ('imputer', SimpleImputer(strategy='mean', missing_values='NaN')), + ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)), ('classifier', MockClassifier()), ]) cross_val_score(p, X, y, cv=5) diff --git a/sklearn/tests/test_impute.py 
b/sklearn/tests/test_impute.py index 035bb3c2923ba..8522f6f50b1cd 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -235,8 +235,7 @@ def test_imputation_most_frequent_objects(): def test_imputation_constant_integer(): - # Test imputation using the constant strategy - # on integers + # Test imputation using the constant strategy on integers X = np.array([ [-1, 2, 3, -1], [4, -1, 5, -1], @@ -259,8 +258,7 @@ def test_imputation_constant_integer(): def test_imputation_constant_float(): - # Test imputation using the constant strategy - # on floats + # Test imputation using the constant strategy on floats for format in ["csr", "array"]: X = np.array([ [np.nan, 1.1, 2.2, np.nan], @@ -290,8 +288,7 @@ def test_imputation_constant_float(): def test_imputation_constant_object(): - # Test imputation using the constant strategy - # on objects + # Test imputation using the constant strategy on objects for marker in (None, np.nan, "NAN", "", 0): X = np.array([ [marker, "a", "b", marker], @@ -315,8 +312,7 @@ def test_imputation_constant_object(): def test_imputation_constant_pandas(): - # Test imputation using the constant strategy - # on pandas df + # Test imputation using the constant strategy on pandas df pd = pytest.importorskip("pandas") for dtype in [object, "category"]: diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5e4c454f4b1ab..d57937cfe944b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1605,6 +1605,7 @@ def check_classifiers_predictions(X, y, name, classifier_orig): def choose_check_classifiers_labels(name, y, y_names): return y if name in ["LabelPropagation", "LabelSpreading"] else y_names + def check_classifiers_classes(name, classifier_orig): X_multiclass, y_multiclass = make_blobs(n_samples=30, random_state=0, cluster_std=0.1) From a6c33b1a69c3cafc091311776578588034f29784 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 11 Jun 2018 14:40:12 +0200 Subject: [PATCH 09/31] Changed the test in estimator_check to allow np.nan as default value in constructor ; + minor corrections --- sklearn/impute.py | 22 +++++++++++++++++----- sklearn/tests/test_impute.py | 18 ++++++++++++++++++ sklearn/utils/estimator_checks.py | 12 +++++++++++- 3 files changed, 46 insertions(+), 6 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index c2832a35c0c48..f87cdf2d67902 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -84,7 +84,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Parameters ---------- missing_values : real number, string, np.nan or None, \ - optional (default=np.nan). +optional (default=np.nan). The placeholder for the missing values. All occurrences of `missing_values` will be imputed. @@ -183,11 +183,23 @@ def fit(self, X, y=None): # fill_value should be numerical in case of numerical input if self.strategy == "constant": - if (X.dtype.kind in ("i", "f") - and not isinstance(fill_value, numbers.Real)): + if X.dtype.kind in ("i", "f"): + if not isinstance(fill_value, numbers.Real): + raise TypeError( + "fill_value={0} is invalid. Expected a numerical value" + " to numerical data".format(fill_value)) + + elif X.dtype.kind == "O": + if not isinstance(fill_value, six.string_types): + raise TypeError( + "fill_value={0} is invalid. Expected an str instance " + "when imputing categorical data.".format(fill_value)) + + else: raise TypeError( - "fill_value={0} is invalid. 
Expected a numerical value " - "to numerical data".format(fill_value)) + "SimpleImputer cannot work on data with dtype={0}: " + "expecting numerical or categorical data with " + "dtype=object.".format(X.dtype)) if sparse.issparse(X): self.statistics_ = self._sparse_fit(X, diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 8522f6f50b1cd..6c63b79de6bf2 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -75,6 +75,24 @@ def test_imputation_shape(): assert X_imputed.shape == (10, 2) +def test_imputation_valid_types(): + # Verify that exceptions are raised on invalid inputs + tests = [(1, 0, "fill_value", None), + (1., np.nan, "fill_value", None), + ("a", "", 0, object), + (True, "nan", "fill_value", "c")] + + for X_data, missing_value, fill_value, dtype in tests: + X = np.full((3, 5), X_data, dtype=dtype) + X[0, 0] = missing_value + + with pytest.raises(TypeError): + imputer = SimpleImputer(missing_values=missing_value, + strategy="constant", + fill_value=fill_value) + imputer.fit(X).transform(X) + + def safe_median(arr, *args, **kwargs): # np.median([]) raises a TypeError for numpy >= 1.10.1 length = arr.size if hasattr(arr, 'size') else len(arr) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d57937cfe944b..b8b81a67d4a95 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2001,11 +2001,17 @@ def param_filter(p): init_params = [p for p in signature(init).parameters.values() if param_filter(p)] + print("init_params:") + print(init_params) + print() except (TypeError, ValueError): # init is not a python function. # true for mixins return params = estimator.get_params() + print("params:") + print(params) + print() if name in META_ESTIMATORS: # they can need a non-default argument init_params = init_params[1:] @@ -2031,7 +2037,11 @@ def param_filter(p): if isinstance(param_value, np.ndarray): assert_array_equal(param_value, init_param.default) else: - assert_equal(param_value, init_param.default, init_param.name) + # Allows to set default parameters to np.nan + if (param_value is not np.nan or + init_param.default is not np.nan): + assert_equal(param_value, init_param.default, + init_param.name) def multioutput_estimator_convert_y_2d(estimator, y): From 9c2a407de7b7202c76af318824e12a8ce9b51dd5 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 11 Jun 2018 15:51:57 +0200 Subject: [PATCH 10/31] fix for older versions of numpy --- sklearn/impute.py | 4 +++- sklearn/tests/test_impute.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index f87cdf2d67902..2dcca8ec968d7 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -45,7 +45,9 @@ def _get_mask(X, value_to_mask): # np.isnan does not work on object dtypes. return X != X else: - return X == value_to_mask + # X == value_to_mask with object dytpes does not always perform + # element-wise for old versions of numpy + return np.equal(X, value_to_mask) def _most_frequent(array, extra_value, n_repeat): diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 6c63b79de6bf2..19d7e0a588c43 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -230,7 +230,7 @@ def test_imputation_most_frequent(): def test_imputation_most_frequent_objects(): # Test imputation using the most-frequent strategy. 
- for marker in (None, np.nan, "NAN", "", 0): + for marker in (np.nan, "NAN", "", 0): X = np.array([ [marker, marker, "a", "f"], [marker, "c", marker, "d"], From df8608ba545fb154b80a69144d6f06685d24942b Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 11 Jun 2018 15:59:33 +0200 Subject: [PATCH 11/31] . --- sklearn/tests/test_impute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 19d7e0a588c43..6c63b79de6bf2 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -230,7 +230,7 @@ def test_imputation_most_frequent(): def test_imputation_most_frequent_objects(): # Test imputation using the most-frequent strategy. - for marker in (np.nan, "NAN", "", 0): + for marker in (None, np.nan, "NAN", "", 0): X = np.array([ [marker, marker, "a", "f"], [marker, "c", marker, "d"], From 45176876ab7a18eb6cb1efac390c91568b0bf35b Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Tue, 12 Jun 2018 17:48:24 +0200 Subject: [PATCH 12/31] fix for old versions of numpy v2 --- sklearn/impute.py | 25 +++++++++++++++---------- sklearn/utils/estimator_checks.py | 8 ++------ sklearn/utils/fixes.py | 16 ++++++++++++++++ 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 2dcca8ec968d7..9a145b63515ea 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -22,6 +22,7 @@ from .utils.sparsefuncs import _get_median from .utils.validation import check_is_fitted from .utils.validation import FLOAT_DTYPES +from .utils.fixes import custom_isnan from .externals import six @@ -38,12 +39,20 @@ ] +def _custom_isnan(x): + # np.nan is never equal to np.nan. Return true only if x is np.nan + return x != x + + def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" if value_to_mask is np.nan: - # nan values are never equal to themselves. We use this trick because - # np.isnan does not work on object dtypes. - return X != X + if X.dtype.kind in ("i", "u", "f"): + return np.isnan(X) + else: + # np.isnan does not work on object dtypes. + return custom_isnan(X) + else: # X == value_to_mask with object dytpes does not always perform # element-wise for old versions of numpy @@ -185,7 +194,7 @@ def fit(self, X, y=None): # fill_value should be numerical in case of numerical input if self.strategy == "constant": - if X.dtype.kind in ("i", "f"): + if X.dtype.kind in ("i", "u", "f"): if not isinstance(fill_value, numbers.Real): raise TypeError( "fill_value={0} is invalid. 
Expected a numerical value" @@ -341,11 +350,6 @@ def _dense_fit(self, X, strategy, missing_values, fill_value): # Constant elif strategy == "constant": - """if isinstance(fill_value, numbers.Real): - dtype = None - else: - dtype = object - """ return np.full(X.shape[1], fill_value, dtype=X.dtype) def transform(self, X): @@ -361,6 +365,7 @@ def transform(self, X): X = self._validate_input(X) statistics = self.statistics_ + if X.shape[1] != statistics.shape[0]: raise ValueError("X has %d features per sample, expected %d" % (X.shape[1], self.statistics_.shape[0])) @@ -370,7 +375,7 @@ def transform(self, X): valid_statistics = statistics else: # same as np.isnan but also works for object dtypes - invalid_mask = statistics != statistics + invalid_mask = _get_mask(statistics, np.nan) valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.flatnonzero(valid_mask) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index b8b81a67d4a95..8425b98980a84 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2001,17 +2001,13 @@ def param_filter(p): init_params = [p for p in signature(init).parameters.values() if param_filter(p)] - print("init_params:") - print(init_params) - print() + except (TypeError, ValueError): # init is not a python function. # true for mixins return params = estimator.get_params() - print("params:") - print(params) - print() + if name in META_ESTIMATORS: # they can need a non-default argument init_params = init_params[1:] diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index f7d9d6a29f9f6..588cf1de182e2 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -334,3 +334,19 @@ def nanpercentile(a, q): return np.array([np.nan] * size_q) else: from numpy import nanpercentile # noqa + + +# Fix for behavior inconsistency on numpy.equal for object dtypes. 
+# For numpy versions < 1.13, numpy.equal tests identity of objects instead of +# equality + +test_array = np.array([np.nan], dtype=object) +test_mask = test_array != test_array + +if np.array_equal(test_mask, np.array([True])): + def custom_isnan(X): + return X != X + +else: + def custom_isnan(X): + return np.frompyfunc(lambda x: x != x, 1, 1)(X).astype(bool) From 1f1c6a01537b39cea3fce55ac481d34e40de149b Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 13 Jun 2018 16:37:44 +0200 Subject: [PATCH 13/31] minor fixes and added doc example for categorical inputs --- doc/conftest.py | 9 ++++++ doc/modules/impute.rst | 30 ++++++++++++----- sklearn/impute.py | 11 ++----- sklearn/tests/test_impute.py | 53 ++++++++++++++++++++++++------- sklearn/utils/estimator_checks.py | 9 ++++++ sklearn/utils/fixes.py | 10 +++--- 6 files changed, 90 insertions(+), 32 deletions(-) diff --git a/doc/conftest.py b/doc/conftest.py index 158fff5830acf..f6fe644583730 100644 --- a/doc/conftest.py +++ b/doc/conftest.py @@ -62,6 +62,13 @@ def setup_compose(): raise SkipTest("Skipping compose.rst, pandas not installed") +def setup_impute(): + try: + import pandas # noqa + except ImportError: + raise SkipTest("Skipping impute.rst, pandas not installed") + + def pytest_runtest_setup(item): fname = item.fspath.strpath if fname.endswith('datasets/labeled_faces.rst'): @@ -76,6 +83,8 @@ def pytest_runtest_setup(item): setup_working_with_text_data() elif fname.endswith('modules/compose.rst'): setup_compose() + elif fname.endswith('modules/impute.rst'): + setup_impute() def pytest_runtest_teardown(item): diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index a28e0d4b47e38..2667b123e5fdc 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -20,9 +20,9 @@ Univariate feature imputation ============================= The :class:`SimpleImputer` class provides basic strategies for imputing missing -values. Missing values can be imputed with a provided value, or using the -statistics (mean, median or most frequent) of each column in which the missing -values are located. This class also allows for different missing values +values. Missing values can be imputed with a provided constant value, or using +the statistics (mean, median or most frequent) of each column in which the +missing values are located. This class also allows for different missing values encodings. The following snippet demonstrates how to replace missing values, @@ -33,7 +33,7 @@ that contain the missing values:: >>> from sklearn.impute import SimpleImputer >>> imp = SimpleImputer(missing_values=np.nan, strategy='mean') >>> imp.fit([[1, 2], [np.nan, 3], [7, 6]]) # doctest: +NORMALIZE_WHITESPACE - SimpleImputer(copy=True, missing_values=nan, strategy='mean', verbose=0) + SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean', verbose=0) >>> X = [[np.nan, 2], [6, np.nan], [7, 6]] >>> print(imp.transform(X)) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS [[4. 2. ] @@ -46,20 +46,34 @@ The :class:`SimpleImputer` class also supports sparse matrices:: >>> X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]]) >>> imp = SimpleImputer(missing_values=0, strategy='mean') >>> imp.fit(X) # doctest: +NORMALIZE_WHITESPACE - SimpleImputer(copy=True, missing_values=0, strategy='mean', verbose=0) + SimpleImputer(copy=True, fill_value=None, missing_values=0, strategy='mean', verbose=0) >>> X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]]) >>> print(imp.transform(X_test)) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS [[4. 2. ] [6. 3.666...] 
[7. 6. ]] -Object - - Note that, here, missing values are encoded by 0 and are thus implicitly stored in the matrix. This format is thus suitable when there are many more missing values than observed values. +The :class:`SimpleImputer` class also supports categorical datas represented as +string values or pandas categoricals when using the "most_frequent" or +"constant" strategy:: + + >>> import pandas as pd + >>> df = pd.DataFrame([["a", "x"], + ... ["", "y"], + ... ["a", ""], + ... ["b", "y"]], dtype="category") + ... + >>> imp = SimpleImputer(missing_values="", strategy="most_frequent") + >>> print(imp.fit_transform(df)) # doctest: +NORMALIZE_WHITESPACE + [['a' 'x'] + ['a' 'y'] + ['a' 'y'] + ['b' 'y']] + .. _mice: Multivariate feature imputation diff --git a/sklearn/impute.py b/sklearn/impute.py index 9a145b63515ea..f4cb476b547f9 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -22,7 +22,7 @@ from .utils.sparsefuncs import _get_median from .utils.validation import check_is_fitted from .utils.validation import FLOAT_DTYPES -from .utils.fixes import custom_isnan +from .utils.fixes import _compat_isnan from .externals import six @@ -39,11 +39,6 @@ ] -def _custom_isnan(x): - # np.nan is never equal to np.nan. Return true only if x is np.nan - return x != x - - def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" if value_to_mask is np.nan: @@ -51,7 +46,7 @@ def _get_mask(X, value_to_mask): return np.isnan(X) else: # np.isnan does not work on object dtypes. - return custom_isnan(X) + return _compat_isnan(X) else: # X == value_to_mask with object dytpes does not always perform @@ -198,7 +193,7 @@ def fit(self, X, y=None): if not isinstance(fill_value, numbers.Real): raise TypeError( "fill_value={0} is invalid. Expected a numerical value" - " to numerical data".format(fill_value)) + " when imputing numerical data".format(fill_value)) elif X.dtype.kind == "O": if not isinstance(fill_value, six.string_types): diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 6c63b79de6bf2..83d78793729dd 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -5,6 +5,8 @@ import numpy as np from scipy import sparse +import io + from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal @@ -252,6 +254,33 @@ def test_imputation_most_frequent_objects(): assert_array_equal(X_trans, X_true) +def test_imputation_most_frequent_pandas(): + # Test imputation using the most frequent strategy on pandas df + pd = pytest.importorskip("pandas") + + f = io.StringIO(u"Cat1,Cat2,Cat3,Cat4\n" + ",i,x,\n" + "a,,y,\n" + "a,j,,\n" + "b,j,x,") + + for dtype in (object, "category"): + df = pd.read_csv(f, dtype=dtype) + f.seek(0) + + X_true = np.array([ + ["a", "i", "x"], + ["a", "j", "y"], + ["a", "j", "x"], + ["b", "j", "x"] + ], dtype=object) + + imputer = SimpleImputer(strategy="most_frequent") + X_trans = imputer.fit(df).transform(df) + + assert_array_equal(X_trans, X_true) + + def test_imputation_constant_integer(): # Test imputation using the constant strategy on integers X = np.array([ @@ -333,19 +362,21 @@ def test_imputation_constant_pandas(): # Test imputation using the constant strategy on pandas df pd = pytest.importorskip("pandas") - for dtype in [object, "category"]: - df = pd.DataFrame([ - [np.nan, "a", "b", np.nan], - ["c", np.nan, "d", np.nan], - ["e", "f", np.nan, np.nan], - ["g", "h", "i", np.nan] - ], dtype=dtype) + f = 
io.StringIO(u"Cat1,Cat2,Cat3,Cat4\n" + ",i,x,\n" + "a,,y,\n" + "a,j,,\n" + "b,j,x,") + + for dtype in (object, "category"): + df = pd.read_csv(f, dtype=dtype) + f.seek(0) X_true = np.array([ - ["missing", "a", "b", "missing"], - ["c", "missing", "d", "missing"], - ["e", "f", "missing", "missing"], - ["g", "h", "i", "missing"] + ["missing", "i", "x", "missing"], + ["a", "missing", "y", "missing"], + ["a", "j", "missing", "missing"], + ["b", "j", "x", "missing"] ], dtype=object) imputer = SimpleImputer(strategy="constant", fill_value="missing") diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 8425b98980a84..6120d1cefd7ea 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -8,6 +8,7 @@ from copy import deepcopy import struct from functools import partial +import numbers import numpy as np from scipy import sparse @@ -2039,6 +2040,14 @@ def param_filter(p): assert_equal(param_value, init_param.default, init_param.name) + def _isscalarnan(x): + return isinstance(x, numbers.Real) and np.isnan(x) + + if _isscalarnan(param_value): + assert param_value is init_param.default, init_param.name + else: + assert param_value == init_param.default, init_param.name + def multioutput_estimator_convert_y_2d(estimator, y): # Estimators in mono_output_task_error raise ValueError if y is of 1-D diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 588cf1de182e2..ba61c54778948 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -340,13 +340,13 @@ def nanpercentile(a, q): # For numpy versions < 1.13, numpy.equal tests identity of objects instead of # equality -test_array = np.array([np.nan], dtype=object) -test_mask = test_array != test_array +_nan_object_array = np.array([np.nan], dtype=object) +_nan_object_mask = _nan_object_array != _nan_object_array -if np.array_equal(test_mask, np.array([True])): - def custom_isnan(X): +if np.array_equal(_nan_object_mask, np.array([True])): + def _compat_isnan(X): return X != X else: - def custom_isnan(X): + def _compat_isnan(X): return np.frompyfunc(lambda x: x != x, 1, 1)(X).astype(bool) From 2f6d0b1e857b84c2fece45477a7914f2d496abe3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 13 Jun 2018 18:06:20 +0200 Subject: [PATCH 14/31] DOCTEST fix printing estimator --- doc/modules/impute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 2667b123e5fdc..49e5fbef40fb6 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -94,7 +94,7 @@ Here is an example snippet:: >>> imp = MICEImputer(n_imputations=10, random_state=0) >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]]) MICEImputer(imputation_order='ascending', initial_strategy='mean', - max_value=None, min_value=None, missing_values='NaN', n_burn_in=10, + max_value=None, min_value=None, missing_values=nan, n_burn_in=10, n_imputations=10, n_nearest_features=None, predictor=None, random_state=0, verbose=False) >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] From 3884d4e195e3c1552ee177e8f14e1177ff1f90cd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 13 Jun 2018 18:34:32 +0200 Subject: [PATCH 15/31] EXA fix example using constant strategy --- ...=> plot_column_transformer_mixed_types.py} | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) rename examples/compose/{column_transformer_mixed_types.py => plot_column_transformer_mixed_types.py} (74%) diff --git 
a/examples/compose/column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py similarity index 74% rename from examples/compose/column_transformer_mixed_types.py rename to examples/compose/plot_column_transformer_mixed_types.py index d5767ad231452..95fd7aa6bef34 100644 --- a/examples/compose/column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -27,14 +27,16 @@ from __future__ import print_function import pandas as pd +import numpy as np -from sklearn.compose import make_column_transformer -from sklearn.pipeline import make_pipeline +from sklearn.compose import ColumnTransformer +from sklearn.pipeline import Pipeline from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler, CategoricalEncoder from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split, GridSearchCV +np.random.seed(0) # Read data from Titanic dataset. titanic_url = ('https://raw.githubusercontent.com/amueller/' @@ -49,27 +51,27 @@ # - embarked: categories encoded as strings {'C', 'S', 'Q'}. # - sex: categories encoded as strings {'female', 'male'}. # - pclass: ordinal integers {1, 2, 3}. -numeric_features = ['age', 'fare'] -categorical_features = ['embarked', 'sex', 'pclass'] - -# Provisionally, use pd.fillna() to impute missing values for categorical -# features; SimpleImputer will eventually support strategy="constant". -data[categorical_features] = data[categorical_features].fillna(value='missing') # We create the preprocessing pipelines for both numeric and categorical data. -numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler()) -categorical_transformer = CategoricalEncoder('onehot-dense', - handle_unknown='ignore') +numeric_features = ['age', 'fare'] +numeric_transformer = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='median')), + ('scaler', StandardScaler())]) + +categorical_features = ['embarked', 'sex', 'pclass'] +categorical_transformer = Pipeline(steps=[ + ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), + ('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))]) -preprocessing_pl = make_column_transformer( - (numeric_features, numeric_transformer), - (categorical_features, categorical_transformer), - remainder='drop' -) +preprocessor = ColumnTransformer(transformers=[ + ('num', numeric_transformer, numeric_features), + ('cat', categorical_transformer, categorical_features)], + remainder='drop') # Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. 
-clf = make_pipeline(preprocessing_pl, LogisticRegression()) +clf = Pipeline(steps=[('preprocessor', preprocessor), + ('classifier', LogisticRegression())]) X = data.drop('survived', axis=1) y = data.survived.values @@ -78,7 +80,7 @@ shuffle=True) clf.fit(X_train, y_train) -print("model score: %f" % clf.score(X_test, y_test)) +print("model score: %.3f" % clf.score(X_test, y_test)) ############################################################################### @@ -93,12 +95,12 @@ param_grid = { - 'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'], - 'logisticregression__C': [0.1, 1.0, 1.0], + 'preprocessor__num__imputer__strategy': ['mean', 'median'], + 'classifier__C': [0.1, 1.0, 10, 100], } grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False) grid_search.fit(X_train, y_train) -print(("best logistic regression from grid search: %f" +print(("best logistic regression from grid search: %.3f" % grid_search.score(X_test, y_test))) From 72eb6b5def911a85531147ebe8e19687c90d440e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 13 Jun 2018 18:37:11 +0200 Subject: [PATCH 16/31] COSMIT --- examples/compose/plot_column_transformer_mixed_types.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 95fd7aa6bef34..1847a40034fee 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -63,10 +63,11 @@ ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), ('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))]) -preprocessor = ColumnTransformer(transformers=[ - ('num', numeric_transformer, numeric_features), - ('cat', categorical_transformer, categorical_features)], - remainder='drop') +preprocessor = ColumnTransformer( + transformers=[ + ('num', numeric_transformer, numeric_features), + ('cat', categorical_transformer, categorical_features)], + remainder='drop') # Append classifier to preprocessing pipeline. # Now we have a full prediction pipeline. 
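
For reference, a minimal sketch of what the constant-fill step in the categorical pipeline above produces on its own, assuming a scikit-learn build that already includes the SimpleImputer changes from this patch series (the toy column values are illustrative only; the "missing" token matches the fill_value used in the example):

    import numpy as np
    from sklearn.impute import SimpleImputer

    # Categorical columns with np.nan as the missing-value marker.
    X = np.array([["S", "male"],
                  [np.nan, "female"],
                  ["C", np.nan]], dtype=object)

    # Every missing entry is replaced by the same constant token, which a
    # downstream one-hot encoder then treats as one more category.
    imp = SimpleImputer(strategy="constant", fill_value="missing")
    print(imp.fit_transform(X))
    # [['S' 'male']
    #  ['missing' 'female']
    #  ['C' 'missing']]

With the step names chosen in the example ('preprocessor', 'cat', 'imputer'), the same fill value could also be reached from the grid search through the nested key 'preprocessor__cat__imputer__fill_value', following the usual double-underscore parameter convention.
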
From cc9aa6f20b78b84f5ba27620f25ef02880a63624 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 13 Jun 2018 18:44:28 +0200 Subject: [PATCH 17/31] COSMIT --- examples/compose/plot_column_transformer_mixed_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/compose/plot_column_transformer_mixed_types.py b/examples/compose/plot_column_transformer_mixed_types.py index 1847a40034fee..f1c7d146c9643 100644 --- a/examples/compose/plot_column_transformer_mixed_types.py +++ b/examples/compose/plot_column_transformer_mixed_types.py @@ -75,7 +75,7 @@ ('classifier', LogisticRegression())]) X = data.drop('survived', axis=1) -y = data.survived.values +y = data['survived'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True) From d4e52264fef23ae252f16df0e65152f5419346e7 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 14 Jun 2018 11:55:28 +0200 Subject: [PATCH 18/31] adressed @glemaitre remarks --- doc/modules/impute.rst | 4 +- sklearn/impute.py | 11 +- sklearn/tests/test_impute.py | 215 +++++++++++++++--------------- sklearn/utils/estimator_checks.py | 7 +- sklearn/utils/fixes.py | 5 +- 5 files changed, 122 insertions(+), 120 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 49e5fbef40fb6..45b866a7123ab 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -58,8 +58,8 @@ in the matrix. This format is thus suitable when there are many more missing values than observed values. The :class:`SimpleImputer` class also supports categorical datas represented as -string values or pandas categoricals when using the "most_frequent" or -"constant" strategy:: +string values or pandas categoricals when using the ``most_frequent`` or +``constant`` strategy:: >>> import pandas as pd >>> df = pd.DataFrame([["a", "x"], diff --git a/sklearn/impute.py b/sklearn/impute.py index f4cb476b547f9..8b038d1706795 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -131,7 +131,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Notes ----- Columns which only contained missing values at `fit` are discarded upon - `transform` if strategy is not "constant" + `transform` if strategy is not "constant". """ def __init__(self, missing_values=np.nan, strategy="mean", @@ -192,14 +192,15 @@ def fit(self, X, y=None): if X.dtype.kind in ("i", "u", "f"): if not isinstance(fill_value, numbers.Real): raise TypeError( - "fill_value={0} is invalid. Expected a numerical value" - " when imputing numerical data".format(fill_value)) + "'fill_value'={0} is invalid. Expected a numerical" + " value when imputing numerical" + " data".format(fill_value)) elif X.dtype.kind == "O": if not isinstance(fill_value, six.string_types): raise TypeError( - "fill_value={0} is invalid. Expected an str instance " - "when imputing categorical data.".format(fill_value)) + "'fill_value'={0} is invalid. 
Expected an str instance" + " when imputing categorical data.".format(fill_value)) else: raise TypeError( diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 83d78793729dd..28bf7df71836e 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -8,6 +8,7 @@ import io from sklearn.utils.testing import assert_allclose +from sklearn.utils.testing import assert_allclose_dense_sparse from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_raises @@ -77,22 +78,22 @@ def test_imputation_shape(): assert X_imputed.shape == (10, 2) -def test_imputation_valid_types(): +@pytest.mark.parametrize("X_data, missing_value, fill_value, dtype, match", + [(1, 0, "x", None, "imputing numerical"), + (1., np.nan, "x", None, "imputing numerical"), + ("a", "", 0, object, "imputing categorical"), + (True, "nan", "x", "c", "cannot work")]) +def test_imputation_error_invalid_types(X_data, missing_value, + fill_value, dtype, match): # Verify that exceptions are raised on invalid inputs - tests = [(1, 0, "fill_value", None), - (1., np.nan, "fill_value", None), - ("a", "", 0, object), - (True, "nan", "fill_value", "c")] + X = np.full((3, 5), X_data, dtype=dtype) + X[0, 0] = missing_value - for X_data, missing_value, fill_value, dtype in tests: - X = np.full((3, 5), X_data, dtype=dtype) - X[0, 0] = missing_value - - with pytest.raises(TypeError): - imputer = SimpleImputer(missing_values=missing_value, - strategy="constant", - fill_value=fill_value) - imputer.fit(X).transform(X) + with pytest.raises(TypeError, match=match): + imputer = SimpleImputer(missing_values=missing_value, + strategy="constant", + fill_value=fill_value) + imputer.fit_transform(X) def safe_median(arr, *args, **kwargs): @@ -230,31 +231,32 @@ def test_imputation_most_frequent(): _check_statistics(X, X_true, "most_frequent", [np.nan, 2, 3, 3], -1) -def test_imputation_most_frequent_objects(): +@pytest.mark.parametrize("marker", [None, np.nan, "NAN", "", 0]) +def test_imputation_most_frequent_objects(marker): # Test imputation using the most-frequent strategy. 
- for marker in (None, np.nan, "NAN", "", 0): - X = np.array([ - [marker, marker, "a", "f"], - [marker, "c", marker, "d"], - [marker, "b", "d", marker], - [marker, "c", "d", "h"], - ], dtype=object) - - X_true = np.array([ - ["c", "a", "f"], - ["c", "d", "d"], - ["b", "d", "d"], - ["c", "d", "h"], - ], dtype=object) + X = np.array([ + [marker, marker, "a", "f"], + [marker, "c", marker, "d"], + [marker, "b", "d", marker], + [marker, "c", "d", "h"], + ], dtype=object) - imputer = SimpleImputer(missing_values=marker, - strategy="most_frequent") - X_trans = imputer.fit(X).transform(X) + X_true = np.array([ + ["c", "a", "f"], + ["c", "d", "d"], + ["b", "d", "d"], + ["c", "d", "h"], + ], dtype=object) + + imputer = SimpleImputer(missing_values=marker, + strategy="most_frequent") + X_trans = imputer.fit(X).transform(X) - assert_array_equal(X_trans, X_true) + assert_array_equal(X_trans, X_true) -def test_imputation_most_frequent_pandas(): +@pytest.mark.parametrize("dtype", [object, "category"]) +def test_imputation_most_frequent_pandas(dtype): # Test imputation using the most frequent strategy on pandas df pd = pytest.importorskip("pandas") @@ -264,21 +266,19 @@ def test_imputation_most_frequent_pandas(): "a,j,,\n" "b,j,x,") - for dtype in (object, "category"): - df = pd.read_csv(f, dtype=dtype) - f.seek(0) + df = pd.read_csv(f, dtype=dtype) - X_true = np.array([ - ["a", "i", "x"], - ["a", "j", "y"], - ["a", "j", "x"], - ["b", "j", "x"] - ], dtype=object) + X_true = np.array([ + ["a", "i", "x"], + ["a", "j", "y"], + ["a", "j", "x"], + ["b", "j", "x"] + ], dtype=object) - imputer = SimpleImputer(strategy="most_frequent") - X_trans = imputer.fit(df).transform(df) + imputer = SimpleImputer(strategy="most_frequent") + X_trans = imputer.fit_transform(df) - assert_array_equal(X_trans, X_true) + assert_array_equal(X_trans, X_true) def test_imputation_constant_integer(): @@ -299,66 +299,73 @@ def test_imputation_constant_integer(): imputer = SimpleImputer(missing_values=-1, strategy="constant", fill_value=0) - X_trans = imputer.fit(X).transform(X) + X_trans = imputer.fit_transform(X) assert_array_equal(X_trans, X_true) -def test_imputation_constant_float(): +@pytest.mark.parametrize("format", ["csr", "array"]) +def test_imputation_constant_float(format): # Test imputation using the constant strategy on floats - for format in ["csr", "array"]: - X = np.array([ - [np.nan, 1.1, 2.2, np.nan], - [3.3, np.nan, 4.4, np.nan], - [5.5, 6.6, np.nan, np.nan], - [7.7, 8.8, 9.9, np.nan] - ]) - - X = sparse.csr_matrix(X) if format == "csr" else X - - X_true = np.array([ - [0, 1.1, 2.2, 0], - [3.3, 0, 4.4, 0], - [5.5, 6.6, 0, 0], - [7.7, 8.8, 9.9, 0] - ]) - - X_true = sparse.csr_matrix(X_true) if format == "csr" else X_true - - imputer = SimpleImputer(strategy="constant", fill_value=0) - X_trans = imputer.fit(X).transform(X) - - if format == "csr": - assert_allclose(X_trans.toarray(), X_true.toarray()) - else: - assert_allclose(X_trans, X_true) + X = np.array([ + [np.nan, 1.1, 2.2, np.nan], + [3.3, np.nan, 4.4, np.nan], + [5.5, 6.6, np.nan, np.nan], + [7.7, 8.8, 9.9, np.nan] + ]) + X = sparse.csr_matrix(X) if format == "csr" else X -def test_imputation_constant_object(): + X_true = np.array([ + [0, 1.1, 2.2, 0], + [3.3, 0, 4.4, 0], + [5.5, 6.6, 0, 0], + [7.7, 8.8, 9.9, 0] + ]) + + if format == "csr": + X_true = sparse.csr_matrix(X_true) + X_true[np.array([[True, False, False, True], + [False, True, False, True], + [False, False, True, True], + [False, False, False, True]])] = 0 + + imputer = 
SimpleImputer(strategy="constant", fill_value=0) + X_trans = imputer.fit_transform(X) + + print(X_trans) + print() + print(X_true) + + assert_allclose_dense_sparse(X_trans, X_true) + + +@pytest.mark.parametrize("marker", [None, np.nan, "NAN", "", 0]) +def test_imputation_constant_object(marker): # Test imputation using the constant strategy on objects - for marker in (None, np.nan, "NAN", "", 0): - X = np.array([ - [marker, "a", "b", marker], - ["c", marker, "d", marker], - ["e", "f", marker, marker], - ["g", "h", "i", marker] - ], dtype=object) + X = np.array([ + [marker, "a", "b", marker], + ["c", marker, "d", marker], + ["e", "f", marker, marker], + ["g", "h", "i", marker] + ], dtype=object) - X_true = np.array([ - ["missing", "a", "b", "missing"], - ["c", "missing", "d", "missing"], - ["e", "f", "missing", "missing"], - ["g", "h", "i", "missing"] - ], dtype=object) + X_true = np.array([ + ["missing", "a", "b", "missing"], + ["c", "missing", "d", "missing"], + ["e", "f", "missing", "missing"], + ["g", "h", "i", "missing"] + ], dtype=object) - imputer = SimpleImputer(missing_values=marker, strategy="constant", - fill_value="missing") - X_trans = imputer.fit(X).transform(X) + imputer = SimpleImputer(missing_values=marker, strategy="constant", + fill_value="missing") + X_trans = imputer.fit_transform(X) - assert_array_equal(X_trans, X_true) + assert_array_equal(X_trans, X_true) -def test_imputation_constant_pandas(): +@pytest.mark.parametrize("dtype", [object, "category"]) +def test_imputation_constant_pandas(dtype): # Test imputation using the constant strategy on pandas df pd = pytest.importorskip("pandas") @@ -368,21 +375,19 @@ def test_imputation_constant_pandas(): "a,j,,\n" "b,j,x,") - for dtype in (object, "category"): - df = pd.read_csv(f, dtype=dtype) - f.seek(0) + df = pd.read_csv(f, dtype=dtype) - X_true = np.array([ - ["missing", "i", "x", "missing"], - ["a", "missing", "y", "missing"], - ["a", "j", "missing", "missing"], - ["b", "j", "x", "missing"] - ], dtype=object) + X_true = np.array([ + ["missing", "i", "x", "missing"], + ["a", "missing", "y", "missing"], + ["a", "j", "missing", "missing"], + ["b", "j", "x", "missing"] + ], dtype=object) - imputer = SimpleImputer(strategy="constant", fill_value="missing") - X_trans = imputer.fit(df).transform(df) + imputer = SimpleImputer(strategy="constant", fill_value="missing") + X_trans = imputer.fit_transform(df) - assert_array_equal(X_trans, X_true) + assert_array_equal(X_trans, X_true) def test_imputation_pipeline_grid_search(): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 1ba8da83c56f8..18e0f9d3c3ed8 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2036,16 +2036,11 @@ def param_filter(p): if isinstance(param_value, np.ndarray): assert_array_equal(param_value, init_param.default) else: - # Allows to set default parameters to np.nan - if (param_value is not np.nan or - init_param.default is not np.nan): - assert_equal(param_value, init_param.default, - init_param.name) - def _isscalarnan(x): return isinstance(x, numbers.Real) and np.isnan(x) if _isscalarnan(param_value): + # Allows to set default parameters to np.nan assert param_value is init_param.default, init_param.name else: assert param_value == init_param.default, init_param.name diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 6b6e6ac1f6249..74ecf0db0960f 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -270,8 +270,9 @@ def nanpercentile(a, q): # Fix for 
behavior inconsistency on numpy.equal for object dtypes. -# For numpy versions < 1.13, numpy.equal tests identity of objects instead of -# equality +# For numpy versions < 1.13, numpy.equal tests element-wise identity of objects +# instead of equality. This fix returns the mask of NaNs in an array of +# numerical or object values for all nupy versions. _nan_object_array = np.array([np.nan], dtype=object) _nan_object_mask = _nan_object_array != _nan_object_array From 6efd1221b54eae5f030913ab8294ea1e106c0b72 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 14 Jun 2018 12:08:36 +0200 Subject: [PATCH 19/31] small corrections --- sklearn/impute.py | 8 ++++---- sklearn/tests/test_impute.py | 4 ---- sklearn/utils/fixes.py | 4 ++-- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 8b038d1706795..972891cbbbcd0 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -22,7 +22,7 @@ from .utils.sparsefuncs import _get_median from .utils.validation import check_is_fitted from .utils.validation import FLOAT_DTYPES -from .utils.fixes import _compat_isnan +from .utils.fixes import _object_dtype_isnan from .externals import six @@ -46,7 +46,7 @@ def _get_mask(X, value_to_mask): return np.isnan(X) else: # np.isnan does not work on object dtypes. - return _compat_isnan(X) + return _object_dtype_isnan(X) else: # X == value_to_mask with object dytpes does not always perform @@ -90,7 +90,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Parameters ---------- missing_values : real number, string, np.nan or None, \ -optional (default=np.nan). +optional (default=np.nan) The placeholder for the missing values. All occurrences of `missing_values` will be imputed. @@ -105,7 +105,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): value along each column. - If "constant", then replace missing values with fill_value. - fill_value : string or numerical value, optional (default=None) + fill_value : string or numerical value, optional When strategy == "constant", fill_value is used to replace all occurrences of missing_values. 
If left to the default, fill_value will be 0 when imputing numerical diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 28bf7df71836e..0fb4dc7ffbc2e 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -333,10 +333,6 @@ def test_imputation_constant_float(format): imputer = SimpleImputer(strategy="constant", fill_value=0) X_trans = imputer.fit_transform(X) - print(X_trans) - print() - print(X_true) - assert_allclose_dense_sparse(X_trans, X_true) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 74ecf0db0960f..6595e5ac0fa43 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -278,9 +278,9 @@ def nanpercentile(a, q): _nan_object_mask = _nan_object_array != _nan_object_array if np.array_equal(_nan_object_mask, np.array([True])): - def _compat_isnan(X): + def _object_dtype_isnan(X): return X != X else: - def _compat_isnan(X): + def _object_dtype_isnan(X): return np.frompyfunc(lambda x: x != x, 1, 1)(X).astype(bool) From fbaaa381083b091d10fdf35503d74b51d3732a17 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Thu, 14 Jun 2018 14:27:14 +0200 Subject: [PATCH 20/31] small corrections --- doc/modules/impute.rst | 8 +++---- sklearn/impute.py | 10 ++++++-- sklearn/tests/test_impute.py | 46 ++++++++++++++++++------------------ 3 files changed, 35 insertions(+), 29 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 45b866a7123ab..493ff6fb7439e 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -57,17 +57,17 @@ Note that, here, missing values are encoded by 0 and are thus implicitly stored in the matrix. This format is thus suitable when there are many more missing values than observed values. -The :class:`SimpleImputer` class also supports categorical datas represented as +The :class:`SimpleImputer` class also supports categorical data represented as string values or pandas categoricals when using the ``most_frequent`` or ``constant`` strategy:: >>> import pandas as pd >>> df = pd.DataFrame([["a", "x"], - ... ["", "y"], - ... ["a", ""], + ... [np.nan, "y"], + ... ["a", np.nan], ... ["b", "y"]], dtype="category") ... - >>> imp = SimpleImputer(missing_values="", strategy="most_frequent") + >>> imp = SimpleImputer(strategy="most_frequent") >>> print(imp.fit_transform(df)) # doctest: +NORMALIZE_WHITESPACE [['a' 'x'] ['a' 'y'] diff --git a/sklearn/impute.py b/sklearn/impute.py index 972891cbbbcd0..c5b67069dd41e 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -60,7 +60,13 @@ def _most_frequent(array, extra_value, n_repeat): of the array.""" # Compute the most frequent value in array only if array.size > 0: - mode = stats.mode(array) + with warnings.catch_warnings(): + # stats.mode raises a warning when input array contains objects due + # to incapacity to detect NaNs. Irrelevant here since input array + # has already been NaN-masked. 
+ warnings.simplefilter("ignore", RuntimeWarning) + mode = stats.mode(array) + most_frequent_value = mode[0][0] most_frequent_count = mode[1][0] else: @@ -177,7 +183,7 @@ def fit(self, X, y=None): """ X = self._validate_input(X) - # default missing_values is 0 for numerical input and "missing_value" + # default fill_value is 0 for numerical input and "missing_value" # otherwise if self.fill_value is None: if X.dtype.kind in ("i", "f"): diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 0fb4dc7ffbc2e..459b7c47c774d 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -78,24 +78,6 @@ def test_imputation_shape(): assert X_imputed.shape == (10, 2) -@pytest.mark.parametrize("X_data, missing_value, fill_value, dtype, match", - [(1, 0, "x", None, "imputing numerical"), - (1., np.nan, "x", None, "imputing numerical"), - ("a", "", 0, object, "imputing categorical"), - (True, "nan", "x", "c", "cannot work")]) -def test_imputation_error_invalid_types(X_data, missing_value, - fill_value, dtype, match): - # Verify that exceptions are raised on invalid inputs - X = np.full((3, 5), X_data, dtype=dtype) - X[0, 0] = missing_value - - with pytest.raises(TypeError, match=match): - imputer = SimpleImputer(missing_values=missing_value, - strategy="constant", - fill_value=fill_value) - imputer.fit_transform(X) - - def safe_median(arr, *args, **kwargs): # np.median([]) raises a TypeError for numpy >= 1.10.1 length = arr.size if hasattr(arr, 'size') else len(arr) @@ -281,6 +263,24 @@ def test_imputation_most_frequent_pandas(dtype): assert_array_equal(X_trans, X_true) +@pytest.mark.parametrize("X_data, missing_value, fill_value, dtype, match", + [(1, 0, "x", None, "imputing numerical"), + (1., np.nan, "x", None, "imputing numerical"), + ("a", "", 0, object, "imputing categorical"), + (True, "nan", "x", "c", "cannot work")]) +def test_imputation_constant_error_invalid_types(X_data, missing_value, + fill_value, dtype, match): + # Verify that exceptions are raised on invalid inputs + X = np.full((3, 5), X_data, dtype=dtype) + X[0, 0] = missing_value + + with pytest.raises(TypeError, match=match): + imputer = SimpleImputer(missing_values=missing_value, + strategy="constant", + fill_value=fill_value) + imputer.fit_transform(X) + + def test_imputation_constant_integer(): # Test imputation using the constant strategy on integers X = np.array([ @@ -374,13 +374,13 @@ def test_imputation_constant_pandas(dtype): df = pd.read_csv(f, dtype=dtype) X_true = np.array([ - ["missing", "i", "x", "missing"], - ["a", "missing", "y", "missing"], - ["a", "j", "missing", "missing"], - ["b", "j", "x", "missing"] + ["missing_value", "i", "x", "missing_value"], + ["a", "missing_value", "y", "missing_value"], + ["a", "j", "missing_value", "missing_value"], + ["b", "j", "x", "missing_value"] ], dtype=object) - imputer = SimpleImputer(strategy="constant", fill_value="missing") + imputer = SimpleImputer(strategy="constant") X_trans = imputer.fit_transform(df) assert_array_equal(X_trans, X_true) From 724a4a135e6a4c09f91a39ba9d6ec02af7e54b2c Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 15 Jun 2018 09:27:58 +0200 Subject: [PATCH 21/31] fixed np.nan is not np.float('nan') issue --- doc/modules/impute.rst | 4 ++-- sklearn/impute.py | 11 +++++++---- sklearn/utils/__init__.py | 19 +++++++++++++++++++ sklearn/utils/estimator_checks.py | 7 ++----- 4 files changed, 30 insertions(+), 11 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 
493ff6fb7439e..6356bc4ecf81c 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -58,8 +58,8 @@ in the matrix. This format is thus suitable when there are many more missing values than observed values. The :class:`SimpleImputer` class also supports categorical data represented as -string values or pandas categoricals when using the ``most_frequent`` or -``constant`` strategy:: +string values or pandas categoricals when using the `most_frequent` or +`constant` strategy:: >>> import pandas as pd >>> df = pd.DataFrame([["a", "x"], diff --git a/sklearn/impute.py b/sklearn/impute.py index c5b67069dd41e..4aeff59b4b810 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -23,6 +23,7 @@ from .utils.validation import check_is_fitted from .utils.validation import FLOAT_DTYPES from .utils.fixes import _object_dtype_isnan +from .utils import is_scalar_nan from .externals import six @@ -160,7 +161,7 @@ def _validate_input(self, X): else: dtype = FLOAT_DTYPES - if self.missing_values is not np.nan: + if not is_scalar_nan(self.missing_values): force_all_finite = True else: force_all_finite = "allow-nan" @@ -186,7 +187,7 @@ def fit(self, X, y=None): # default fill_value is 0 for numerical input and "missing_value" # otherwise if self.fill_value is None: - if X.dtype.kind in ("i", "f"): + if X.dtype.kind in ("i", "u", "f"): fill_value = 0 else: fill_value = "missing_value" @@ -769,8 +770,10 @@ def _initial_imputation(self, X): Input data's missing indicator matrix, where "n_samples" is the number of samples and "n_features" is the number of features. """ - force_all_finite = "allow-nan" if self.missing_values is np.nan \ - else True + if is_scalar_nan(self.missing_values): + force_all_finite = "allow-nan" + else: + force_all_finite = True X = check_array(X, dtype=FLOAT_DTYPES, order="F", force_all_finite=force_all_finite) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index e3d1e7faaabd1..385e170ea9bb6 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -2,6 +2,7 @@ The :mod:`sklearn.utils` module includes various utilities. """ from collections import Sequence +import numbers import numpy as np from scipy.sparse import issparse @@ -553,3 +554,21 @@ def get_chunk_n_rows(row_bytes, max_n_rows=None, (working_memory, np.ceil(row_bytes * 2 ** -20))) chunk_n_rows = 1 return chunk_n_rows + + +def is_scalar_nan(x): + """Tests if x is NaN + + This function is meant to overcome the issue that np.isnan does not allow + non-numerical types as input, and that np.nan is not np.float('nan'). 
+ + Parameters + ---------- + x : any type + + Returns + ------- + boolean + """ + + return isinstance(x, numbers.Real) and np.isnan(x) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 18e0f9d3c3ed8..cc41e7a2fad73 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -8,7 +8,6 @@ from copy import deepcopy import struct from functools import partial -import numbers import numpy as np from scipy import sparse @@ -37,6 +36,7 @@ from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_dict_equal from sklearn.utils.testing import create_memmap_backed_data +from sklearn.utils import is_scalar_nan from sklearn.discriminant_analysis import LinearDiscriminantAnalysis @@ -2036,10 +2036,7 @@ def param_filter(p): if isinstance(param_value, np.ndarray): assert_array_equal(param_value, init_param.default) else: - def _isscalarnan(x): - return isinstance(x, numbers.Real) and np.isnan(x) - - if _isscalarnan(param_value): + if is_scalar_nan(param_value): # Allows to set default parameters to np.nan assert param_value is init_param.default, init_param.name else: From e5f4a1bc12b2f66401c583e5cef76917672599c1 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 15 Jun 2018 11:28:28 +0200 Subject: [PATCH 22/31] add tests for is_scalar_nan --- doc/modules/impute.rst | 4 ++-- sklearn/utils/__init__.py | 13 +++++++++++++ sklearn/utils/tests/test_utils.py | 16 ++++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index 6356bc4ecf81c..fd01321599dbe 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -58,8 +58,8 @@ in the matrix. This format is thus suitable when there are many more missing values than observed values. 
The :class:`SimpleImputer` class also supports categorical data represented as -string values or pandas categoricals when using the `most_frequent` or -`constant` strategy:: +string values or pandas categoricals when using the ``'most_frequent'`` or +``'constant'`` strategy:: >>> import pandas as pd >>> df = pd.DataFrame([["a", "x"], diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 385e170ea9bb6..7821d92f97d78 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -569,6 +569,19 @@ def is_scalar_nan(x): Returns ------- boolean + + Examples + -------- + >>> is_scalar_nan(np.nan) + True + >>> is_scalar_nan(float("nan")) + True + >>> is_scalar_nan(None) + False + >>> is_scalar_nan("") + False + >>> is_scalar_nan([np.nan]) + False """ return isinstance(x, numbers.Real) and np.isnan(x) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 1f1efed825c80..9d63bee96bf15 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -21,6 +21,7 @@ from sklearn.utils import shuffle from sklearn.utils import gen_even_slices from sklearn.utils import get_chunk_n_rows +from sklearn.utils import is_scalar_nan from sklearn.utils.extmath import pinvh from sklearn.utils.arpack import eigsh from sklearn.utils.mocking import MockDataFrame @@ -314,3 +315,18 @@ def check_warning(*args, **kw): max_n_rows=max_n_rows) assert actual == expected assert type(actual) is type(expected) + + +@pytest.mark.parametrize("input, result", [(float("nan"), True), + (np.nan, True), + (np.float("nan"), True), + (np.float32("nan"), True), + (np.float64("nan"), True), + (0, False), + (0., False), + (None, False), + ("", False), + ("nan", False), + ([np.nan], False)]) +def test_is_scalar_nan(input, result): + assert is_scalar_nan(input) is result From d69f85588c3c64b9b26f1f538ee1066d3349f4b4 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 15 Jun 2018 11:56:10 +0200 Subject: [PATCH 23/31] fixed --- sklearn/impute.py | 4 ++-- sklearn/utils/__init__.py | 4 +++- sklearn/utils/tests/test_utils.py | 6 +++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 4aeff59b4b810..84728cc44f7cb 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -462,8 +462,8 @@ class MICEImputer(BaseEstimator, TransformerMixin): initial_strategy : str, optional (default="mean") Which strategy to use to initialize the missing values. Same as the - ``strategy`` parameter in :class:`sklearn.preprocessing.Imputer` - Valid values: {"mean", "median", or "most_frequent"}. + ``strategy`` parameter in :class:`sklearn.impute.SimpleImputer` + Valid values: {"mean", "median", "most_frequent", or "constant"}. min_value : float, optional (default=None) Minimum possible imputed value. Default of ``None`` will set minimum diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 7821d92f97d78..312efaa8a533a 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -584,4 +584,6 @@ def is_scalar_nan(x): False """ - return isinstance(x, numbers.Real) and np.isnan(x) + # convert from numpy.bool_ to python bool to ensure that testing + # is_scalar_nan(x) is True does not fail. 
+ return bool(isinstance(x, numbers.Real) and np.isnan(x)) diff --git a/sklearn/utils/tests/test_utils.py b/sklearn/utils/tests/test_utils.py index 9d63bee96bf15..c2474c58c13f7 100644 --- a/sklearn/utils/tests/test_utils.py +++ b/sklearn/utils/tests/test_utils.py @@ -317,7 +317,7 @@ def check_warning(*args, **kw): assert type(actual) is type(expected) -@pytest.mark.parametrize("input, result", [(float("nan"), True), +@pytest.mark.parametrize("value, result", [(float("nan"), True), (np.nan, True), (np.float("nan"), True), (np.float32("nan"), True), @@ -328,5 +328,5 @@ def check_warning(*args, **kw): ("", False), ("nan", False), ([np.nan], False)]) -def test_is_scalar_nan(input, result): - assert is_scalar_nan(input) is result +def test_is_scalar_nan(value, result): + assert is_scalar_nan(value) is result From c3a730d214d85b38f2536dd8b1c72b4bfd5fc318 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 15 Jun 2018 16:22:42 +0200 Subject: [PATCH 24/31] fixed v2 --- sklearn/tests/test_impute.py | 33 ++++++++++++++------------------- sklearn/utils/__init__.py | 4 +++- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 459b7c47c774d..d728a5a777cca 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -304,33 +304,28 @@ def test_imputation_constant_integer(): assert_array_equal(X_trans, X_true) -@pytest.mark.parametrize("format", ["csr", "array"]) -def test_imputation_constant_float(format): +@pytest.mark.parametrize("array_constructor", [sparse.csr_matrix, np.asarray]) +def test_imputation_constant_float(array_constructor): # Test imputation using the constant strategy on floats X = np.array([ - [np.nan, 1.1, 2.2, np.nan], - [3.3, np.nan, 4.4, np.nan], - [5.5, 6.6, np.nan, np.nan], - [7.7, 8.8, 9.9, np.nan] + [np.nan, 1.1, 0, np.nan], + [1.2, np.nan, 1.3, np.nan], + [0, 0, np.nan, np.nan], + [1.4, 1.5, 0, np.nan] ]) - X = sparse.csr_matrix(X) if format == "csr" else X - X_true = np.array([ - [0, 1.1, 2.2, 0], - [3.3, 0, 4.4, 0], - [5.5, 6.6, 0, 0], - [7.7, 8.8, 9.9, 0] + [-1, 1.1, 0, -1], + [1.2, -1, 1.3, -1], + [0, 0, -1, -1], + [1.4, 1.5, 0, -1] ]) - if format == "csr": - X_true = sparse.csr_matrix(X_true) - X_true[np.array([[True, False, False, True], - [False, True, False, True], - [False, False, True, True], - [False, False, False, True]])] = 0 + X = array_constructor(X) + + X_true = array_constructor(X_true) - imputer = SimpleImputer(strategy="constant", fill_value=0) + imputer = SimpleImputer(strategy="constant", fill_value=-1) X_trans = imputer.fit_transform(X) assert_allclose_dense_sparse(X_trans, X_true) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 312efaa8a533a..bb1f383505fe9 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -586,4 +586,6 @@ def is_scalar_nan(x): # convert from numpy.bool_ to python bool to ensure that testing # is_scalar_nan(x) is True does not fail. - return bool(isinstance(x, numbers.Real) and np.isnan(x)) + # Redondant np.floating is needed because numbers can't match np.float32 + # in python 2. 
+ return bool(isinstance(x, (numbers.Real, np.floating)) and np.isnan(x)) From 94d7964cf494f2ceb01521fa8084a1fd1d310a91 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Fri, 15 Jun 2018 20:59:01 +0200 Subject: [PATCH 25/31] adressed @jnothman remark --- sklearn/impute.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 84728cc44f7cb..3adc0a58ef70b 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -43,8 +43,11 @@ def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" if value_to_mask is np.nan: - if X.dtype.kind in ("i", "u", "f"): + if X.dtype.kind == "f": return np.isnan(X) + elif X.dtype.kind in ("i", "u"): + # can't have NaNs in integer array. + return np.zeros(X.shape, dtype=bool) else: # np.isnan does not work on object dtypes. return _object_dtype_isnan(X) From 20456f41ebbabad7197d5b6a16c38f5a2e1380b5 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Sun, 17 Jun 2018 17:35:12 +0200 Subject: [PATCH 26/31] add tests for warnings and errors catch --- sklearn/tests/test_impute.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index d728a5a777cca..b052bf1904cb3 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -78,6 +78,26 @@ def test_imputation_shape(): assert X_imputed.shape == (10, 2) +@pytest.mark.parametrize("strategy", ["const", 101, None]) +def test_imputation_error_invalid_strategy(strategy): + X = np.ones((3, 5)) + X[0, 0] = np.nan + + with pytest.raises(ValueError, match=str(strategy)): + imputer = SimpleImputer(strategy=strategy) + imputer.fit_transform(X) + + +@pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) +def test_imputation_deletion_warning(strategy): + X = np.ones((3, 5)) + X[:, 0] = np.nan + + with pytest.warns(UserWarning, match="Deleting"): + imputer = SimpleImputer(strategy=strategy, verbose=True) + imputer.fit_transform(X) + + def safe_median(arr, *args, **kwargs): # np.median([]) raises a TypeError for numpy >= 1.10.1 length = arr.size if hasattr(arr, 'size') else len(arr) From 7d3d1b54daced3d9c1e1eef5e31306e75346d128 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 18 Jun 2018 14:30:25 +0200 Subject: [PATCH 27/31] dtype checks modifications + more tests --- doc/whats_new/v0.20.rst | 11 ++++++++ sklearn/impute.py | 50 +++++++++++++++----------------- sklearn/tests/test_impute.py | 55 ++++++++++++++++++++++++++++-------- sklearn/utils/validation.py | 4 +++ 4 files changed, 82 insertions(+), 38 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 87569d8649d86..2df84310eb6bf 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -613,6 +613,17 @@ Imputer SimpleImputer().fit_transform(X.T).T)``). :issue:`10829` by :user:`Guillaume Lemaitre ` and :user:`Gilberto Olimpio `. +- The :class:`impute.SimpleImputer` has a new strategy, ``'constant'``, to + complete missing values with a fixed one, given by the ``fill_value`` + parameter. This strategy supports numeric and non-numeric data, and so does + the ``'most_frequent'`` strategy now. :issue:`11211` by :user:`Jeremie du + Boisberranger `. + +- The NaN marker for the missing values has been changed between the + :class:`preprocessing.Imputer` and the :class:`impute.SimpleImputer`. + ``missing_values='NaN'`` should now be ``missing_values=np.nan``. 
+ :issue:`11211` by :user:`Jeremie du Boisberranger `. + Outlier Detection models - More consistent outlier detection API: diff --git a/sklearn/impute.py b/sklearn/impute.py index 3adc0a58ef70b..cd70d0f58b894 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -99,8 +99,7 @@ class SimpleImputer(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : real number, string, np.nan or None, \ -optional (default=np.nan) + missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of `missing_values` will be imputed. @@ -108,12 +107,13 @@ class SimpleImputer(BaseEstimator, TransformerMixin): The imputation strategy. - If "mean", then replace missing values using the mean along - each column. + each column. Can only be used with numeric data. - If "median", then replace missing values using the median along - each column. + each column. Can only be used with numeric data. - If "most_frequent", then replace missing using the most frequent - value along each column. - - If "constant", then replace missing values with fill_value. + value along each column. Can be used with strings or numeric data. + - If "constant", then replace missing values with fill_value. Can be + used with strings or numeric data. fill_value : string or numerical value, optional When strategy == "constant", fill_value is used to replace all @@ -169,8 +169,17 @@ def _validate_input(self, X): else: force_all_finite = "allow-nan" - return check_array(X, accept_sparse='csc', dtype=dtype, - force_all_finite=force_all_finite, copy=self.copy) + try: + X = check_array(X, accept_sparse='csc', dtype=dtype, + force_all_finite=force_all_finite, copy=self.copy) + except TypeError: + raise TypeError("Cannot use {0} strategy with non-numeric " + "data.".format(self.strategy)) + + if X.dtype.kind not in ("i", "u", "f", "O"): + X = X.astype(object) + + return X def fit(self, X, y=None): """Fit the imputer on X. @@ -198,25 +207,12 @@ def fit(self, X, y=None): fill_value = self.fill_value # fill_value should be numerical in case of numerical input - if self.strategy == "constant": - if X.dtype.kind in ("i", "u", "f"): - if not isinstance(fill_value, numbers.Real): - raise TypeError( - "'fill_value'={0} is invalid. Expected a numerical" - " value when imputing numerical" - " data".format(fill_value)) - - elif X.dtype.kind == "O": - if not isinstance(fill_value, six.string_types): - raise TypeError( - "'fill_value'={0} is invalid. Expected an str instance" - " when imputing categorical data.".format(fill_value)) - - else: - raise TypeError( - "SimpleImputer cannot work on data with dtype={0}: " - "expecting numerical or categorical data with " - "dtype=object.".format(X.dtype)) + if (self.strategy == "constant" and + X.dtype.kind in ("i", "u", "f") and + not isinstance(fill_value, numbers.Real)): + raise TypeError("'fill_value'={0} is invalid. 
Expected a numerical" + " value when imputing numerical" + " data".format(fill_value)) if sparse.issparse(X): self.statistics_ = self._sparse_fit(X, diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index b052bf1904cb3..cab1996c0c14a 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -210,6 +210,44 @@ def test_imputation_median_special_cases(): statistics_median, np.nan) +@pytest.mark.parametrize("strategy", ["mean", "median"]) +@pytest.mark.parametrize("dtype", [None, object, str]) +def test_imputation_mean_median_error_invalid_type(strategy, dtype): + X = np.array([["a", "b", 3], + [4, "e", 6], + ["g", "h", 9]], dtype=dtype) + + with pytest.raises(TypeError, match="non-numeric data"): + imputer = SimpleImputer(strategy=strategy) + imputer.fit_transform(X) + + +@pytest.mark.parametrize("strategy", ["constant", "most_frequent"]) +@pytest.mark.parametrize("dtype", [None, object, str]) +def test_imputation_non_numeric(strategy, dtype): + # Test imputation on non-numeric data using "most_frequent" and "constant" + # strategy + X = np.array([ + ["", "a", "f"], + ["c", "d", "d"], + ["b", "d", "d"], + ["c", "d", "h"], + ], dtype=dtype) + + X_true = np.array([ + ["c", "a", "f"], + ["c", "d", "d"], + ["b", "d", "d"], + ["c", "d", "h"], + ], dtype=dtype) + + imputer = SimpleImputer(missing_values="", strategy=strategy, + fill_value="c") + X_trans = imputer.fit(X).transform(X) + + assert_array_equal(X_trans, X_true) + + def test_imputation_most_frequent(): # Test imputation using the most-frequent strategy. X = np.array([ @@ -283,21 +321,16 @@ def test_imputation_most_frequent_pandas(dtype): assert_array_equal(X_trans, X_true) -@pytest.mark.parametrize("X_data, missing_value, fill_value, dtype, match", - [(1, 0, "x", None, "imputing numerical"), - (1., np.nan, "x", None, "imputing numerical"), - ("a", "", 0, object, "imputing categorical"), - (True, "nan", "x", "c", "cannot work")]) -def test_imputation_constant_error_invalid_types(X_data, missing_value, - fill_value, dtype, match): - # Verify that exceptions are raised on invalid inputs - X = np.full((3, 5), X_data, dtype=dtype) +@pytest.mark.parametrize("X_data, missing_value", [(1, 0), (1., np.nan)]) +def test_imputation_constant_error_invalid_type(X_data, missing_value): + # Verify that exceptions are raised on invalid fill_value type + X = np.full((3, 5), X_data) X[0, 0] = missing_value - with pytest.raises(TypeError, match=match): + with pytest.raises(TypeError, match="imputing numerical"): imputer = SimpleImputer(missing_values=missing_value, strategy="constant", - fill_value=fill_value) + fill_value="x") imputer.fit_transform(X) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 5fd54dc49b078..2f62299a206b0 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -492,6 +492,10 @@ def check_array(array, accept_sparse=False, dtype="numeric", order=None, try: warnings.simplefilter('error', ComplexWarning) array = np.asarray(array, dtype=dtype, order=order) + except ValueError as ve: + if "convert" in ve: + raise TypeError("Invalid dtype conversion from {0} to " + "{1}".format(dtype_orig, dtype)) except ComplexWarning: raise ValueError("Complex data not supported\n" "{}\n".format(array)) From 972668b750a4378cbc771fdc7c18d95f21397481 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Mon, 18 Jun 2018 16:53:24 +0200 Subject: [PATCH 28/31] fixed exception catching + go back to not allow any but object dtype --- sklearn/impute.py | 15 
+++++++++++---- sklearn/tests/test_impute.py | 28 ++++++++++------------------ sklearn/utils/validation.py | 4 ---- 3 files changed, 21 insertions(+), 26 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index cd70d0f58b894..e6a4614da5f62 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -172,12 +172,19 @@ def _validate_input(self, X): try: X = check_array(X, accept_sparse='csc', dtype=dtype, force_all_finite=force_all_finite, copy=self.copy) - except TypeError: - raise TypeError("Cannot use {0} strategy with non-numeric " - "data.".format(self.strategy)) + except ValueError as ve: + if "could not convert" in str(ve): + raise TypeError("Cannot use {0} strategy with non-numeric " + "data. Received datatype :{1}." + "".format(self.strategy, X.dtype.kind)) + else: + raise ve if X.dtype.kind not in ("i", "u", "f", "O"): - X = X.astype(object) + raise TypeError("The SimpleImputer does not support this datatype" + " ({0}). Please provide either numeric data or" + " categorical data represented by integer or " + "object datatypes.".format(X.dtype)) return X diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index cab1996c0c14a..7dbbe6bd378f1 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -223,29 +223,21 @@ def test_imputation_mean_median_error_invalid_type(strategy, dtype): @pytest.mark.parametrize("strategy", ["constant", "most_frequent"]) -@pytest.mark.parametrize("dtype", [None, object, str]) -def test_imputation_non_numeric(strategy, dtype): +@pytest.mark.parametrize("dtype", [str, np.dtype('U'), np.dtype('S')]) +def test_imputation_const_mostf_error_invalid_types(strategy, dtype): # Test imputation on non-numeric data using "most_frequent" and "constant" # strategy X = np.array([ - ["", "a", "f"], - ["c", "d", "d"], - ["b", "d", "d"], - ["c", "d", "h"], + [np.nan, np.nan, "a", "f"], + [np.nan, "c", np.nan, "d"], + [np.nan, "b", "d", np.nan], + [np.nan, "c", "d", "h"], ], dtype=dtype) - X_true = np.array([ - ["c", "a", "f"], - ["c", "d", "d"], - ["b", "d", "d"], - ["c", "d", "h"], - ], dtype=dtype) - - imputer = SimpleImputer(missing_values="", strategy=strategy, - fill_value="c") - X_trans = imputer.fit(X).transform(X) - - assert_array_equal(X_trans, X_true) + err_msg = "SimpleImputer does not support this datatype" + with pytest.raises(TypeError, match=err_msg): + imputer = SimpleImputer(strategy=strategy) + imputer.fit(X).transform(X) def test_imputation_most_frequent(): diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 2f62299a206b0..5fd54dc49b078 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -492,10 +492,6 @@ def check_array(array, accept_sparse=False, dtype="numeric", order=None, try: warnings.simplefilter('error', ComplexWarning) array = np.asarray(array, dtype=dtype, order=order) - except ValueError as ve: - if "convert" in ve: - raise TypeError("Invalid dtype conversion from {0} to " - "{1}".format(dtype_orig, dtype)) except ComplexWarning: raise ValueError("Complex data not supported\n" "{}\n".format(array)) From f1da7b8a218c99e78800fafa68b92f754071f5ae Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 20 Jun 2018 10:31:04 +0200 Subject: [PATCH 29/31] error message update --- sklearn/impute.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index e6a4614da5f62..a17e653117ad1 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -174,17 +174,19 @@ def 
_validate_input(self, X): force_all_finite=force_all_finite, copy=self.copy) except ValueError as ve: if "could not convert" in str(ve): - raise TypeError("Cannot use {0} strategy with non-numeric " - "data. Received datatype :{1}." - "".format(self.strategy, X.dtype.kind)) + raise ValueError("Cannot use {0} strategy with non-numeric " + "data. Received datatype :{1}." + "".format(self.strategy, X.dtype.kind)) else: raise ve if X.dtype.kind not in ("i", "u", "f", "O"): - raise TypeError("The SimpleImputer does not support this datatype" - " ({0}). Please provide either numeric data or" - " categorical data represented by integer or " - "object datatypes.".format(X.dtype)) + raise ValueError("SimpleImputer does not work on data with dtype " + "{0}. Please provide either a numeric array (with" + " a floating point or integer dtype) or " + "categorical data represented either as an array " + "with integer dtype or an array of string values " + "with an object dtype.".format(X.dtype)) return X From fb1a4e9f4adde5ea5221ef3519034107ca387548 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 20 Jun 2018 10:40:15 +0200 Subject: [PATCH 30/31] with tests update is better --- sklearn/impute.py | 2 +- sklearn/tests/test_impute.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index a17e653117ad1..eace23aebf1d8 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -181,7 +181,7 @@ def _validate_input(self, X): raise ve if X.dtype.kind not in ("i", "u", "f", "O"): - raise ValueError("SimpleImputer does not work on data with dtype " + raise ValueError("SimpleImputer does not support data with dtype " "{0}. Please provide either a numeric array (with" " a floating point or integer dtype) or " "categorical data represented either as an array " diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 7dbbe6bd378f1..211d7122a40ee 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -217,7 +217,7 @@ def test_imputation_mean_median_error_invalid_type(strategy, dtype): [4, "e", 6], ["g", "h", 9]], dtype=dtype) - with pytest.raises(TypeError, match="non-numeric data"): + with pytest.raises(ValueError, match="non-numeric data"): imputer = SimpleImputer(strategy=strategy) imputer.fit_transform(X) @@ -234,8 +234,8 @@ def test_imputation_const_mostf_error_invalid_types(strategy, dtype): [np.nan, "c", "d", "h"], ], dtype=dtype) - err_msg = "SimpleImputer does not support this datatype" - with pytest.raises(TypeError, match=err_msg): + err_msg = "SimpleImputer does not support data" + with pytest.raises(ValueError, match=err_msg): imputer = SimpleImputer(strategy=strategy) imputer.fit(X).transform(X) From c8246f2549ddbd2680fb3996dfd39689a2665bd0 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 20 Jun 2018 15:23:36 +0200 Subject: [PATCH 31/31] TypeError -> ValueError --- sklearn/impute.py | 6 +++--- sklearn/tests/test_impute.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index eace23aebf1d8..15e719b7f1d13 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -219,9 +219,9 @@ def fit(self, X, y=None): if (self.strategy == "constant" and X.dtype.kind in ("i", "u", "f") and not isinstance(fill_value, numbers.Real)): - raise TypeError("'fill_value'={0} is invalid. Expected a numerical" - " value when imputing numerical" - " data".format(fill_value)) + raise ValueError("'fill_value'={0} is invalid. 
Expected a " + "numerical value when imputing numerical " + "data".format(fill_value)) if sparse.issparse(X): self.statistics_ = self._sparse_fit(X, diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 211d7122a40ee..170d94333bf44 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -319,7 +319,7 @@ def test_imputation_constant_error_invalid_type(X_data, missing_value): X = np.full((3, 5), X_data) X[0, 0] = missing_value - with pytest.raises(TypeError, match="imputing numerical"): + with pytest.raises(ValueError, match="imputing numerical"): imputer = SimpleImputer(missing_values=missing_value, strategy="constant", fill_value="x")