diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index a25fd9fb49b3b..bbe65ec15f9ca 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -388,9 +388,10 @@ values. However, this comes at the price of losing data which may be valuable
 i.e., to infer them from the known part of the data.
 
 The :class:`Imputer` class provides basic strategies for imputing missing
-values, either using the mean, the median or the most frequent value of
-the row or column in which the missing values are located. This class
-also allows for different missing values encodings.
+values: the mean, the median or the most frequent value of the row or column
+in which the missing values are located, or, with ``strategy="knn"``, the
+mean of the k-nearest neighbors, computed over the samples that have no
+missing values. The placeholder for missing values is configurable.
 
 The following snippet demonstrates how to replace missing values,
 encoded as ``np.nan``, using the mean value of the columns (axis 0)
@@ -399,8 +400,8 @@ that contain the missing values::
     >>> import numpy as np
     >>> from sklearn.preprocessing import Imputer
     >>> imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
-    >>> imp.fit([[1, 2], [np.nan, 3], [7, 6]])
-    Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)
+    >>> imp.fit([[1, 2], [np.nan, 3], [7, 6]])  # doctest: +NORMALIZE_WHITESPACE
+    Imputer(axis=0, copy=True, missing_values='NaN', n_neighbors=1, strategy='mean', verbose=0)
     >>> X = [[np.nan, 2], [6, np.nan], [7, 6]]
     >>> print(imp.transform(X))                           # doctest: +ELLIPSIS
     [[ 4.          2.        ]
@@ -412,8 +413,8 @@ The :class:`Imputer` class also supports sparse matrices::
     >>> import scipy.sparse as sp
     >>> X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]])
     >>> imp = Imputer(missing_values=0, strategy='mean', axis=0)
-    >>> imp.fit(X)
-    Imputer(axis=0, copy=True, missing_values=0, strategy='mean', verbose=0)
+    >>> imp.fit(X)  # doctest: +NORMALIZE_WHITESPACE
+    Imputer(axis=0, copy=True, missing_values=0, n_neighbors=1, strategy='mean', verbose=0)
     >>> X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]])
     >>> print(imp.transform(X_test))                      # doctest: +ELLIPSIS
     [[ 4.          2.        ]
@@ -424,5 +425,9 @@ Note that, here, missing values are encoded by 0 and are thus implicitly
 stored in the matrix. This format is thus suitable when there are many more
 missing values than observed values.
 
+When using ``strategy="knn"``, only samples without any missing features are
+used to compute the imputation values. If all samples have missing features,
+this strategy will fail.
+
 :class:`Imputer` can be used in a Pipeline as a way to build a composite
 estimator that supports imputation. See :ref:`example_missing_values.py`
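For orientation, here is a minimal sketch of how the proposed ``strategy="knn"``
would be called once this patch is applied. The expected output is worked out by
hand from the patch's semantics (both complete rows are averaged when
``n_neighbors=2``), not copied from a test run::

    import numpy as np
    from sklearn.preprocessing import Imputer

    # fit() must see at least n_neighbors rows without missing values;
    # here the complete rows are [1, 2] and [7, 6].
    imp = Imputer(missing_values='NaN', strategy='knn', axis=0, n_neighbors=2)
    imp.fit([[1, 2], [np.nan, 3], [7, 6]])

    # Each missing entry becomes the mean of that feature over the two
    # nearest complete rows; with both rows used, each fill value is 4.0.
    print(imp.transform([[np.nan, 2], [6, np.nan], [7, 6]]))
    # [[ 4.  2.]
    #  [ 6.  4.]
    #  [ 7.  6.]]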
diff --git a/examples/missing_values.py b/examples/missing_values.py
index 59444b36490e3..1c208887d94ee 100644
--- a/examples/missing_values.py
+++ b/examples/missing_values.py
@@ -8,23 +8,24 @@
 Imputing does not always improve the predictions, so please check via
 cross-validation. Sometimes dropping rows or using marker values is more
 effective.
 
-Missing values can be replaced by the mean, the median or the most frequent
-value using the ``strategy`` hyper-parameter.
+Missing values can be replaced by the mean, the median, the most frequent
+value or the mean of the k-nearest neighbors via the ``strategy`` parameter.
 The median is a more robust estimator for data with high magnitude variables
 which could dominate results (otherwise known as a 'long tail').
 
 Script output::
 
-  Score with the entire dataset = 0.56
-  Score without the samples containing missing values = 0.48
-  Score after imputation of the missing values = 0.55
+  Score with the entire dataset = 0.43
+  Score without the samples containing missing values = 0.35
+  Score after mean imputation of the missing values = 0.42
+  Score after knn imputation with 7 neighbors of the missing values = 0.43
 
 In this case, imputing helps the classifier get close to the original score.
- 
+
 """
 import numpy as np
 
-from sklearn.datasets import load_boston
+from sklearn.datasets import load_diabetes
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import Imputer
@@ -32,41 +33,56 @@
 
 rng = np.random.RandomState(0)
 
-dataset = load_boston()
+dataset = load_diabetes()
 X_full, y_full = dataset.data, dataset.target
 n_samples = X_full.shape[0]
 n_features = X_full.shape[1]
 
+# Create a random matrix to decide which entries become missing values.
+missing_matrix = rng.rand(n_samples, n_features)
+
+# Each sample keeps all its features with probability (1 - th) ** n_features.
+th = 0.14
+mask = missing_matrix < th
+missing_samples = mask.any(axis=1)
+full_fraction = (n_samples - missing_samples.sum()) / float(n_samples)
+print("Fraction of samples with complete features: %f" % full_fraction)
+
 # Estimate the score on the entire dataset, with no missing values
+
 estimator = RandomForestRegressor(random_state=0, n_estimators=100)
 score = cross_val_score(estimator, X_full, y_full).mean()
 print("Score with the entire dataset = %.2f" % score)
 
-# Add missing values in 75% of the lines
-missing_rate = 0.75
-n_missing_samples = np.floor(n_samples * missing_rate)
-missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
-                                      dtype=np.bool),
-                             np.ones(n_missing_samples,
-                                     dtype=np.bool)))
-rng.shuffle(missing_samples)
-missing_features = rng.randint(0, n_features, n_missing_samples)
 
 # Estimate the score without the lines containing missing values
+
 X_filtered = X_full[~missing_samples, :]
 y_filtered = y_full[~missing_samples]
 estimator = RandomForestRegressor(random_state=0, n_estimators=100)
 score = cross_val_score(estimator, X_filtered, y_filtered).mean()
 print("Score without the samples containing missing values = %.2f" % score)
 
-# Estimate the score after imputation of the missing values
+# Estimate the score after mean imputation of the missing values
+
 X_missing = X_full.copy()
-X_missing[np.where(missing_samples)[0], missing_features] = 0
+X_missing[mask] = np.nan
 y_missing = y_full.copy()
-estimator = Pipeline([("imputer", Imputer(missing_values=0,
-                                          strategy="mean",
+
+estimator = Pipeline([("imputer", Imputer(strategy="mean",
                                           axis=0)),
                       ("forest", RandomForestRegressor(random_state=0,
                                                        n_estimators=100))])
 score = cross_val_score(estimator, X_missing, y_missing).mean()
-print("Score after imputation of the missing values = %.2f" % score)
+print("Score after mean imputation of the missing values = %.2f" % score)
+
+# Estimate the score after knn imputation of the missing values
+
+neigh = 7  # number of neighbors to use
+estimator2 = Pipeline([("imputer", Imputer(strategy="knn",
+                                           axis=0, n_neighbors=neigh)),
+                       ("forest", RandomForestRegressor(random_state=0,
+                                                        n_estimators=100))])
+score = cross_val_score(estimator2, X_missing, y_missing).mean()
+print("Score after knn imputation with %d neighbors of the missing values ="
+      " %.2f" % (neigh, score))
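As a sanity check on the masking logic in the example: ``load_diabetes`` has
442 samples and 10 features, so with ``th = 0.14`` a sample stays complete with
probability ``0.86 ** 10``, roughly 0.22. A standalone sketch::

    import numpy as np

    rng = np.random.RandomState(0)
    th, n_samples, n_features = 0.14, 442, 10

    # Each entry goes missing independently with probability th, so a
    # sample keeps all of its features with probability (1 - th) ** 10.
    mask = rng.rand(n_samples, n_features) < th
    print((~mask.any(axis=1)).mean())  # empirically close to 0.86 ** 10 ~ 0.22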
diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
index 0ef23c471bd60..ab0a3d1602058 100644
--- a/sklearn/preprocessing/imputation.py
+++ b/sklearn/preprocessing/imputation.py
@@ -2,7 +2,6 @@
 # License: BSD 3 clause
 
 import warnings
-
 import numpy as np
 import numpy.ma as ma
 from scipy import sparse
@@ -14,7 +13,7 @@
 from ..utils.fixes import astype
 from ..utils.sparsefuncs import _get_median
 from ..utils.validation import check_is_fitted
-
+from ..utils import gen_batches
 from ..externals import six
 
 zip = six.moves.zip
@@ -61,6 +60,7 @@ def _most_frequent(array, extra_value, n_repeat):
         return extra_value
 
 
+
 class Imputer(BaseEstimator, TransformerMixin):
     """Imputation transformer for completing missing values.
@@ -82,12 +82,15 @@ class Imputer(BaseEstimator, TransformerMixin):
           the axis.
         - If "most_frequent", then replace missing using the most frequent
           value along the axis.
+        - If "knn", then replace missing using the mean of the k-nearest
+          neighbors along the axis. Only samples with no missing values are
+          considered as neighbors.
 
     axis : integer, optional (default=0)
         The axis along which to impute.
 
-        - If `axis=0`, then impute along columns.
-        - If `axis=1`, then impute along rows.
+        - If ``axis=0``, then impute along columns.
+        - If ``axis=1``, then impute along rows.
 
     verbose : integer, optional (default=0)
         Controls the verbosity of the imputer.
@@ -99,13 +102,18 @@ class Imputer(BaseEstimator, TransformerMixin):
 
         - If X is not an array of floating values;
        - If X is sparse and `missing_values=0`;
-        - If `axis=0` and X is encoded as a CSR matrix;
-        - If `axis=1` and X is encoded as a CSC matrix.
+        - If ``axis=0`` and X is encoded as a CSR matrix;
+        - If ``axis=1`` and X is encoded as a CSC matrix.
+
+    n_neighbors : int, optional (default=1)
+        The number of nearest neighbors used to compute the mean along the
+        axis. Only used when ``strategy="knn"``.
 
     Attributes
     ----------
     statistics_ : array of shape (n_features,)
         The imputation fill value for each feature if axis == 0.
+        If ``strategy="knn"``, it holds the samples with no missing values.
 
     Notes
     -----
@@ -114,14 +122,16 @@ class Imputer(BaseEstimator, TransformerMixin):
     - When ``axis=1``, an exception is raised if there are rows for which it is
       not possible to fill in the missing values (e.g., because they only
       contain missing values).
+    - The "knn" strategy does not currently support sparse input.
     """
     def __init__(self, missing_values="NaN", strategy="mean",
-                 axis=0, verbose=0, copy=True):
+                 axis=0, verbose=0, copy=True, n_neighbors=1):
         self.missing_values = missing_values
         self.strategy = strategy
         self.axis = axis
         self.verbose = verbose
         self.copy = copy
+        self.n_neighbors = n_neighbors
 
     def fit(self, X, y=None):
         """Fit the imputer on X.
""" # Check parameters - allowed_strategies = ["mean", "median", "most_frequent"] + allowed_strategies = ["mean", "median", "most_frequent", "knn"] if self.strategy not in allowed_strategies: raise ValueError("Can only use these strategies: {0} " " got strategy={1}".format(allowed_strategies, @@ -248,6 +258,12 @@ def _sparse_fit(self, X, strategy, missing_values, axis): return most_frequent + # KNN + elif strategy == "knn": + raise ValueError("strategy='knn' does not support sparse " + "matrix input") + + def _dense_fit(self, X, strategy, missing_values, axis): """Fit the transformer on dense data.""" X = check_array(X, force_all_finite=False) @@ -299,6 +315,27 @@ def _dense_fit(self, X, strategy, missing_values, axis): return most_frequent + # KNN + elif strategy == "knn": + + if axis == 1: + X = X.transpose() + mask = mask.transpose() + + # Get samples with complete features + full_data = X[np.logical_not(mask.any(axis=1))] + if full_data.size == 0: + raise ValueError("There is no sample with complete data.") + if full_data.shape[0] < self.n_neighbors: + raise ValueError("There are only %d complete samples, " + "but n_neighbors=%d." + % (full_data.shape[0], self.n_neighbors)) + # Transpose back + if axis == 1: + full_data = full_data.transpose() + + return full_data + def transform(self, X): """Impute all missing values in X. @@ -341,7 +378,9 @@ def transform(self, X): valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.where(valid_mask)[0] - missing = np.arange(X.shape[not self.axis])[invalid_mask] + + if self.strategy != "knn": + missing = np.arange(X.shape[not self.axis])[invalid_mask] if self.axis == 0 and invalid_mask.any(): if self.verbose: @@ -366,13 +405,53 @@ def transform(self, X): mask = _get_mask(X, self.missing_values) n_missing = np.sum(mask, axis=self.axis) - values = np.repeat(valid_statistics, n_missing) - if self.axis == 0: - coordinates = np.where(mask.transpose())[::-1] + if self.strategy == 'knn': + if self.axis == 1: + X = X.transpose() + mask = mask.transpose() + statistics = statistics.transpose() + + batch_size = 1 # set batch size for block query + + missing_index = np.where(mask.any(axis=1))[0] + D2 = np.empty_like(np.zeros([batch_size, statistics.shape[0], + statistics.shape[1]])) + + # Preallocate output array for np.multiply(test1, test1, out=D2) + for sl in gen_batches(len(missing_index), batch_size): + X_sl = X[missing_index[sl]] + mask_sl = mask[missing_index[sl]] + X_sl[mask_sl] = np.nan + impute_dist = X_sl[:][:, np.newaxis, :] - statistics + + # For the last slice, the length may not be the same + # as batch_size + if impute_dist.shape != D2.shape: + D2 = np.empty_like(impute_dist) + + np.multiply(impute_dist, impute_dist, out=D2) + D2[np.isnan(D2)] = 0 + missing_row, missing_col = np.where(np.isnan(X_sl)) + sqdist = D2.sum(axis=2) + target_index = np.argsort(sqdist, axis=1)[:, :self.n_neighbors] + means = np.mean(statistics[target_index], axis=1) + X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0], + missing_col] + X[missing_index[sl]] = X_sl + + if self.axis == 1: + X = X.transpose() + else: - coordinates = mask + values = np.repeat(valid_statistics, n_missing) - X[coordinates] = values + if self.axis == 0: + coordinates = np.where(mask.transpose())[::-1] + else: + coordinates = mask + + X[coordinates] = values return X + diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index bfcfc2a753b6a..0db13ff950390 100644 
diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py
index bfcfc2a753b6a..0db13ff950390 100644
--- a/sklearn/preprocessing/tests/test_imputation.py
+++ b/sklearn/preprocessing/tests/test_imputation.py
@@ -4,6 +4,7 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_raises
+from sklearn.utils.testing import assert_raise_message
 from sklearn.utils.testing import assert_false
 from sklearn.utils.testing import assert_true
 
@@ -345,3 +346,101 @@ def test_imputation_copy():
 
     # Note: If X is sparse and if missing_values=0, then a (dense) copy of X is
     # made, even if copy=False.
+
+
+def test_imputation_knn():
+    # Test imputation using the knn strategy.
+    X = np.array([
+        [np.nan, -1, 0, 5],
+        [0, 2, -1, 3],
+        [-1, -1, 0, 5],
+        [-1, 2, 3, 7],
+    ])
+
+    X_true_1 = np.array([
+        [-1, -1, 0, 5],
+        [0, 2, -1, 3],
+        [-1, -1, 0, 5],
+        [-1, 2, 3, 7],
+    ])
+
+    X2 = np.array([
+        [np.nan, -1, 0, np.nan],
+        [0, 2, -1, 3],
+        [-1, -1, 0, 5],
+        [-1, 2, 3, 7],
+    ])
+
+    X_true_2 = np.array([
+        [-0.5, -1, 0, 5],
+        [0, 2, -1, 3],
+        [-1, -1, 0, 5],
+        [-1, 2, 3, 7],
+    ])
+
+    # X3 has a missing value in every sample.
+    X3 = np.array([
+        [np.nan, -1, 0, 5],
+        [0, np.nan, -1, 3],
+        [-1, -1, np.nan, 5],
+        [-1, 2, 3, np.nan],
+    ])
+
+    X4 = np.array([
+        [np.nan, -1, 0, 5],
+        [np.nan, 2, -1, 3],
+        [-1, -1, 0, 5],
+        [0, 2, -1, 6],
+    ])
+
+    X_true_4 = np.array([
+        [-1, -1, 0, 5],
+        [0, 2, -1, 3],
+        [-1, -1, 0, 5],
+        [0, 2, -1, 6],
+    ])
+
+    # X5 marks missing values with 999 instead of NaN.
+    X5 = np.array([
+        [999, -1, 0, 5],
+        [0, 2, -1, 3],
+        [-1, -1, 0, 5],
+        [-1, 2, 3, 7],
+    ])
+
+    imputer = Imputer(missing_values='NaN', strategy="knn",
+                      axis=0, n_neighbors=1)
+    X_impute = imputer.fit(X).transform(X)
+    assert_array_equal(X_true_1, X_impute)
+
+    imputer = Imputer(missing_values='NaN', strategy="knn",
+                      axis=1, n_neighbors=1)
+    X_impute = imputer.fit(X.transpose()).transform(X.transpose())
+    assert_array_equal(X_true_1.transpose(), X_impute)
+
+    imputer = Imputer(missing_values='NaN', strategy="knn",
+                      axis=0, n_neighbors=2)
+    X_impute = imputer.fit(X).transform(X)
+    assert_array_equal(X_true_2, X_impute)
+
+    imputer = Imputer(missing_values='NaN', strategy="knn",
+                      axis=0, n_neighbors=1)
+    X_impute = imputer.fit(X2).transform(X2)
+    assert_array_equal(X_true_1, X_impute)
+
+    imputer = Imputer(missing_values='NaN', strategy="knn",
+                      axis=0, n_neighbors=1)
+    X_impute = imputer.fit(X4).transform(X4)
+    assert_array_equal(X_true_4, X_impute)
+
+    # With copy=False the imputation is done in place.
+    imputer = Imputer(missing_values=999, strategy="knn",
+                      axis=0, n_neighbors=1, copy=False)
+    X5 = X5.astype(float)
+    X_impute = imputer.fit(X5).transform(X5)
+    assert_array_equal(X_true_1, X5)
+    assert_array_equal(X_impute, X5)
+
+    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0)
+    msg = "There is no sample with complete data."
+    assert_raise_message(ValueError, msg, imputer.fit, X3)
+
+    imputer = Imputer(missing_values='NaN', strategy="knn", axis=0,
+                      n_neighbors=4)
+    msg = "There are only 3 complete samples, but n_neighbors=4."
+    assert_raise_message(ValueError, msg, imputer.fit, X)
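A quick hand check of the ``n_neighbors=2`` expectation above, pitting the
first row of ``X`` against its three complete rows (standalone NumPy, mirroring
the test data)::

    import numpy as np

    complete = np.array([[0., 2., -1., 3.],
                         [-1., -1., 0., 5.],
                         [-1., 2., 3., 7.]])
    row = np.array([np.nan, -1., 0., 5.])

    d = row - complete
    d[np.isnan(d)] = 0                         # ignore the missing dimension
    order = np.argsort((d ** 2).sum(axis=1))   # squared distances: 14, 0, 22
    print(complete[order[:2], 0].mean())       # (-1 + 0) / 2 = -0.5, X_true_2[0, 0]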