From ac8fdbd88217a67ca7d228d69c90eb1e6f976e51 Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Wed, 10 Jun 2015 14:22:56 -0400 Subject: [PATCH 01/17] add simple knn imputation/ only works for dense matrix --- sklearn/preprocessing/imputation.py | 73 ++++++++++++++++--- .../preprocessing/tests/test_imputation.py | 66 ++++++++++++++++- 2 files changed, 129 insertions(+), 10 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 0ef23c471bd60..cfcaef271c05c 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -8,6 +8,7 @@ from scipy import sparse from scipy import stats +from ..neighbors import KDTree, NearestNeighbors from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils import as_float_array @@ -61,6 +62,7 @@ def _most_frequent(array, extra_value, n_repeat): return extra_value + class Imputer(BaseEstimator, TransformerMixin): """Imputation transformer for completing missing values. @@ -82,6 +84,8 @@ class Imputer(BaseEstimator, TransformerMixin): the axis. - If "most_frequent", then replace missing using the most frequent value along the axis. + - If "knn", then replace missing using the mean of the k-nearest neighbors + along the axis. axis : integer, optional (default=0) The axis along which to impute. @@ -102,6 +106,10 @@ class Imputer(BaseEstimator, TransformerMixin): - If `axis=0` and X is encoded as a CSR matrix; - If `axis=1` and X is encoded as a CSC matrix. + kneighbor : int, optional (default=1) + It only has effect if the strategy is "knn". It controls the number of nearest + neighbors used to compute the mean along the axis. + Attributes ---------- statistics_ : array of shape (n_features,) @@ -116,12 +124,13 @@ class Imputer(BaseEstimator, TransformerMixin): contain missing values). """ def __init__(self, missing_values="NaN", strategy="mean", - axis=0, verbose=0, copy=True): + axis=0, verbose=0, copy=True, kneighbor=1): self.missing_values = missing_values self.strategy = strategy self.axis = axis self.verbose = verbose self.copy = copy + self.kneighbor = kneighbor def fit(self, X, y=None): """Fit the imputer on X. @@ -138,7 +147,7 @@ def fit(self, X, y=None): Returns self. """ # Check parameters - allowed_strategies = ["mean", "median", "most_frequent"] + allowed_strategies = ["mean", "median", "most_frequent", "knn"] if self.strategy not in allowed_strategies: raise ValueError("Can only use these strategies: {0} " " got strategy={1}".format(allowed_strategies, @@ -248,6 +257,10 @@ def _sparse_fit(self, X, strategy, missing_values, axis): return most_frequent + elif strategy == "knn": + raise ValueError("Sparse matrix not supported!") + + def _dense_fit(self, X, strategy, missing_values, axis): """Fit the transformer on dense data.""" X = check_array(X, force_all_finite=False) @@ -299,6 +312,22 @@ def _dense_fit(self, X, strategy, missing_values, axis): return most_frequent + # KNN + elif strategy == "knn": + + if axis == 1: + X = X.copy().transpose() + + full_data = X[np.logical_not(mask.any(1))] + if full_data.size == 0: + raise ValueError("There is no row with complete data!") + if full_data.shape[0] < self.kneighbor: + raise ValueError("There are at most %d neighbors!" %(full_data.shape[0])) + if axis == 1: + full_data = full_data.transpose() + + return full_data + def transform(self, X): """Impute all missing values in X. @@ -341,7 +370,9 @@ def transform(self, X): valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] valid_statistics_indexes = np.where(valid_mask)[0] - missing = np.arange(X.shape[not self.axis])[invalid_mask] + + if self.strategy != "knn": + missing = np.arange(X.shape[not self.axis])[invalid_mask] if self.axis == 0 and invalid_mask.any(): if self.verbose: @@ -366,13 +397,37 @@ def transform(self, X): mask = _get_mask(X, self.missing_values) n_missing = np.sum(mask, axis=self.axis) - values = np.repeat(valid_statistics, n_missing) - if self.axis == 0: - coordinates = np.where(mask.transpose())[::-1] + if self.strategy == 'knn': + if self.axis == 1: + X = X.transpose() + mask = mask.transpose() + statistics = statistics.transpose() + missing_index = np.where(mask.any(1))[0] + for i, row in zip(missing_index, X[missing_index]): + col_index = np.where(np.logical_not(np.isnan(row)))[0] + impute_index = np.where(np.isnan(row))[0] + neigh = NearestNeighbors(self.kneighbor) + neigh = neigh.fit(statistics[:, col_index]) + _dist, ind = neigh.kneighbors(row[np.logical_not(np.isnan(row))], + self.kneighbor) + #tree = KDTree(statistics[:, col_index]) + #dist, ind = tree.query(row[np.logical_not(np.isnan(row))], + # k=self.kneighbor) + nn_index = ind[0] + X[i][impute_index] = statistics[nn_index][:, impute_index].mean(0) + + if self.axis == 1: + X = X.transpose() + else: - coordinates = mask + values = np.repeat(valid_statistics, n_missing) + + if self.axis == 0: + coordinates = np.where(mask.transpose())[::-1] + else: + coordinates = mask - X[coordinates] = values + X[coordinates] = values - return X + return X \ No newline at end of file diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index bfcfc2a753b6a..fbd228367bbab 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -4,6 +4,7 @@ from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_true @@ -15,7 +16,7 @@ def _check_statistics(X, X_true, - strategy, statistics, missing_values): + strategy, statistics, missing_values, kneighbor=1): """Utility function for testing imputation for a given strategy. Test: @@ -345,3 +346,66 @@ def test_imputation_copy(): # Note: If X is sparse and if missing_values=0, then a (dense) copy of X is # made, even if copy=False. + +def test_imputation_knn(): + # Test imputation using knn strategy. + X = np.array([ + [np.nan, -1, 0, 5], + [0, 2, -1, 3], + [-1, -1, 0, 5], + [-1, 2, 3, 7], + ]) + + X2 = np.array([ + [np.nan, -1, 0, np.nan], + [0, 2, -1, 3], + [-1, -1, 0, 5], + [-1, 2, 3, 7], + ]) + + X3 = np.array([ + [np.nan, -1, 0, 5], + [0, np.nan, -1, 3], + [-1, -1, np.nan, 5], + [-1, 2, 3, np.nan], + ]) + + X_true_1 = np.array([ + [-1, -1, 0, 5], + [0, 2, -1, 3], + [-1, -1, 0, 5], + [-1, 2, 3, 7], + ]) + + X_true_2 = np.array([ + [-0.5, -1, 0, 5], + [0, 2, -1, 3], + [-1, -1, 0, 5], + [-1, 2, 3, 7], + ]) + + + imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=1) + X_impute = imputer.fit(X).transform(X) + assert_array_equal(X_true_1, X_impute) + + imputer = Imputer(missing_values='NaN', strategy="knn", axis=1, kneighbor=1) + X_impute = imputer.fit(X.transpose()).transform(X.transpose()) + assert_array_equal(X_true_1.transpose(), X_impute) + + imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=2) + X_impute = imputer.fit(X).transform(X) + assert_array_equal(X_true_2, X_impute) + + imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=1) + X_impute = imputer.fit(X2).transform(X2) + assert_array_equal(X_true_1, X_impute) + + imputer = Imputer(missing_values='NaN', strategy="knn", axis=0) + msg = "There is no row with complete data!" + assert_raise_message(ValueError, msg, imputer.fit, X3) + + imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=4) + msg = "There are at most 3 neighbors!" + assert_raise_message(ValueError, msg, imputer.fit, X) + From de2182d4750ad800d0a529c2aaeccee8cf76da93 Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Thu, 11 Jun 2015 13:40:20 -0400 Subject: [PATCH 02/17] add examples for knn imputation, fix error messages, add possible euclidean calculation method and add some tests --- examples/missing_values.py | 45 +++++++++++------ sklearn/preprocessing/imputation.py | 49 ++++++++++++------- .../preprocessing/tests/test_imputation.py | 34 ++++++++++--- 3 files changed, 86 insertions(+), 42 deletions(-) diff --git a/examples/missing_values.py b/examples/missing_values.py index 59444b36490e3..8952590e7dcfb 100644 --- a/examples/missing_values.py +++ b/examples/missing_values.py @@ -8,23 +8,24 @@ Imputing does not always improve the predictions, so please check via cross-validation. Sometimes dropping rows or using marker values is more effective. -Missing values can be replaced by the mean, the median or the most frequent -value using the ``strategy`` hyper-parameter. +Missing values can be replaced by the mean, the median, the most frequent +value or the mean of values of k-nearest neighbors using the ``strategy`` hyper-parameter. The median is a more robust estimator for data with high magnitude variables which could dominate results (otherwise known as a 'long tail'). Script output:: - Score with the entire dataset = 0.56 - Score without the samples containing missing values = 0.48 - Score after imputation of the missing values = 0.55 + Score with the entire dataset = 0.43 + Score without the samples containing missing values = 0.36 + Score after mean imputation of the missing values = 0.42 + Score after knn imputation with 10 neighbors of the missing values = 0.43 In this case, imputing helps the classifier get close to the original score. - + """ import numpy as np -from sklearn.datasets import load_boston +from sklearn.datasets import load_diabetes from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import Pipeline from sklearn.preprocessing import Imputer @@ -32,7 +33,7 @@ rng = np.random.RandomState(0) -dataset = load_boston() +dataset = load_diabetes() X_full, y_full = dataset.data, dataset.target n_samples = X_full.shape[0] n_features = X_full.shape[1] @@ -42,15 +43,18 @@ score = cross_val_score(estimator, X_full, y_full).mean() print("Score with the entire dataset = %.2f" % score) -# Add missing values in 75% of the lines -missing_rate = 0.75 +# Add missing values in 60% of the lines +missing_rate = 0.60 # 60% of samples have missing value +missing2_rate = 0.80 # 80% of samples with missing value have 2 missing features n_missing_samples = np.floor(n_samples * missing_rate) +n_missing2_samples = np.floor(n_samples * missing_rate * missing2_rate) missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, dtype=np.bool), np.ones(n_missing_samples, dtype=np.bool))) rng.shuffle(missing_samples) missing_features = rng.randint(0, n_features, n_missing_samples) +missing2_features = rng.randint(0, n_features, n_missing2_samples) # Estimate the score without the lines containing missing values X_filtered = X_full[~missing_samples, :] @@ -59,14 +63,25 @@ score = cross_val_score(estimator, X_filtered, y_filtered).mean() print("Score without the samples containing missing values = %.2f" % score) -# Estimate the score after imputation of the missing values +# Estimate the score after mean imputation of the missing values +missing_index = np.where(missing_samples)[0] +missing2_index = np.random.choice(missing_index, n_missing2_samples) X_missing = X_full.copy() -X_missing[np.where(missing_samples)[0], missing_features] = 0 +X_missing[np.where(missing_samples)[0], missing_features] = np.nan +X_missing[missing2_index, missing2_features] = np.nan y_missing = y_full.copy() -estimator = Pipeline([("imputer", Imputer(missing_values=0, - strategy="mean", +estimator = Pipeline([("imputer", Imputer(strategy="mean", axis=0)), ("forest", RandomForestRegressor(random_state=0, n_estimators=100))]) score = cross_val_score(estimator, X_missing, y_missing).mean() -print("Score after imputation of the missing values = %.2f" % score) +print("Score after mean imputation of the missing values = %.2f" % score) + +# Estimate the score after knn imputation of the missing values +neigh = 7 +estimator2 = Pipeline([("imputer", Imputer(strategy="knn", + axis=0, n_neighbors=neigh)), + ("forest", RandomForestRegressor(random_state=0, + n_estimators=100))]) +score = cross_val_score(estimator2, X_missing, y_missing).mean() +print("Score after knn imputation with %d neighbors of the missing values = %.2f" % (neigh, score)) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index cfcaef271c05c..90159dfb11215 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -106,7 +106,7 @@ class Imputer(BaseEstimator, TransformerMixin): - If `axis=0` and X is encoded as a CSR matrix; - If `axis=1` and X is encoded as a CSC matrix. - kneighbor : int, optional (default=1) + n_neighbors : int, optional (default=1) It only has effect if the strategy is "knn". It controls the number of nearest neighbors used to compute the mean along the axis. @@ -124,13 +124,13 @@ class Imputer(BaseEstimator, TransformerMixin): contain missing values). """ def __init__(self, missing_values="NaN", strategy="mean", - axis=0, verbose=0, copy=True, kneighbor=1): + axis=0, verbose=0, copy=True, n_neighbors=1): self.missing_values = missing_values self.strategy = strategy self.axis = axis self.verbose = verbose self.copy = copy - self.kneighbor = kneighbor + self.n_neighbors = n_neighbors def fit(self, X, y=None): """Fit the imputer on X. @@ -258,7 +258,7 @@ def _sparse_fit(self, X, strategy, missing_values, axis): return most_frequent elif strategy == "knn": - raise ValueError("Sparse matrix not supported!") + raise ValueError("strategy='knn' does not support sparse matrix input") def _dense_fit(self, X, strategy, missing_values, axis): @@ -320,9 +320,9 @@ def _dense_fit(self, X, strategy, missing_values, axis): full_data = X[np.logical_not(mask.any(1))] if full_data.size == 0: - raise ValueError("There is no row with complete data!") - if full_data.shape[0] < self.kneighbor: - raise ValueError("There are at most %d neighbors!" %(full_data.shape[0])) + raise ValueError("There is no sample with complete data.") + if full_data.shape[0] < self.n_neighbors: + raise ValueError("There are only %d complete samples, but n_neighbors=%d." %(full_data.shape[0], self.n_neighbors)) if axis == 1: full_data = full_data.transpose() @@ -404,18 +404,29 @@ def transform(self, X): mask = mask.transpose() statistics = statistics.transpose() missing_index = np.where(mask.any(1))[0] - for i, row in zip(missing_index, X[missing_index]): - col_index = np.where(np.logical_not(np.isnan(row)))[0] - impute_index = np.where(np.isnan(row))[0] - neigh = NearestNeighbors(self.kneighbor) - neigh = neigh.fit(statistics[:, col_index]) - _dist, ind = neigh.kneighbors(row[np.logical_not(np.isnan(row))], - self.kneighbor) - #tree = KDTree(statistics[:, col_index]) - #dist, ind = tree.query(row[np.logical_not(np.isnan(row))], - # k=self.kneighbor) - nn_index = ind[0] - X[i][impute_index] = statistics[nn_index][:, impute_index].mean(0) + if True: + for i, row in zip(missing_index, X[missing_index]): + col_na_mask = np.isnan(row) + col_full_mask = np.logical_not(col_na_mask) + col_index = np.where(col_full_mask)[0] + impute_index = np.where(col_na_mask)[0] + neigh = NearestNeighbors(self.n_neighbors) + neigh = neigh.fit(statistics[:, col_index]) + _dist, ind = neigh.kneighbors(row[col_full_mask], + self.n_neighbors) + nn_index = ind[0] + X[i][impute_index] = statistics[nn_index][:, impute_index].mean(0) + else: + + #@jnothman 's method + + D2 = (X[missing_index, np.newaxis] - statistics) ** 2 + D2[np.isnan(D2)] = 0 + missing_row, missing_col = np.where(np.isnan(X)) + sqdist = D2.sum(axis=2) + ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors] + means = np.mean(statistics[ind], axis=1) + X[missing_row, missing_col] = means[np.where(np.isnan(X[missing_index]))[0], missing_col] if self.axis == 1: X = X.transpose() diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index fbd228367bbab..e79c4e1375bb7 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -16,7 +16,7 @@ def _check_statistics(X, X_true, - strategy, statistics, missing_values, kneighbor=1): + strategy, statistics, missing_values): """Utility function for testing imputation for a given strategy. Test: @@ -370,6 +370,13 @@ def test_imputation_knn(): [-1, 2, 3, np.nan], ]) + X4 = np.array([ + [np.nan, -1, 0, 5], + [np.nan, 2, -1, 3], + [-1, -1, 0, 5], + [0, 2, -1, 6], + ]) + X_true_1 = np.array([ [-1, -1, 0, 5], [0, 2, -1, 3], @@ -384,28 +391,39 @@ def test_imputation_knn(): [-1, 2, 3, 7], ]) + X_true_4 = np.array([ + [-1, -1, 0, 5], + [0, 2, -1, 3], + [-1, -1, 0, 5], + [0, 2, -1, 6], + ]) - imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=1) + imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=1) X_impute = imputer.fit(X).transform(X) assert_array_equal(X_true_1, X_impute) - imputer = Imputer(missing_values='NaN', strategy="knn", axis=1, kneighbor=1) + imputer = Imputer(missing_values='NaN', strategy="knn", axis=1, n_neighbors=1) X_impute = imputer.fit(X.transpose()).transform(X.transpose()) assert_array_equal(X_true_1.transpose(), X_impute) - imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=2) + imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=2) X_impute = imputer.fit(X).transform(X) assert_array_equal(X_true_2, X_impute) - imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=1) + imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=1) X_impute = imputer.fit(X2).transform(X2) assert_array_equal(X_true_1, X_impute) + imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=1) + X_impute = imputer.fit(X4).transform(X4) + assert_array_equal(X_true_4, X_impute) + imputer = Imputer(missing_values='NaN', strategy="knn", axis=0) - msg = "There is no row with complete data!" + msg = "There is no sample with complete data." assert_raise_message(ValueError, msg, imputer.fit, X3) - imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, kneighbor=4) - msg = "There are at most 3 neighbors!" + imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=4) + msg = "There are only 3 complete samples, but n_neighbors=4." assert_raise_message(ValueError, msg, imputer.fit, X) + From 160809785977354f941a63cf7db72d6dfd5f35ae Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Thu, 11 Jun 2015 14:46:31 -0400 Subject: [PATCH 03/17] change to block query --- sklearn/preprocessing/imputation.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 90159dfb11215..1cbaade0d3e3b 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -15,7 +15,7 @@ from ..utils.fixes import astype from ..utils.sparsefuncs import _get_median from ..utils.validation import check_is_fitted - +from ..utils import gen_batches from ..externals import six zip = six.moves.zip @@ -404,7 +404,7 @@ def transform(self, X): mask = mask.transpose() statistics = statistics.transpose() missing_index = np.where(mask.any(1))[0] - if True: + if False: for i, row in zip(missing_index, X[missing_index]): col_na_mask = np.isnan(row) col_full_mask = np.logical_not(col_na_mask) @@ -419,14 +419,19 @@ def transform(self, X): else: #@jnothman 's method - - D2 = (X[missing_index, np.newaxis] - statistics) ** 2 - D2[np.isnan(D2)] = 0 - missing_row, missing_col = np.where(np.isnan(X)) - sqdist = D2.sum(axis=2) - ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors] - means = np.mean(statistics[ind], axis=1) - X[missing_row, missing_col] = means[np.where(np.isnan(X[missing_index]))[0], missing_col] + for sl in list(gen_batches(len(missing_index),100)): + index_start, index_stop = missing_index[sl][0],missing_index[sl][-1]+1 + X_sl = X[index_start: index_stop].copy() + mask_sl = _get_mask(X_sl, self.missing_values) + missing_index_sl = np.where(mask_sl.any(1))[0] + D2 = (X_sl[missing_index_sl, np.newaxis] - statistics) ** 2 + D2[np.isnan(D2)] = 0 + missing_row, missing_col = np.where(np.isnan(X_sl)) + sqdist = D2.sum(axis=2) + ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors] + means = np.mean(statistics[ind], axis=1) + X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl[missing_index_sl]))[0], missing_col] + X[index_start: index_stop] = X_sl if self.axis == 1: X = X.transpose() From 4ff1dd7e20f9f8c699706bc542999e7672cf349b Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Mon, 15 Jun 2015 16:11:27 -0400 Subject: [PATCH 04/17] fix doc fix numpy compatibility;TODO groupby missing features, speed comparision, examples --- doc/modules/preprocessing.rst | 8 +++---- sklearn/preprocessing/imputation.py | 21 ++++++++++--------- .../preprocessing/tests/test_imputation.py | 2 -- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index a25fd9fb49b3b..cc0cf59dd5f2b 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -399,8 +399,8 @@ that contain the missing values:: >>> import numpy as np >>> from sklearn.preprocessing import Imputer >>> imp = Imputer(missing_values='NaN', strategy='mean', axis=0) - >>> imp.fit([[1, 2], [np.nan, 3], [7, 6]]) - Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0) + >>> imp.fit([[1, 2], [np.nan, 3], [7, 6]]) # doctest: +NORMALIZE_WHITESPACE + Imputer(axis=0, copy=True, missing_values='NaN', n_neighbors=1, strategy='mean', verbose=0) >>> X = [[np.nan, 2], [6, np.nan], [7, 6]] >>> print(imp.transform(X)) # doctest: +ELLIPSIS [[ 4. 2. ] @@ -412,8 +412,8 @@ The :class:`Imputer` class also supports sparse matrices:: >>> import scipy.sparse as sp >>> X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]]) >>> imp = Imputer(missing_values=0, strategy='mean', axis=0) - >>> imp.fit(X) - Imputer(axis=0, copy=True, missing_values=0, strategy='mean', verbose=0) + >>> imp.fit(X) # doctest: +NORMALIZE_WHITESPACE + Imputer(axis=0, copy=True, missing_values=0, n_neighbors=1, strategy='mean', verbose=0) >>> X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]]) >>> print(imp.transform(X_test)) # doctest: +ELLIPSIS [[ 4. 2. ] diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 1cbaade0d3e3b..7f8629a45c551 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -8,7 +8,7 @@ from scipy import sparse from scipy import stats -from ..neighbors import KDTree, NearestNeighbors +from ..neighbors import NearestNeighbors from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils import as_float_array @@ -322,7 +322,8 @@ def _dense_fit(self, X, strategy, missing_values, axis): if full_data.size == 0: raise ValueError("There is no sample with complete data.") if full_data.shape[0] < self.n_neighbors: - raise ValueError("There are only %d complete samples, but n_neighbors=%d." %(full_data.shape[0], self.n_neighbors)) + raise ValueError("There are only %d complete samples, but n_neighbors=%d." + % (full_data.shape[0], self.n_neighbors)) if axis == 1: full_data = full_data.transpose() @@ -413,25 +414,25 @@ def transform(self, X): neigh = NearestNeighbors(self.n_neighbors) neigh = neigh.fit(statistics[:, col_index]) _dist, ind = neigh.kneighbors(row[col_full_mask], - self.n_neighbors) + self.n_neighbors) nn_index = ind[0] X[i][impute_index] = statistics[nn_index][:, impute_index].mean(0) - else: + elif True: #@jnothman 's method - for sl in list(gen_batches(len(missing_index),100)): - index_start, index_stop = missing_index[sl][0],missing_index[sl][-1]+1 - X_sl = X[index_start: index_stop].copy() + for sl in list(gen_batches(len(missing_index), 100)): + index_start, index_stop = missing_index[sl][0], missing_index[sl][-1]+1 + X_sl = X[index_start: index_stop] mask_sl = _get_mask(X_sl, self.missing_values) missing_index_sl = np.where(mask_sl.any(1))[0] - D2 = (X_sl[missing_index_sl, np.newaxis] - statistics) ** 2 + D2 = (X_sl[missing_index_sl, np.newaxis, :] - statistics) ** 2 D2[np.isnan(D2)] = 0 missing_row, missing_col = np.where(np.isnan(X_sl)) sqdist = D2.sum(axis=2) ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors] means = np.mean(statistics[ind], axis=1) - X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl[missing_index_sl]))[0], missing_col] - X[index_start: index_stop] = X_sl + X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl[missing_index_sl]))[0], + missing_col] if self.axis == 1: X = X.transpose() diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index e79c4e1375bb7..e66d12e16b58d 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -425,5 +425,3 @@ def test_imputation_knn(): imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=4) msg = "There are only 3 complete samples, but n_neighbors=4." assert_raise_message(ValueError, msg, imputer.fit, X) - - From 0b2cdf7e2d5bbb15547b37070f55c8914f730a32 Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Mon, 15 Jun 2015 16:37:50 -0400 Subject: [PATCH 05/17] fix numpy compatibility again --- sklearn/preprocessing/imputation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 7f8629a45c551..2dd6df3cab39b 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -425,7 +425,7 @@ def transform(self, X): X_sl = X[index_start: index_stop] mask_sl = _get_mask(X_sl, self.missing_values) missing_index_sl = np.where(mask_sl.any(1))[0] - D2 = (X_sl[missing_index_sl, np.newaxis, :] - statistics) ** 2 + D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2 D2[np.isnan(D2)] = 0 missing_row, missing_col = np.where(np.isnan(X_sl)) sqdist = D2.sum(axis=2) From 1b731cf0b4ba5f806f127c8a27c175c05045a720 Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Tue, 16 Jun 2015 11:09:10 -0400 Subject: [PATCH 06/17] groupby missing feature, fix circular import --- sklearn/preprocessing/imputation.py | 41 ++++++++++++++++++----------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 2dd6df3cab39b..43700907907bc 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -2,13 +2,12 @@ # License: BSD 3 clause import warnings - +import itertools import numpy as np import numpy.ma as ma from scipy import sparse from scipy import stats -from ..neighbors import NearestNeighbors from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils import as_float_array @@ -404,21 +403,8 @@ def transform(self, X): X = X.transpose() mask = mask.transpose() statistics = statistics.transpose() - missing_index = np.where(mask.any(1))[0] if False: - for i, row in zip(missing_index, X[missing_index]): - col_na_mask = np.isnan(row) - col_full_mask = np.logical_not(col_na_mask) - col_index = np.where(col_full_mask)[0] - impute_index = np.where(col_na_mask)[0] - neigh = NearestNeighbors(self.n_neighbors) - neigh = neigh.fit(statistics[:, col_index]) - _dist, ind = neigh.kneighbors(row[col_full_mask], - self.n_neighbors) - nn_index = ind[0] - X[i][impute_index] = statistics[nn_index][:, impute_index].mean(0) - elif True: - + missing_index = np.where(mask.any(1))[0] #@jnothman 's method for sl in list(gen_batches(len(missing_index), 100)): index_start, index_stop = missing_index[sl][0], missing_index[sl][-1]+1 @@ -433,6 +419,29 @@ def transform(self, X): means = np.mean(statistics[ind], axis=1) X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl[missing_index_sl]))[0], missing_col] + else: + # group by missing features and batch within group + group_index = np.unique(mask.astype('u1').view((np.void, X.shape[1])), return_inverse=True)[1] + for group_number in range(max(group_index)+1): + if group_number == 0: + continue + else: + missing_index = np.where(group_index == group_number)[0] + batch_slice = list(gen_batches(len(missing_index), 100)) + for sl in batch_slice: + index_sl = missing_index[sl] + X_sl = X[index_sl] + D2 = (X_sl[:][:, np.newaxis, :] - statistics) ** 2 + D2[np.isnan(D2)] = 0 + missing_row, missing_col = np.where(np.isnan(X_sl)) + sqdist = D2.sum(axis=2) + ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors] + means = np.mean(statistics[ind], axis=1) + X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0], + missing_col] + X[index_sl] = X_sl + + if self.axis == 1: X = X.transpose() From deb8c80986d4c3c417b04fd9996d09f3e43d78d1 Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Tue, 16 Jun 2015 14:31:34 -0400 Subject: [PATCH 07/17] choose batchsize --- sklearn/preprocessing/imputation.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 43700907907bc..c90d1fa7f5cd4 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -2,7 +2,6 @@ # License: BSD 3 clause import warnings -import itertools import numpy as np import numpy.ma as ma from scipy import sparse @@ -403,15 +402,22 @@ def transform(self, X): X = X.transpose() mask = mask.transpose() statistics = statistics.transpose() + + batch_size = 10 # set batch size for block query if False: missing_index = np.where(mask.any(1))[0] #@jnothman 's method - for sl in list(gen_batches(len(missing_index), 100)): + for sl in list(gen_batches(len(missing_index), batch_size)): index_start, index_stop = missing_index[sl][0], missing_index[sl][-1]+1 X_sl = X[index_start: index_stop] mask_sl = _get_mask(X_sl, self.missing_values) missing_index_sl = np.where(mask_sl.any(1))[0] - D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2 + t1 = time() + fancy_index = X_sl[missing_index_sl][:, np.newaxis, :] + D2 = np.square(fancy_index - statistics) + #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2 + t2 = time() + time_1 = time_1 + (t2-t1) D2[np.isnan(D2)] = 0 missing_row, missing_col = np.where(np.isnan(X_sl)) sqdist = D2.sum(axis=2) @@ -427,7 +433,7 @@ def transform(self, X): continue else: missing_index = np.where(group_index == group_number)[0] - batch_slice = list(gen_batches(len(missing_index), 100)) + batch_slice = list(gen_batches(len(missing_index), batch_size)) for sl in batch_slice: index_sl = missing_index[sl] X_sl = X[index_sl] @@ -456,4 +462,5 @@ def transform(self, X): X[coordinates] = values - return X \ No newline at end of file + return X + From 430a2a0d93006d0661b350ca01821be051fd6c55 Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Tue, 16 Jun 2015 14:36:13 -0400 Subject: [PATCH 08/17] delete test code --- sklearn/preprocessing/imputation.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index c90d1fa7f5cd4..5377f9bdc198b 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -412,12 +412,7 @@ def transform(self, X): X_sl = X[index_start: index_stop] mask_sl = _get_mask(X_sl, self.missing_values) missing_index_sl = np.where(mask_sl.any(1))[0] - t1 = time() - fancy_index = X_sl[missing_index_sl][:, np.newaxis, :] - D2 = np.square(fancy_index - statistics) - #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2 - t2 = time() - time_1 = time_1 + (t2-t1) + D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2 D2[np.isnan(D2)] = 0 missing_row, missing_col = np.where(np.isnan(X_sl)) sqdist = D2.sum(axis=2) From 85039ac769fd39bd351cd1e7344dcf3282a7188c Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Tue, 16 Jun 2015 17:32:08 -0400 Subject: [PATCH 09/17] change example/missing_value; using diabetes dataset, and use random matrix to create na. add comparision between knn and mean imputation --- examples/missing_values.py | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/examples/missing_values.py b/examples/missing_values.py index 8952590e7dcfb..651b51b3f4399 100644 --- a/examples/missing_values.py +++ b/examples/missing_values.py @@ -18,7 +18,7 @@ Score with the entire dataset = 0.43 Score without the samples containing missing values = 0.36 Score after mean imputation of the missing values = 0.42 - Score after knn imputation with 10 neighbors of the missing values = 0.43 + Score after knn imputation with 7 neighbors of the missing values = 0.43 In this case, imputing helps the classifier get close to the original score. @@ -38,23 +38,19 @@ n_samples = X_full.shape[0] n_features = X_full.shape[1] +#Create a random matrix to randomly make missing values +missing_matrix = np.random.rand(n_samples, n_features) +th = 0.15 # each sample has (1-th)^n_features of probability to have full features +mask = missing_matrix < th +missing_samples = mask.any(1) +full_percentage = (n_samples - missing_samples.sum())/float(n_samples) +print("Percentage of samples with full features: %f" %full_percentage ) + # Estimate the score on the entire dataset, with no missing values estimator = RandomForestRegressor(random_state=0, n_estimators=100) score = cross_val_score(estimator, X_full, y_full).mean() print("Score with the entire dataset = %.2f" % score) -# Add missing values in 60% of the lines -missing_rate = 0.60 # 60% of samples have missing value -missing2_rate = 0.80 # 80% of samples with missing value have 2 missing features -n_missing_samples = np.floor(n_samples * missing_rate) -n_missing2_samples = np.floor(n_samples * missing_rate * missing2_rate) -missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, - dtype=np.bool), - np.ones(n_missing_samples, - dtype=np.bool))) -rng.shuffle(missing_samples) -missing_features = rng.randint(0, n_features, n_missing_samples) -missing2_features = rng.randint(0, n_features, n_missing2_samples) # Estimate the score without the lines containing missing values X_filtered = X_full[~missing_samples, :] @@ -64,12 +60,11 @@ print("Score without the samples containing missing values = %.2f" % score) # Estimate the score after mean imputation of the missing values -missing_index = np.where(missing_samples)[0] -missing2_index = np.random.choice(missing_index, n_missing2_samples) + X_missing = X_full.copy() -X_missing[np.where(missing_samples)[0], missing_features] = np.nan -X_missing[missing2_index, missing2_features] = np.nan +X_missing[mask] = np.nan y_missing = y_full.copy() + estimator = Pipeline([("imputer", Imputer(strategy="mean", axis=0)), ("forest", RandomForestRegressor(random_state=0, From 998fba610b162b7320f03241d8aa003dc59f20ef Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Wed, 17 Jun 2015 12:27:54 -0400 Subject: [PATCH 10/17] avoid _get_mask for each iteration; add axis kw for np.any --- sklearn/preprocessing/imputation.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 5377f9bdc198b..8e4b575d4bb76 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -105,13 +105,14 @@ class Imputer(BaseEstimator, TransformerMixin): - If `axis=1` and X is encoded as a CSC matrix. n_neighbors : int, optional (default=1) - It only has effect if the strategy is "knn". It controls the number of nearest + It only has effect if the `strategy=knn`. It controls the number of nearest neighbors used to compute the mean along the axis. Attributes ---------- statistics_ : array of shape (n_features,) The imputation fill value for each feature if axis == 0. + If `strategy=knn`, then it contains those samples having no missing value. Notes ----- @@ -120,6 +121,7 @@ class Imputer(BaseEstimator, TransformerMixin): - When ``axis=1``, an exception is raised if there are rows for which it is not possible to fill in the missing values (e.g., because they only contain missing values). + - Knn strategy currently doesn't support sparse matrix. """ def __init__(self, missing_values="NaN", strategy="mean", axis=0, verbose=0, copy=True, n_neighbors=1): @@ -316,7 +318,7 @@ def _dense_fit(self, X, strategy, missing_values, axis): if axis == 1: X = X.copy().transpose() - full_data = X[np.logical_not(mask.any(1))] + full_data = X[np.logical_not(mask.any(axis=1))] if full_data.size == 0: raise ValueError("There is no sample with complete data.") if full_data.shape[0] < self.n_neighbors: @@ -405,21 +407,21 @@ def transform(self, X): batch_size = 10 # set batch size for block query if False: - missing_index = np.where(mask.any(1))[0] + missing_index = np.where(mask.any(axis=1))[0] #@jnothman 's method for sl in list(gen_batches(len(missing_index), batch_size)): - index_start, index_stop = missing_index[sl][0], missing_index[sl][-1]+1 - X_sl = X[index_start: index_stop] - mask_sl = _get_mask(X_sl, self.missing_values) - missing_index_sl = np.where(mask_sl.any(1))[0] - D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2 + X_sl = X[missing_index[sl]] + test1 = X_sl[:][:, np.newaxis, :] - statistics + D2 = test1 ** 2 + #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2 D2[np.isnan(D2)] = 0 missing_row, missing_col = np.where(np.isnan(X_sl)) sqdist = D2.sum(axis=2) ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors] means = np.mean(statistics[ind], axis=1) - X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl[missing_index_sl]))[0], + X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0], missing_col] + X[missing_index[sl]] = X_sl else: # group by missing features and batch within group group_index = np.unique(mask.astype('u1').view((np.void, X.shape[1])), return_inverse=True)[1] From a64ee12816605878d2aa76af2a62a64d29e9e3c6 Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Wed, 19 Aug 2015 15:02:57 -0400 Subject: [PATCH 11/17] preallocate output array --- sklearn/preprocessing/imputation.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 8e4b575d4bb76..96b2f452b1074 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -405,14 +405,19 @@ def transform(self, X): mask = mask.transpose() statistics = statistics.transpose() - batch_size = 10 # set batch size for block query - if False: + batch_size = 20 # set batch size for block query + if True: missing_index = np.where(mask.any(axis=1))[0] #@jnothman 's method + D2 = np.empty_like(np.zeros([batch_size, statistics.shape[0], statistics.shape[1]])) for sl in list(gen_batches(len(missing_index), batch_size)): X_sl = X[missing_index[sl]] test1 = X_sl[:][:, np.newaxis, :] - statistics - D2 = test1 ** 2 + #D2 = np.empty_like(test1) + if test1.shape != D2.shape: + D2 = np.empty_like(test1) + np.multiply(test1, test1, out=D2) + #D2 = test1 ** 2 #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2 D2[np.isnan(D2)] = 0 missing_row, missing_col = np.where(np.isnan(X_sl)) @@ -434,7 +439,10 @@ def transform(self, X): for sl in batch_slice: index_sl = missing_index[sl] X_sl = X[index_sl] - D2 = (X_sl[:][:, np.newaxis, :] - statistics) ** 2 + test1 = X_sl[:][:, np.newaxis, :] - statistics + D2 = np.empty_like(test1) + np.multiply(test1, test1, out=D2) + #D2 = test1 ** 2 D2[np.isnan(D2)] = 0 missing_row, missing_col = np.where(np.isnan(X_sl)) sqdist = D2.sum(axis=2) From eacf3e8d6156199be68b6e2ae14cdd4776ebff1e Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Fri, 21 Aug 2015 13:50:22 -0400 Subject: [PATCH 12/17] add documentation --- doc/modules/preprocessing.rst | 7 +++--- examples/missing_values.py | 18 +++++++++----- sklearn/preprocessing/imputation.py | 37 +++++++++++++++++++---------- 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index cc0cf59dd5f2b..757e0d03d9148 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -388,9 +388,10 @@ values. However, this comes at the price of losing data which may be valuable i.e., to infer them from the known part of the data. The :class:`Imputer` class provides basic strategies for imputing missing -values, either using the mean, the median or the most frequent value of -the row or column in which the missing values are located. This class -also allows for different missing values encodings. +values, either using the mean, the median, the most frequent value of +the row or column in which the missing values are located or the mean of the +k-nearest neighbors computed using samples without missing values. This class also +allows for different missing values encodings. The following snippet demonstrates how to replace missing values, encoded as ``np.nan``, using the mean value of the columns (axis 0) diff --git a/examples/missing_values.py b/examples/missing_values.py index 651b51b3f4399..57411b5d2de5a 100644 --- a/examples/missing_values.py +++ b/examples/missing_values.py @@ -16,7 +16,7 @@ Script output:: Score with the entire dataset = 0.43 - Score without the samples containing missing values = 0.36 + Score without the samples containing missing values = 0.35 Score after mean imputation of the missing values = 0.42 Score after knn imputation with 7 neighbors of the missing values = 0.43 @@ -39,20 +39,24 @@ n_features = X_full.shape[1] #Create a random matrix to randomly make missing values -missing_matrix = np.random.rand(n_samples, n_features) -th = 0.15 # each sample has (1-th)^n_features of probability to have full features +missing_matrix = rng.rand(n_samples, n_features) + +# each sample has (1-th)^n_features of probability to have full features +th = 0.14 mask = missing_matrix < th -missing_samples = mask.any(1) +missing_samples = mask.any(axis=1) full_percentage = (n_samples - missing_samples.sum())/float(n_samples) print("Percentage of samples with full features: %f" %full_percentage ) # Estimate the score on the entire dataset, with no missing values + estimator = RandomForestRegressor(random_state=0, n_estimators=100) score = cross_val_score(estimator, X_full, y_full).mean() print("Score with the entire dataset = %.2f" % score) # Estimate the score without the lines containing missing values + X_filtered = X_full[~missing_samples, :] y_filtered = y_full[~missing_samples] estimator = RandomForestRegressor(random_state=0, n_estimators=100) @@ -73,10 +77,12 @@ print("Score after mean imputation of the missing values = %.2f" % score) # Estimate the score after knn imputation of the missing values -neigh = 7 + +neigh = 7 # Number of neighbors to be used estimator2 = Pipeline([("imputer", Imputer(strategy="knn", axis=0, n_neighbors=neigh)), ("forest", RandomForestRegressor(random_state=0, n_estimators=100))]) score = cross_val_score(estimator2, X_missing, y_missing).mean() -print("Score after knn imputation with %d neighbors of the missing values = %.2f" % (neigh, score)) +print("Score after knn imputation with %d neighbors of the missing values =" + " %.2f" % (neigh, score)) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 96b2f452b1074..bc07071c48501 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -82,8 +82,8 @@ class Imputer(BaseEstimator, TransformerMixin): the axis. - If "most_frequent", then replace missing using the most frequent value along the axis. - - If "knn", then replace missing using the mean of the k-nearest neighbors - along the axis. + - If "knn", then replace missing using the mean of the k-nearest + neighbors along the axis. axis : integer, optional (default=0) The axis along which to impute. @@ -105,8 +105,8 @@ class Imputer(BaseEstimator, TransformerMixin): - If `axis=1` and X is encoded as a CSC matrix. n_neighbors : int, optional (default=1) - It only has effect if the `strategy=knn`. It controls the number of nearest - neighbors used to compute the mean along the axis. + It only has effect if the `strategy=knn`. It controls the number of + nearest neighbors used to compute the mean along the axis. Attributes ---------- @@ -257,8 +257,10 @@ def _sparse_fit(self, X, strategy, missing_values, axis): return most_frequent + # KNN elif strategy == "knn": - raise ValueError("strategy='knn' does not support sparse matrix input") + raise ValueError("strategy='knn' does not support sparse " + "matrix input") def _dense_fit(self, X, strategy, missing_values, axis): @@ -316,14 +318,18 @@ def _dense_fit(self, X, strategy, missing_values, axis): elif strategy == "knn": if axis == 1: - X = X.copy().transpose() + X = X.transpose() + mask = mask.transpose() + # Get samples with complete features full_data = X[np.logical_not(mask.any(axis=1))] if full_data.size == 0: raise ValueError("There is no sample with complete data.") if full_data.shape[0] < self.n_neighbors: - raise ValueError("There are only %d complete samples, but n_neighbors=%d." + raise ValueError("There are only %d complete samples, " + "but n_neighbors=%d." % (full_data.shape[0], self.n_neighbors)) + # Transpose back if axis == 1: full_data = full_data.transpose() @@ -405,20 +411,26 @@ def transform(self, X): mask = mask.transpose() statistics = statistics.transpose() - batch_size = 20 # set batch size for block query + batch_size = 200 # set batch size for block query if True: missing_index = np.where(mask.any(axis=1))[0] - #@jnothman 's method - D2 = np.empty_like(np.zeros([batch_size, statistics.shape[0], statistics.shape[1]])) + # @jnothman 's method + D2 = np.empty_like(np.zeros([batch_size, statistics.shape[0], + statistics.shape[1]])) + # Preallocate output array for np.multiply(test1, test1, out=D2) for sl in list(gen_batches(len(missing_index), batch_size)): X_sl = X[missing_index[sl]] test1 = X_sl[:][:, np.newaxis, :] - statistics #D2 = np.empty_like(test1) + + # For the last slice, the length may not be the same + # as batch_size if test1.shape != D2.shape: D2 = np.empty_like(test1) np.multiply(test1, test1, out=D2) #D2 = test1 ** 2 - #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) ** 2 + #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) + # ** 2 D2[np.isnan(D2)] = 0 missing_row, missing_col = np.where(np.isnan(X_sl)) sqdist = D2.sum(axis=2) @@ -429,7 +441,8 @@ def transform(self, X): X[missing_index[sl]] = X_sl else: # group by missing features and batch within group - group_index = np.unique(mask.astype('u1').view((np.void, X.shape[1])), return_inverse=True)[1] + group_index = np.unique(mask.astype('u1').view((np.void, X.shape[1])), + return_inverse=True)[1] for group_number in range(max(group_index)+1): if group_number == 0: continue From c442bbb1b12b42f0dc5373dbd11f79b2dbed426f Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Mon, 24 Aug 2015 11:43:56 -0400 Subject: [PATCH 13/17] fix not nan imputation --- sklearn/preprocessing/imputation.py | 4 +++ .../preprocessing/tests/test_imputation.py | 28 +++++++++++++++---- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index bc07071c48501..b7fa2c4473a7f 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -420,6 +420,8 @@ def transform(self, X): # Preallocate output array for np.multiply(test1, test1, out=D2) for sl in list(gen_batches(len(missing_index), batch_size)): X_sl = X[missing_index[sl]] + mask_sl = mask[missing_index[sl]] + X_sl[mask_sl] = np.nan test1 = X_sl[:][:, np.newaxis, :] - statistics #D2 = np.empty_like(test1) @@ -452,6 +454,8 @@ def transform(self, X): for sl in batch_slice: index_sl = missing_index[sl] X_sl = X[index_sl] + mask_sl = mask[missing_index[sl]] + X_sl[mask_sl] = np.nan test1 = X_sl[:][:, np.newaxis, :] - statistics D2 = np.empty_like(test1) np.multiply(test1, test1, out=D2) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index e66d12e16b58d..e907e20d695bc 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -377,6 +377,13 @@ def test_imputation_knn(): [0, 2, -1, 6], ]) + X5 = np.array([ + [999, -1, 0, 5], + [0, 2, -1, 3], + [-1, -1, 0, 5], + [-1, 2, 3, 7], + ]) + X_true_1 = np.array([ [-1, -1, 0, 5], [0, 2, -1, 3], @@ -398,26 +405,37 @@ def test_imputation_knn(): [0, 2, -1, 6], ]) - imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=1) + imputer = Imputer(missing_values='NaN', strategy="knn", + axis=0, n_neighbors=1) X_impute = imputer.fit(X).transform(X) assert_array_equal(X_true_1, X_impute) - imputer = Imputer(missing_values='NaN', strategy="knn", axis=1, n_neighbors=1) + imputer = Imputer(missing_values='NaN', strategy="knn", + axis=1, n_neighbors=1) X_impute = imputer.fit(X.transpose()).transform(X.transpose()) assert_array_equal(X_true_1.transpose(), X_impute) - imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=2) + imputer = Imputer(missing_values='NaN', strategy="knn", + axis=0, n_neighbors=2) X_impute = imputer.fit(X).transform(X) assert_array_equal(X_true_2, X_impute) - imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=1) + imputer = Imputer(missing_values='NaN', strategy="knn", + axis=0, n_neighbors=1) X_impute = imputer.fit(X2).transform(X2) assert_array_equal(X_true_1, X_impute) - imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=1) + imputer = Imputer(missing_values='NaN', strategy="knn", + axis=0, n_neighbors=1) X_impute = imputer.fit(X4).transform(X4) assert_array_equal(X_true_4, X_impute) + imputer = Imputer(missing_values=999, strategy="knn", + axis=0, n_neighbors=1, copy=False) + X5 = X5.astype(float) + X_impute = imputer.fit(X5).transform(X5) + assert_array_equal(X_true_1, X5) + imputer = Imputer(missing_values='NaN', strategy="knn", axis=0) msg = "There is no sample with complete data." assert_raise_message(ValueError, msg, imputer.fit, X3) From 5a2906cd915da4229b04a469df63540db1c2567c Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Mon, 24 Aug 2015 12:08:06 -0400 Subject: [PATCH 14/17] change variable name; add user guide doc --- doc/modules/preprocessing.rst | 3 +++ sklearn/preprocessing/imputation.py | 23 +++++++++++------------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 757e0d03d9148..5b8ef636f874b 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -425,5 +425,8 @@ Note that, here, missing values are encoded by 0 and are thus implicitly stored in the matrix. This format is thus suitable when there are many more missing values than observed values. +Also, knn imputation strategy will use samples with full features, and if all samples +have missing features, this strategy will fail. + :class:`Imputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. See :ref:`example_missing_values.py` diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index b7fa2c4473a7f..6fc7351c847d4 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -422,22 +422,21 @@ def transform(self, X): X_sl = X[missing_index[sl]] mask_sl = mask[missing_index[sl]] X_sl[mask_sl] = np.nan - test1 = X_sl[:][:, np.newaxis, :] - statistics - #D2 = np.empty_like(test1) + impute_dist = X_sl[:][:, np.newaxis, :] - statistics # For the last slice, the length may not be the same # as batch_size - if test1.shape != D2.shape: - D2 = np.empty_like(test1) - np.multiply(test1, test1, out=D2) + if impute_dist.shape != D2.shape: + D2 = np.empty_like(impute_dist) + np.multiply(impute_dist, impute_dist, out=D2) #D2 = test1 ** 2 #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) # ** 2 D2[np.isnan(D2)] = 0 missing_row, missing_col = np.where(np.isnan(X_sl)) sqdist = D2.sum(axis=2) - ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors] - means = np.mean(statistics[ind], axis=1) + target_index = np.argsort(sqdist, axis=1)[:, :self.n_neighbors] + means = np.mean(statistics[target_index], axis=1) X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0], missing_col] X[missing_index[sl]] = X_sl @@ -456,15 +455,15 @@ def transform(self, X): X_sl = X[index_sl] mask_sl = mask[missing_index[sl]] X_sl[mask_sl] = np.nan - test1 = X_sl[:][:, np.newaxis, :] - statistics - D2 = np.empty_like(test1) - np.multiply(test1, test1, out=D2) + impute_dist = X_sl[:][:, np.newaxis, :] - statistics + D2 = np.empty_like(impute_dist) + np.multiply(impute_dist, impute_dist, out=D2) #D2 = test1 ** 2 D2[np.isnan(D2)] = 0 missing_row, missing_col = np.where(np.isnan(X_sl)) sqdist = D2.sum(axis=2) - ind = np.argsort(sqdist, axis=1)[:, :self.n_neighbors] - means = np.mean(statistics[ind], axis=1) + target_index = np.argsort(sqdist, axis=1)[:, :self.n_neighbors] + means = np.mean(statistics[target_index], axis=1) X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0], missing_col] X[index_sl] = X_sl From b7ff8e180038c80feb0f12eaff7f41c33297f269 Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Tue, 25 Aug 2015 11:50:18 -0400 Subject: [PATCH 15/17] clean up comment; remove sort by missing column; remove list for gen_batches --- sklearn/preprocessing/imputation.py | 84 +++++++++-------------------- 1 file changed, 26 insertions(+), 58 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 6fc7351c847d4..b9119ca3abbae 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -411,64 +411,32 @@ def transform(self, X): mask = mask.transpose() statistics = statistics.transpose() - batch_size = 200 # set batch size for block query - if True: - missing_index = np.where(mask.any(axis=1))[0] - # @jnothman 's method - D2 = np.empty_like(np.zeros([batch_size, statistics.shape[0], - statistics.shape[1]])) - # Preallocate output array for np.multiply(test1, test1, out=D2) - for sl in list(gen_batches(len(missing_index), batch_size)): - X_sl = X[missing_index[sl]] - mask_sl = mask[missing_index[sl]] - X_sl[mask_sl] = np.nan - impute_dist = X_sl[:][:, np.newaxis, :] - statistics - - # For the last slice, the length may not be the same - # as batch_size - if impute_dist.shape != D2.shape: - D2 = np.empty_like(impute_dist) - np.multiply(impute_dist, impute_dist, out=D2) - #D2 = test1 ** 2 - #D2 = (X_sl[missing_index_sl][:, np.newaxis, :] - statistics) - # ** 2 - D2[np.isnan(D2)] = 0 - missing_row, missing_col = np.where(np.isnan(X_sl)) - sqdist = D2.sum(axis=2) - target_index = np.argsort(sqdist, axis=1)[:, :self.n_neighbors] - means = np.mean(statistics[target_index], axis=1) - X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0], - missing_col] - X[missing_index[sl]] = X_sl - else: - # group by missing features and batch within group - group_index = np.unique(mask.astype('u1').view((np.void, X.shape[1])), - return_inverse=True)[1] - for group_number in range(max(group_index)+1): - if group_number == 0: - continue - else: - missing_index = np.where(group_index == group_number)[0] - batch_slice = list(gen_batches(len(missing_index), batch_size)) - for sl in batch_slice: - index_sl = missing_index[sl] - X_sl = X[index_sl] - mask_sl = mask[missing_index[sl]] - X_sl[mask_sl] = np.nan - impute_dist = X_sl[:][:, np.newaxis, :] - statistics - D2 = np.empty_like(impute_dist) - np.multiply(impute_dist, impute_dist, out=D2) - #D2 = test1 ** 2 - D2[np.isnan(D2)] = 0 - missing_row, missing_col = np.where(np.isnan(X_sl)) - sqdist = D2.sum(axis=2) - target_index = np.argsort(sqdist, axis=1)[:, :self.n_neighbors] - means = np.mean(statistics[target_index], axis=1) - X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0], - missing_col] - X[index_sl] = X_sl - - + batch_size = 1 # set batch size for block query + missing_index = np.where(mask.any(axis=1))[0] + D2 = np.empty_like(np.zeros([batch_size, statistics.shape[0], + statistics.shape[1]])) + + # Preallocate output array for np.multiply(test1, test1, out=D2) + for sl in gen_batches(len(missing_index), batch_size): + X_sl = X[missing_index[sl]] + mask_sl = mask[missing_index[sl]] + X_sl[mask_sl] = np.nan + impute_dist = X_sl[:][:, np.newaxis, :] - statistics + + # For the last slice, the length may not be the same + # as batch_size + if impute_dist.shape != D2.shape: + D2 = np.empty_like(impute_dist) + + np.multiply(impute_dist, impute_dist, out=D2) + D2[np.isnan(D2)] = 0 + missing_row, missing_col = np.where(np.isnan(X_sl)) + sqdist = D2.sum(axis=2) + target_index = np.argsort(sqdist, axis=1)[:, :self.n_neighbors] + means = np.mean(statistics[target_index], axis=1) + X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0], + missing_col] + X[missing_index[sl]] = X_sl if self.axis == 1: X = X.transpose() From 3776dec99c5910f99f7c2e11fb611270c5c0bee7 Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Wed, 26 Aug 2015 11:23:14 -0400 Subject: [PATCH 16/17] modify documentation --- doc/modules/preprocessing.rst | 11 +++--- sklearn/preprocessing/imputation.py | 18 +++++---- .../preprocessing/tests/test_imputation.py | 37 ++++++++++--------- 3 files changed, 35 insertions(+), 31 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 5b8ef636f874b..bbe65ec15f9ca 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -388,10 +388,10 @@ values. However, this comes at the price of losing data which may be valuable i.e., to infer them from the known part of the data. The :class:`Imputer` class provides basic strategies for imputing missing -values, either using the mean, the median, the most frequent value of -the row or column in which the missing values are located or the mean of the -k-nearest neighbors computed using samples without missing values. This class also -allows for different missing values encodings. +values. It can use the mean, the median, the most frequent value of +the row or column in which the missing values are located. Alternatively it can fill +with the mean of only the k-nearest neighbors computed using samples without missing +values. The placeholder for missing values is configurable. The following snippet demonstrates how to replace missing values, encoded as ``np.nan``, using the mean value of the columns (axis 0) @@ -425,7 +425,8 @@ Note that, here, missing values are encoded by 0 and are thus implicitly stored in the matrix. This format is thus suitable when there are many more missing values than observed values. -Also, knn imputation strategy will use samples with full features, and if all samples +When using ``strategy=knn``, only samples without any missing features will be used for imputation. +If all samples have missing features, this strategy will fail. :class:`Imputer` can be used in a Pipeline as a way to build a composite diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index b9119ca3abbae..ab0a3d1602058 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -83,13 +83,14 @@ class Imputer(BaseEstimator, TransformerMixin): - If "most_frequent", then replace missing using the most frequent value along the axis. - If "knn", then replace missing using the mean of the k-nearest - neighbors along the axis. + neighbors along the axis. Only samples with no missing values are + considered as neighbors. axis : integer, optional (default=0) The axis along which to impute. - - If `axis=0`, then impute along columns. - - If `axis=1`, then impute along rows. + - If ``axis=0``, then impute along columns. + - If ``axis=1``, then impute along rows. verbose : integer, optional (default=0) Controls the verbosity of the imputer. @@ -101,18 +102,18 @@ class Imputer(BaseEstimator, TransformerMixin): - If X is not an array of floating values; - If X is sparse and `missing_values=0`; - - If `axis=0` and X is encoded as a CSR matrix; - - If `axis=1` and X is encoded as a CSC matrix. + - If ``axis=0`` and X is encoded as a CSR matrix; + - If ``axis=1`` and X is encoded as a CSC matrix. n_neighbors : int, optional (default=1) - It only has effect if the `strategy=knn`. It controls the number of - nearest neighbors used to compute the mean along the axis. + Controls the number of nearest neighbors used to compute the mean + along the axis. Only used when ``strategy=knn`` Attributes ---------- statistics_ : array of shape (n_features,) The imputation fill value for each feature if axis == 0. - If `strategy=knn`, then it contains those samples having no missing value. + If ``strategy=knn``, then it contains those samples having no missing value. Notes ----- @@ -412,6 +413,7 @@ def transform(self, X): statistics = statistics.transpose() batch_size = 1 # set batch size for block query + missing_index = np.where(mask.any(axis=1))[0] D2 = np.empty_like(np.zeros([batch_size, statistics.shape[0], statistics.shape[1]])) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index e907e20d695bc..0db13ff950390 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -356,6 +356,13 @@ def test_imputation_knn(): [-1, 2, 3, 7], ]) + X_true_1 = np.array([ + [-1, -1, 0, 5], + [0, 2, -1, 3], + [-1, -1, 0, 5], + [-1, 2, 3, 7], + ]) + X2 = np.array([ [np.nan, -1, 0, np.nan], [0, 2, -1, 3], @@ -363,6 +370,13 @@ def test_imputation_knn(): [-1, 2, 3, 7], ]) + X_true_2 = np.array([ + [-0.5, -1, 0, 5], + [0, 2, -1, 3], + [-1, -1, 0, 5], + [-1, 2, 3, 7], + ]) + X3 = np.array([ [np.nan, -1, 0, 5], [0, np.nan, -1, 3], @@ -377,34 +391,20 @@ def test_imputation_knn(): [0, 2, -1, 6], ]) - X5 = np.array([ - [999, -1, 0, 5], - [0, 2, -1, 3], - [-1, -1, 0, 5], - [-1, 2, 3, 7], - ]) - - X_true_1 = np.array([ + X_true_4 = np.array([ [-1, -1, 0, 5], [0, 2, -1, 3], [-1, -1, 0, 5], - [-1, 2, 3, 7], + [0, 2, -1, 6], ]) - X_true_2 = np.array([ - [-0.5, -1, 0, 5], + X5 = np.array([ + [999, -1, 0, 5], [0, 2, -1, 3], [-1, -1, 0, 5], [-1, 2, 3, 7], ]) - X_true_4 = np.array([ - [-1, -1, 0, 5], - [0, 2, -1, 3], - [-1, -1, 0, 5], - [0, 2, -1, 6], - ]) - imputer = Imputer(missing_values='NaN', strategy="knn", axis=0, n_neighbors=1) X_impute = imputer.fit(X).transform(X) @@ -435,6 +435,7 @@ def test_imputation_knn(): X5 = X5.astype(float) X_impute = imputer.fit(X5).transform(X5) assert_array_equal(X_true_1, X5) + assert_array_equal(X_impute, X5) imputer = Imputer(missing_values='NaN', strategy="knn", axis=0) msg = "There is no sample with complete data." From ef290f3d4fc693eefc1db36fe3bcbc19c65f7815 Mon Sep 17 00:00:00 2001 From: Tian Wang Date: Wed, 26 Aug 2015 11:25:10 -0400 Subject: [PATCH 17/17] modify documentation --- examples/missing_values.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/missing_values.py b/examples/missing_values.py index 57411b5d2de5a..1c208887d94ee 100644 --- a/examples/missing_values.py +++ b/examples/missing_values.py @@ -45,8 +45,8 @@ th = 0.14 mask = missing_matrix < th missing_samples = mask.any(axis=1) -full_percentage = (n_samples - missing_samples.sum())/float(n_samples) -print("Percentage of samples with full features: %f" %full_percentage ) +full_percentage = (n_samples - missing_samples.sum()) / float(n_samples) +print("Percentage of samples with full features: %f" % full_percentage) # Estimate the score on the entire dataset, with no missing values