From 287eb1e8385259bafea68c9c5fa3ab12f61d0d1b Mon Sep 17 00:00:00 2001 From: harke Date: Sat, 24 Jun 2017 03:25:08 -0500 Subject: [PATCH 01/97] Added k-Nearest Neighbor imputation of missing data --- sklearn/preprocessing/imputation.py | 246 +++++++++++++++++++++++++--- 1 file changed, 222 insertions(+), 24 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 12d5425fbf604..7abd0d4fbdd1e 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -59,6 +59,95 @@ def _most_frequent(array, extra_value, n_repeat): else: return extra_value +def _get_knn(X, value_to_mask="NaN", n_neighbors=10, Y=None): + """Returns the k(=n_neighbors) nearest neighbors of vectors in a + given matrix in euclidean space. If two matrices are passed, + then k-Nearest Neighbors of vectors in X in the matrix Y is returned.""" + + # Setup missing mask + mask_X = _get_mask(X, value_to_mask) + + if Y is None: + # Setup the anti-mask and change missing to zero + mask_X = _get_mask(X, value_to_mask) + XT = np.transpose(X) + N = (~mask_X) * 1 + NT = np.transpose(N) + X[mask_X] = 0 + + # Matrix formula to calculate pair-wise distance between all vectors in a + # matrix with missing values. It zero-weights coordinates with missing value + # in either vector in the pair and up-weights the remaining coordinates. 
+ # Matrix formula derived by: Shreya Bhattarai + + """ + Store np.dot(N, (XT * XT)) and add its transpose rather than + redoing a matrix product + dist = np.sqrt((X.shape[1] * 1 / ((np.dot(N, NT)))) * ( + np.dot(N, (XT * XT)) - 2 * (np.dot(X, XT)) + + np.dot((X * X), NT))) + + N_dot_XT2 = np.dot(N, (XT * XT)) + N_dot_XT2_T = np.transpose(N_dot_XT2) + """ + + N_dot_XT2 = np.dot(N, (XT * XT)) + N_dot_XT2_T = np.transpose(N_dot_XT2) + + dist = np.sqrt((X.shape[1] * 1 / ((np.dot(N, NT)))) * ( + N_dot_XT2 - 2 * (np.dot(X, XT)) + + N_dot_XT2_T)) + + # Set distance with self to np.inf + np.fill_diagonal(dist, np.inf) + + else: + # ValueError if X and Y have incompatible dimensions + if X.shape[1] != Y.shape[1]: + raise ValueError("The search dimension of the matrices " + "are not equal: [{0}] versus [{1}]". + format(X.shape[1], Y.shape[1])) + + mask_Y = _get_mask(Y, value_to_mask) + NY = (~mask_Y) * 1 + YT = np.transpose(Y) + mask_YT = _get_mask(YT, value_to_mask) + NYT = np.transpose(NY) + YT[mask_YT] = 0 + + NX = (~mask_X) * 1 + X[mask_X] = 0 + + # Matrix formula to calculate pair-wise distance between all vectors in a + # matrix X to vectors in matrix Y. It handles missing values the same way + # as for a single matrix. + # Matrix formula derived by: Shreya Bhattarai + + dist = np.sqrt((X.shape[1] * 1 / ((np.dot(NX, NYT)))) * + (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + np.dot(NX, (YT * YT)))) + + # Ensure enough candidate neighbors are available + n_candidates = dist.shape[1] if Y is not None else dist.shape[1] - 1 + if n_candidates < n_neighbors: + raise ValueError("There are only %d candidate neighbors, " + "but n_neighbors=%d." 
+ % (dist.shape[1] - 1, n_neighbors)) + + # Missing locations and counts + row_missing_sum_X = mask_X.sum(axis=1) + # is_row_missing_X = np.any(mask_X, axis=1) + # is_col_missing_X = np.any(mask_X, axis=0) + col_missing_index_X = np.where(mask_X)[1] + + # Arg-partition (quasi-argsort) of n_neighbors and retrieve them + nbors_index = np.argpartition(dist, n_neighbors - 1, axis=1) + knn_row_index = nbors_index[:, :n_neighbors] + knn_row_index = np.vsplit(knn_row_index, knn_row_index.shape[0]) + knn_row_index = np.repeat(knn_row_index, row_missing_sum_X, axis=0) + knn_row_index = knn_row_index.ravel() + # This assumes columns in X and Y are in the same order; maybe change this? + knn_col_index = np.repeat(col_missing_index_X, n_neighbors) + return knn_row_index, knn_col_index class Imputer(BaseEstimator, TransformerMixin): """Imputation transformer for completing missing values. @@ -115,12 +204,16 @@ class Imputer(BaseEstimator, TransformerMixin): contain missing values). """ def __init__(self, missing_values="NaN", strategy="mean", - axis=0, verbose=0, copy=True): + axis=0, verbose=0, copy=True, n_neighbors=10, + row_max_missing=0.5, col_max_missing=0.8): self.missing_values = missing_values self.strategy = strategy self.axis = axis self.verbose = verbose self.copy = copy + self.n_neighbors = n_neighbors + self.row_max_missing = row_max_missing + self.col_max_missing = col_max_missing def fit(self, X, y=None): """Fit the imputer on X. @@ -137,7 +230,7 @@ def fit(self, X, y=None): Returns self. 
""" # Check parameters - allowed_strategies = ["mean", "median", "most_frequent"] + allowed_strategies = ["mean", "median", "most_frequent", "knn"] if self.strategy not in allowed_strategies: raise ValueError("Can only use these strategies: {0} " " got strategy={1}".format(allowed_strategies, @@ -247,7 +340,12 @@ def _sparse_fit(self, X, strategy, missing_values, axis): return most_frequent - def _dense_fit(self, X, strategy, missing_values, axis): + # KNN + elif strategy == "knn": + raise ValueError("strategy='knn' does not support sparse " + "matrix input yet.") + + def _dense_fit(self, X, strategy, missing_values, axis, Y=None): """Fit the transformer on dense data.""" X = check_array(X, force_all_finite=False) mask = _get_mask(X, missing_values) @@ -298,6 +396,79 @@ def _dense_fit(self, X, strategy, missing_values, axis): return most_frequent + # KNN + elif strategy == "knn": + + if axis == 1: + X = X.transpose() + mask = mask.transpose() + + #Get dimensions and missing count + n_rows, n_cols = X.shape + row_missing_sum = mask.sum(axis=1) + + #ValueError if % missing in any column > self.col_max_missing + if np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)): + raise ValueError("The following axis position(s) have, " + "more than {0}% missing values: {1}" + .format(self.col_max_missing*100,np.where(mask.sum(axis=0) > + (X.shape[0] * self.col_max_missing)))) + + if X.shape[0] < self.n_neighbors: + raise ValueError("There are only %d samples, " + "but n_neighbors=%d." 
+ % (X.shape[0], self.n_neighbors)) + + #Fit to data + + # Check for excessive missingness in rows + bad_rows = row_missing_sum > (mask.shape[1] * self.row_max_missing) + Xbad = X[bad_rows, :] + + if np.any(bad_rows): + X = X[~bad_rows, :] + mask = _get_mask(X, missing_values) + + #Get the k nearest neighbors + if Y is None: + knnrows_index, knncols_index = _get_knn(X, + n_neighbors=self.n_neighbors) + + else: + knnrows_index, knncols_index = _get_knn(X, + n_neighbors=self.n_neighbors, Y=Y) + X[mask] = np.nan + if Y is None: + imputed = np.nanmean((X[(knnrows_index, knncols_index)]). + reshape((-1, self.n_neighbors)), axis=1) + else: + imputed = np.nanmean((Y[(knnrows_index, knncols_index)]). + reshape((-1, self.n_neighbors)), axis=1) + X[mask] = imputed + + #Merge bad rows to X and mean impute any leftover missing + if np.any(bad_rows): + X_merged = np.empty((n_rows, n_cols)) + X_merged[bad_rows, :] = Xbad + X_merged[~bad_rows, :] = X + X = X_merged + + #Impute bad_rows and leftover missing with column means + mask_after_knn = _get_mask(X, self.missing_values) + if np.any(mask_after_knn): + missing_index = np.where(mask_after_knn) + X_col_means = masked_X.mean(axis=0).data + X[missing_index] = np.take(X_col_means, missing_index[1]) + + # Transpose back + if axis == 1: + X = X.transpose() + + #The mask is used to compare this imputed matrix with + #input matrix in transform(), so return X as a masked array. + X = np.ma.array(X,mask=masked_X.mask) + return X + def transform(self, X): """Impute all missing values in X. 
@@ -311,7 +482,8 @@ def transform(self, X): X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, force_all_finite=False, copy=self.copy) statistics = self.statistics_ - if X.shape[1] != statistics.shape[0]: + #> Added knn exception below + if self.strategy != "knn" and X.shape[1] != statistics.shape[0]: raise ValueError("X has %d features per sample, expected %d" % (X.shape[1], self.statistics_.shape[0])) @@ -334,21 +506,22 @@ def transform(self, X): self.missing_values, self.axis) - # Delete the invalid rows/columns - invalid_mask = np.isnan(statistics) - valid_mask = np.logical_not(invalid_mask) - valid_statistics = statistics[valid_mask] - valid_statistics_indexes = np.where(valid_mask)[0] - missing = np.arange(X.shape[not self.axis])[invalid_mask] - - if self.axis == 0 and invalid_mask.any(): - if self.verbose: - warnings.warn("Deleting features without " - "observed values: %s" % missing) - X = X[:, valid_statistics_indexes] - elif self.axis == 1 and invalid_mask.any(): - raise ValueError("Some rows only contain " - "missing values: %s" % missing) + if self.strategy != "knn": + # Delete the invalid rows/columns + invalid_mask = np.isnan(statistics) + valid_mask = np.logical_not(invalid_mask) + valid_statistics = statistics[valid_mask] + valid_statistics_indexes = np.where(valid_mask)[0] + missing = np.arange(X.shape[not self.axis])[invalid_mask] + + if self.axis == 0 and invalid_mask.any(): + if self.verbose: + warnings.warn("Deleting features without " + "observed values: %s" % missing) + X = X[:, valid_statistics_indexes] + elif self.axis == 1 and invalid_mask.any(): + raise ValueError("Some rows only contain " + "missing values: %s" % missing) # Do actual imputation if sparse.issparse(X) and self.missing_values != 0: @@ -364,13 +537,38 @@ def transform(self, X): mask = _get_mask(X, self.missing_values) n_missing = np.sum(mask, axis=self.axis) - values = np.repeat(valid_statistics, n_missing) - if self.axis == 0: - coordinates = 
np.where(mask.transpose())[::-1] + if self.strategy == 'knn': + if self.axis == 1: + X = X.transpose() + mask = mask.transpose() + statistics = statistics.transpose() + + #Check if unmasked values of X and statistics are equal + mask_fitted = statistics.mask + if mask_fitted.shape == X.shape: + masked_X = np.ma.array(X, mask=mask_fitted) + if np.ma.allequal(masked_X,statistics): + X = statistics.data + else: + X_imputed_masked = self._dense_fit(X, + self.strategy, + self.missing_values, + self.axis, + statistics.data) + X = X_imputed_masked.data + + if self.axis == 1: + X = X.transpose() + else: - coordinates = mask + values = np.repeat(valid_statistics, n_missing) + + if self.axis == 0: + coordinates = np.where(mask.transpose())[::-1] + else: + coordinates = mask - X[coordinates] = values + X[coordinates] = values return X From cd6d3a25431038ef16e39bca2548e574a991b422 Mon Sep 17 00:00:00 2001 From: harke Date: Sat, 24 Jun 2017 20:06:37 -0500 Subject: [PATCH 02/97] Fixed issue with passing separate matrices in fit() and transform() --- sklearn/preprocessing/imputation.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 7abd0d4fbdd1e..b99fb56c1ddb6 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -398,6 +398,8 @@ def _dense_fit(self, X, strategy, missing_values, axis, Y=None): # KNN elif strategy == "knn": + if self.copy: + X = np.copy(X) if axis == 1: X = X.transpose() @@ -544,19 +546,18 @@ def transform(self, X): mask = mask.transpose() statistics = statistics.transpose() - #Check if unmasked values of X and statistics are equal + #Check if the masks and the unmasked values are equal mask_fitted = statistics.mask - if mask_fitted.shape == X.shape: - masked_X = np.ma.array(X, mask=mask_fitted) - if np.ma.allequal(masked_X,statistics): + masked_X = np.ma.array(X, mask=mask) + if np.array_equal(mask, 
mask_fitted)\ + and np.ma.allequal(masked_X, statistics): X = statistics.data else: - X_imputed_masked = self._dense_fit(X, + X = self._dense_fit(X, self.strategy, self.missing_values, self.axis, - statistics.data) - X = X_imputed_masked.data + Y=statistics.data).data if self.axis == 1: X = X.transpose() From 3fc9596aaef072f559bd39aa302d328b302252b7 Mon Sep 17 00:00:00 2001 From: harke Date: Sun, 25 Jun 2017 16:55:53 -0500 Subject: [PATCH 03/97] Retrieved fitted data with self.statistics_ rather than passing it as argument --- sklearn/preprocessing/imputation.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index b99fb56c1ddb6..db4364fa12731 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -345,7 +345,7 @@ def _sparse_fit(self, X, strategy, missing_values, axis): raise ValueError("strategy='knn' does not support sparse " "matrix input yet.") - def _dense_fit(self, X, strategy, missing_values, axis, Y=None): + def _dense_fit(self, X, strategy, missing_values, axis): """Fit the transformer on dense data.""" X = check_array(X, force_all_finite=False) mask = _get_mask(X, missing_values) @@ -425,33 +425,32 @@ def _dense_fit(self, X, strategy, missing_values, axis, Y=None): # Check for excessive missingness in rows bad_rows = row_missing_sum > (mask.shape[1] * self.row_max_missing) - Xbad = X[bad_rows, :] + X_bad = X[bad_rows, :] if np.any(bad_rows): X = X[~bad_rows, :] mask = _get_mask(X, missing_values) - #Get the k nearest neighbors - if Y is None: - knnrows_index, knncols_index = _get_knn(X, - n_neighbors=self.n_neighbors) - - else: + #Get the k nearest neighbors and impute + if hasattr(self, 'statistics_'): + Y = self.statistics_.data knnrows_index, knncols_index = _get_knn(X, n_neighbors=self.n_neighbors, Y=Y) - X[mask] = np.nan - if Y is None: - imputed = np.nanmean((X[(knnrows_index, 
knncols_index)]). + X[mask] = np.nan + imputed = np.nanmean((Y[(knnrows_index, knncols_index)]). reshape((-1, self.n_neighbors)), axis=1) else: - imputed = np.nanmean((Y[(knnrows_index, knncols_index)]). + knnrows_index, knncols_index = _get_knn(X, + n_neighbors=self.n_neighbors) + X[mask] = np.nan + imputed = np.nanmean((X[(knnrows_index, knncols_index)]). reshape((-1, self.n_neighbors)), axis=1) X[mask] = imputed #Merge bad rows to X and mean impute any leftover missing if np.any(bad_rows): X_merged = np.empty((n_rows, n_cols)) - X_merged[bad_rows, :] = Xbad + X_merged[bad_rows, :] = X_bad X_merged[~bad_rows, :] = X X = X_merged @@ -556,8 +555,7 @@ def transform(self, X): X = self._dense_fit(X, self.strategy, self.missing_values, - self.axis, - Y=statistics.data).data + self.axis).data if self.axis == 1: X = X.transpose() From d707dcd11d8114aacff61e56f5f972205de106dc Mon Sep 17 00:00:00 2001 From: harke Date: Thu, 13 Jul 2017 04:12:24 -0500 Subject: [PATCH 04/97] Modified metrics to enable euclidean distance calculation with missing (NaN) values --- sklearn/metrics/pairwise.py | 142 ++++++++++++++++++++----- sklearn/metrics/tests/test_pairwise.py | 31 ++++++ sklearn/neighbors/base.py | 35 ++++-- 3 files changed, 176 insertions(+), 32 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 0b63653672f51..f83baa2d330b7 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -28,6 +28,13 @@ from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan +# Get mask for missing values +def _get_mask(X, value_to_mask): + """Compute the boolean mask X == missing_values.""" + if value_to_mask == "NaN" or np.isnan(value_to_mask): + return np.isnan(X) + else: + return X == value_to_mask # Utility Functions def _return_float_dtype(X, Y): @@ -54,7 +61,8 @@ def _return_float_dtype(X, Y): return X, Y, dtype -def check_pairwise_arrays(X, Y, precomputed=False, dtype=None): +def check_pairwise_arrays(X, Y, 
precomputed=False, dtype=None, + copy=False, force_all_finite=True): """ Set X and Y appropriately and checks inputs If Y is None, it is set as a pointer to X (i.e. not a copy). @@ -103,11 +111,14 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None): if Y is X or Y is None: X = Y = check_array(X, accept_sparse='csr', dtype=dtype, + copy=copy, force_all_finite=force_all_finite, warn_on_dtype=warn_on_dtype, estimator=estimator) else: X = check_array(X, accept_sparse='csr', dtype=dtype, + copy=copy, force_all_finite=force_all_finite, warn_on_dtype=warn_on_dtype, estimator=estimator) Y = check_array(Y, accept_sparse='csr', dtype=dtype, + copy=copy, force_all_finite=force_all_finite, warn_on_dtype=warn_on_dtype, estimator=estimator) if precomputed: @@ -160,7 +171,8 @@ def check_paired_arrays(X, Y): # Pairwise distances def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, - X_norm_squared=None): + X_norm_squared=None, kill_missing=True, + missing_values=None, copy=False): """ Considering the rows of X (and Y=X) as vectors, compute the distance matrix between each pair of vectors. @@ -179,6 +191,19 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, the distance matrix returned by this function may not be exactly symmetric as required by, e.g., ``scipy.spatial.distance`` functions. + Additionally, euclidean_distances() can also compute pairwise euclidean + distance for vectors in dense matrices X and Y with missing values in + arbitrary coordinates. The following formula is used for this: + + dist(X, Y) = (X.shape[1] * 1 / ((dot(NX, NYT)))) * + (dot((X * X), NYT) - 2 * (dot(X, Y.T)) + + dot(NX, (Y.T * Y.T))) + + where NX and NYT represent the logical-not of the missing masks of + X and Y.T, respectively.This formulation zero-weights coordinates with + missing value in either vector in the pair and up-weights the remaining + coordinates. + Read more in the :ref:`User Guide `. 
Parameters @@ -198,6 +223,15 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, Pre-computed dot-products of vectors in X (e.g., ``(X**2).sum(axis=1)``) + kill_missing : boolean, optional + Allow missing values (e.g., NaN) + + missing_values : String, optional + String representation of missing value + + copy : boolean, optional + Make and use a deep copy of X and Y (if it exists) + Returns ------- distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2) @@ -219,34 +253,75 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, -------- paired_distances : distances betweens pairs of elements of X and Y. """ - X, Y = check_pairwise_arrays(X, Y) - if X_norm_squared is not None: - XX = check_array(X_norm_squared) - if XX.shape == (1, X.shape[0]): - XX = XX.T - elif XX.shape != (X.shape[0], 1): - raise ValueError( - "Incompatible dimensions for X and X_norm_squared") + #NOTE: force_all_finite=False allows not only NaN but also inf/-inf + X, Y = check_pairwise_arrays(X, Y, force_all_finite=kill_missing, copy=copy) + if kill_missing is False and \ + (np.any(np.isinf(X.data)) or (Y is not None and np.any(np.isinf(Y.data)))): + raise ValueError( + "+/- Infinite values are not allowed.") + + if kill_missing: + if X_norm_squared is not None: + XX = check_array(X_norm_squared) + if XX.shape == (1, X.shape[0]): + XX = XX.T + elif XX.shape != (X.shape[0], 1): + raise ValueError( + "Incompatible dimensions for X and X_norm_squared") + else: + XX = row_norms(X, squared=True)[:, np.newaxis] + + if X is Y: # shortcut in the common case euclidean_distances(X, X) + YY = XX.T + elif Y_norm_squared is not None: + YY = np.atleast_2d(Y_norm_squared) + + if YY.shape != (1, Y.shape[0]): + raise ValueError( + "Incompatible dimensions for Y and Y_norm_squared") + else: + YY = row_norms(Y, squared=True)[np.newaxis, :] + + distances = safe_sparse_dot(X, Y.T, dense_output=True) + distances *= -2 + distances += XX + distances += YY + 
np.maximum(distances, 0, out=distances) + else: - XX = row_norms(X, squared=True)[:, np.newaxis] + if missing_values!="NaN" and \ + (np.any(_get_mask(X.data, "NaN")) or np.any(_get_mask(Y.data, "NaN"))): + raise ValueError( + "NaN values present but missing_value = {0}".format(missing_values)) - if X is Y: # shortcut in the common case euclidean_distances(X, X) - YY = XX.T - elif Y_norm_squared is not None: - YY = np.atleast_2d(Y_norm_squared) + # ValueError if X and Y have incompatible dimensions + # if X.shape[1] != Y.shape[1]: + # raise ValueError("The search dimension of the matrices " + # "are not equal: [{0}] versus [{1}]". + # format(X.shape[1], Y.shape[1])) - if YY.shape != (1, Y.shape[0]): - raise ValueError( - "Incompatible dimensions for Y and Y_norm_squared") - else: - YY = row_norms(Y, squared=True)[np.newaxis, :] + # Get missing mask for X + mask_X = _get_mask(X, missing_values) + + # Get Y.T mask and anti-mask and set Y.T's missing to zero + YT = Y.T + mask_YT = _get_mask(YT, missing_values) + NYT = (~mask_YT).astype("int") + YT[mask_YT] = 0 + + #Get X anti-mask and set X's missing to zero + NX = (~mask_X).astype("int") + X[mask_X] = 0 - distances = safe_sparse_dot(X, Y.T, dense_output=True) - distances *= -2 - distances += XX - distances += YY - np.maximum(distances, 0, out=distances) + # Matrix formula to calculate pair-wise distance between all vectors in a + # matrix X to vectors in matrix Y. It zero-weights coordinates with missing value + # in either vector in the pair and up-weights the remaining coordinates. + # Matrix formula derived by: Shreya Bhattarai + + distances = (X.shape[1] * 1 / ((np.dot(NX, NYT)))) * \ + (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + + np.dot(NX, (YT * YT))) if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. 
@@ -1130,6 +1205,7 @@ def _pairwise_callable(X, Y, metric, **kwds): 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"] +_MISSING_SUPPORTED_METRICS = ['euclidean'] def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): """ Compute the distance matrix from a vector array X and optional Y. @@ -1216,6 +1292,22 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): "Valid metrics are %s, or 'precomputed', or a " "callable" % (metric, _VALID_METRICS)) + if (kwds.get("kill_missing") is False): + if (metric not in _MISSING_SUPPORTED_METRICS): + raise ValueError( + "Metric {0} does not have missing value support ".format( + metric) + ) + if issparse(X) or (Y is not None and issparse(Y)): + raise ValueError( + "Missing value support for sparse matrices not added yet") + if (kwds.get("missing_values") is None): + raise ValueError("Missing value is not defined") + if(np.any(_get_mask(X.data, kwds.get("missing_values")). 
+ sum(axis=1) == X.data.shape[1])): + raise ValueError( + "One or more samples(s) only have missing values.") + if metric == "precomputed": X, _ = check_pairwise_arrays(X, Y, precomputed=True) return X diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index d8b64b58ca481..8b0a42a528810 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -406,6 +406,37 @@ def test_euclidean_distances(): Y_norm_squared=np.zeros_like(Y_norm_sq)) assert_greater(np.max(np.abs(wrong_D - D1)), .01) +def test_euclidean_distances_with_missing(): + # first check that we get right answer with missing values for X + X = np.array([[1., 5., 7., 5., 10.], + [8., 2., 4., np.nan, 8.], + [5., np.nan, 5., np.nan, 1.], + [8., np.nan, np.nan, np.nan, np.nan]]) + D1 = euclidean_distances(X, kill_missing=False, missing_values="NaN") + + D2 = np.array([[0., 9.42072184, 12.97433364, 15.65247584], + [9.42072184, 0., 9.91631652, 0.], + [12.97433364, 9.91631652, 0., 6.70820393], + [15.65247584, 0., 6.70820393, 0.]]) + + assert_array_almost_equal(D1, D2) + + # check with pairs of matrices with missing values + X = np.array([[1., np.nan, 3., 4., 2.], + [np.nan, 4., 6., 1., np.nan], + [3., np.nan, np.nan, np.nan, 1.]]) + + Y = np.array([[np.nan, 7., 7., np.nan, 2.], + [np.nan, np.nan, 5., 4., 7.], + [np.nan, np.nan, np.nan, 4., 5.]]) + + D3 = np.array([[6.32455532, 6.95221787, 4.74341649], + [5., 5., 6.70820393], + [2.23606798, 13.41640786, 8.94427191]]) + + D4 = euclidean_distances(X, Y, kill_missing=False, missing_values="NaN") + + assert_array_almost_equal(D3, D4) def test_cosine_distances(): # Check the pairwise Cosine distances computation diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index e14da8bbc2e97..72f11f608cbcc 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -157,7 +157,7 @@ def _init_params(self, n_neighbors=None, radius=None, self._tree = None self._fit_method = 
None - def _fit(self, X): + def _fit(self, X, kill_missing=True): if self.metric_params is None: self.effective_metric_params_ = {} else: @@ -201,7 +201,9 @@ def _fit(self, X): self._fit_method = 'kd_tree' return self - X = check_array(X, accept_sparse='csr') + # # copy=True if missing accepted as they will be replaced by 0 + # copy = True if kill_missing is False else False + X = check_array(X, accept_sparse='csr', force_all_finite=kill_missing) n_samples = X.shape[0] if n_samples == 0: @@ -270,7 +272,8 @@ def _pairwise(self): class KNeighborsMixin(object): """Mixin for k-neighbors searches""" - def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, + kill_missing=True, missing_values="NaN", copy=None): """Finds the K-neighbors of a point. Returns indices of and distances to the neighbors of each point. @@ -290,6 +293,15 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): return_distance : boolean, optional. Defaults to True. 
If False, distances will not be returned + kill_missing : boolean, optional + Allow missing values (e.g., NaN) + + missing_values : String, optional + String representation of missing value + + copy : boolean, optional + Make and use a deep copy of X + Returns ------- dist : array @@ -331,7 +343,9 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - X = check_array(X, accept_sparse='csr') + # copy=True if missing accepted as they will be replaced by 0 + # copy = True if kill_missing is False else False + X = check_array(X, accept_sparse='csr', force_all_finite=kill_missing) else: query_is_train = True X = self._fit_X @@ -349,12 +363,19 @@ class from an array representing our data set and ask who's n_samples, _ = X.shape sample_range = np.arange(n_samples)[:, None] + # copy=True if missing accepted and copy is None + if copy is None: + copy = True if kill_missing is False else False + n_jobs = _get_n_jobs(self.n_jobs) if self._fit_method == 'brute': # for efficiency, use squared euclidean distances if self.effective_metric_ == 'euclidean': dist = pairwise_distances(X, self._fit_X, 'euclidean', - n_jobs=n_jobs, squared=True) + n_jobs=n_jobs, squared=True, + kill_missing=kill_missing, + missing_values=missing_values, + copy=copy) else: dist = pairwise_distances( X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, @@ -791,7 +812,7 @@ def fit(self, X, y): class UnsupervisedMixin(object): - def fit(self, X, y=None): + def fit(self, X, y=None, kill_missing=True): """Fit the model using X as training data Parameters @@ -800,4 +821,4 @@ def fit(self, X, y=None): Training data. If array or matrix, shape [n_samples, n_features], or [n_samples, n_samples] if metric='precomputed'. 
""" - return self._fit(X) + return self._fit(X, kill_missing) From b4b5ae9a4775d7ceb420fc3bac54994820483897 Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 17 Jul 2017 21:21:43 -0500 Subject: [PATCH 05/97] Changes to ensure Python 2.x compatibility --- sklearn/metrics/pairwise.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index f83baa2d330b7..f9c46b1eb109e 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -307,11 +307,13 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, # Get Y.T mask and anti-mask and set Y.T's missing to zero YT = Y.T mask_YT = _get_mask(YT, missing_values) - NYT = (~mask_YT).astype("int") + NYT = (~mask_YT).astype(np.int8) + # NYT = (~mask_YT) YT[mask_YT] = 0 - #Get X anti-mask and set X's missing to zero - NX = (~mask_X).astype("int") + # Get X anti-mask and set X's missing to zero + NX = (~mask_X).astype(np.int8) + # NX = (~mask_X) X[mask_X] = 0 # Matrix formula to calculate pair-wise distance between all vectors in a @@ -319,9 +321,15 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, # in either vector in the pair and up-weights the remaining coordinates. # Matrix formula derived by: Shreya Bhattarai - distances = (X.shape[1] * 1 / ((np.dot(NX, NYT)))) * \ - (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + - np.dot(NX, (YT * YT))) + # distances = (X.shape[1] * 1 / ((np.dot(NX, NYT)))) * \ + # (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + + # np.dot(NX, (YT * YT))) + + # Above is faster but following for Python 2.x support + distances = np.multiply(np.multiply(X.shape[1], (1.0 / np.dot(NX, NYT))), + (np.dot(np.multiply(X, X), NYT) - + (2.0 * (np.dot(X, YT))) + + np.dot(NX, (np.multiply(YT, YT))))) if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. 
From 04ed4a0aa452c03d7b09a6a07e820307964d37c9 Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 17 Jul 2017 22:24:15 -0500 Subject: [PATCH 06/97] Fixed pep8 issues --- sklearn/metrics/pairwise.py | 35 ++++++++++++++++---------- sklearn/metrics/tests/test_pairwise.py | 2 ++ sklearn/neighbors/base.py | 3 ++- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index f9c46b1eb109e..a855aa0ad583a 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -28,6 +28,7 @@ from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan + # Get mask for missing values def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" @@ -36,6 +37,7 @@ def _get_mask(X, value_to_mask): else: return X == value_to_mask + # Utility Functions def _return_float_dtype(X, Y): """ @@ -254,10 +256,12 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, paired_distances : distances betweens pairs of elements of X and Y. 
""" - #NOTE: force_all_finite=False allows not only NaN but also inf/-inf - X, Y = check_pairwise_arrays(X, Y, force_all_finite=kill_missing, copy=copy) + # NOTE: force_all_finite=False allows not only NaN but also inf/-inf + X, Y = check_pairwise_arrays(X, Y, + force_all_finite=kill_missing, copy=copy) if kill_missing is False and \ - (np.any(np.isinf(X.data)) or (Y is not None and np.any(np.isinf(Y.data)))): + (np.any(np.isinf(X.data)) or + (Y is not None and np.any(np.isinf(Y.data)))): raise ValueError( "+/- Infinite values are not allowed.") @@ -290,10 +294,12 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, np.maximum(distances, 0, out=distances) else: - if missing_values!="NaN" and \ - (np.any(_get_mask(X.data, "NaN")) or np.any(_get_mask(Y.data, "NaN"))): + if missing_values != "NaN" and \ + (np.any(_get_mask(X.data, "NaN")) or + np.any(_get_mask(Y.data, "NaN"))): raise ValueError( - "NaN values present but missing_value = {0}".format(missing_values)) + "NaN values present but missing_value = {0}". + format(missing_values)) # ValueError if X and Y have incompatible dimensions # if X.shape[1] != Y.shape[1]: @@ -316,17 +322,19 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, # NX = (~mask_X) X[mask_X] = 0 - # Matrix formula to calculate pair-wise distance between all vectors in a - # matrix X to vectors in matrix Y. It zero-weights coordinates with missing value - # in either vector in the pair and up-weights the remaining coordinates. - # Matrix formula derived by: Shreya Bhattarai + # Matrix formula to calculate pair-wise distance between all vectors + # in a matrix X to vectors in matrix Y. It zero-weights coordinates + # with missing value in either vector in the pair and up-weights the + # remaining coordinates. 
+ # Formula derived by: Shreya Bhattarai # distances = (X.shape[1] * 1 / ((np.dot(NX, NYT)))) * \ # (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + # np.dot(NX, (YT * YT))) # Above is faster but following for Python 2.x support - distances = np.multiply(np.multiply(X.shape[1], (1.0 / np.dot(NX, NYT))), + distances = np.multiply(np.multiply(X.shape[1], + (1.0 / np.dot(NX, NYT))), (np.dot(np.multiply(X, X), NYT) - (2.0 * (np.dot(X, YT))) + np.dot(NX, (np.multiply(YT, YT))))) @@ -1215,6 +1223,7 @@ def _pairwise_callable(X, Y, metric, **kwds): _MISSING_SUPPORTED_METRICS = ['euclidean'] + def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): """ Compute the distance matrix from a vector array X and optional Y. @@ -1311,8 +1320,8 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): "Missing value support for sparse matrices not added yet") if (kwds.get("missing_values") is None): raise ValueError("Missing value is not defined") - if(np.any(_get_mask(X.data, kwds.get("missing_values")). 
- sum(axis=1) == X.data.shape[1])): + if(np.any(_get_mask(X.data, kwds.get("missing_values")).sum( + axis=1) == X.data.shape[1])): raise ValueError( "One or more samples(s) only have missing values.") diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 8b0a42a528810..ddb4e795dd1bc 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -406,6 +406,7 @@ def test_euclidean_distances(): Y_norm_squared=np.zeros_like(Y_norm_sq)) assert_greater(np.max(np.abs(wrong_D - D1)), .01) + def test_euclidean_distances_with_missing(): # first check that we get right answer with missing values for X X = np.array([[1., 5., 7., 5., 10.], @@ -438,6 +439,7 @@ def test_euclidean_distances_with_missing(): assert_array_almost_equal(D3, D4) + def test_cosine_distances(): # Check the pairwise Cosine distances computation rng = np.random.RandomState(1337) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 72f11f608cbcc..e4fa7ffe6db03 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -345,7 +345,8 @@ class from an array representing our data set and ask who's query_is_train = False # copy=True if missing accepted as they will be replaced by 0 # copy = True if kill_missing is False else False - X = check_array(X, accept_sparse='csr', force_all_finite=kill_missing) + X = check_array(X, accept_sparse='csr', + force_all_finite=kill_missing) else: query_is_train = True X = self._fit_X From a6d8ef66b5fcdda9144b04d997b3e8ba8ad4d441 Mon Sep 17 00:00:00 2001 From: harke Date: Wed, 19 Jul 2017 06:22:32 -0500 Subject: [PATCH 07/97] Addressed comments from review --- sklearn/metrics/pairwise.py | 321 ++++++++++++++-------- sklearn/metrics/tests/test_pairwise.py | 18 +- sklearn/neighbors/base.py | 238 +++++++++++++--- sklearn/neighbors/tests/test_neighbors.py | 60 ++++ 4 files changed, 486 insertions(+), 151 deletions(-) diff --git a/sklearn/metrics/pairwise.py 
b/sklearn/metrics/pairwise.py index a855aa0ad583a..f01f4fd36ab33 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -94,6 +94,13 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, .. versionadded:: 0.18 + copy : bool + Create and return a deep copy of X and Y (if Y exists) + + force_all_finite : bool + Throw a ValueError exception if either X or Y (if Y exists) + contains any NaN or +/- inf values + Returns ------- safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features) @@ -173,38 +180,119 @@ def check_paired_arrays(X, Y): # Pairwise distances def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, - X_norm_squared=None, kill_missing=True, - missing_values=None, copy=False): + X_norm_squared=None): """ Considering the rows of X (and Y=X) as vectors, compute the distance matrix between each pair of vectors. - For efficiency reasons, the euclidean distance between a pair of row vector x and y is computed as:: - dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) - This formulation has two advantages over other ways of computing distances. First, it is computationally efficient when dealing with sparse data. Second, if one argument varies but the other remains unchanged, then `dot(x, x)` and/or `dot(y, y)` can be pre-computed. - However, this is not the most precise way of doing this computation, and the distance matrix returned by this function may not be exactly symmetric as required by, e.g., ``scipy.spatial.distance`` functions. + Read more in the :ref:`User Guide `. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples_1, n_features) + Y : {array-like, sparse matrix}, shape (n_samples_2, n_features) + Y_norm_squared : array-like, shape (n_samples_2, ), optional + Pre-computed dot-products of vectors in Y (e.g., + ``(Y**2).sum(axis=1)``) + squared : boolean, optional + Return squared Euclidean distances. 
+ X_norm_squared : array-like, shape = [n_samples_1], optional + Pre-computed dot-products of vectors in X (e.g., + ``(X**2).sum(axis=1)``) + Returns + ------- + distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2) + Examples + -------- + >>> from sklearn.metrics.pairwise import euclidean_distances + >>> X = [[0, 1], [1, 1]] + >>> # distance between rows of X + >>> euclidean_distances(X, X) + array([[ 0., 1.], + [ 1., 0.]]) + >>> # get distance to origin + >>> euclidean_distances(X, [[0, 0]]) + array([[ 1. ], + [ 1.41421356]]) + See also + -------- + paired_distances : distances betweens pairs of elements of X and Y. + """ + X, Y = check_pairwise_arrays(X, Y) + + if X_norm_squared is not None: + XX = check_array(X_norm_squared) + if XX.shape == (1, X.shape[0]): + XX = XX.T + elif XX.shape != (X.shape[0], 1): + raise ValueError( + "Incompatible dimensions for X and X_norm_squared") + else: + XX = row_norms(X, squared=True)[:, np.newaxis] + + if X is Y: # shortcut in the common case euclidean_distances(X, X) + YY = XX.T + elif Y_norm_squared is not None: + YY = np.atleast_2d(Y_norm_squared) + + if YY.shape != (1, Y.shape[0]): + raise ValueError( + "Incompatible dimensions for Y and Y_norm_squared") + else: + YY = row_norms(Y, squared=True)[np.newaxis, :] + + distances = safe_sparse_dot(X, Y.T, dense_output=True) + distances *= -2 + distances += XX + distances += YY + np.maximum(distances, 0, out=distances) + + if X is Y: + # Ensure that distances between vectors and themselves are set to 0.0. + # This may not be the case due to floating point rounding errors. + distances.flat[::distances.shape[0] + 1] = 0.0 + + return distances if squared else np.sqrt(distances, out=distances) + - Additionally, euclidean_distances() can also compute pairwise euclidean - distance for vectors in dense matrices X and Y with missing values in - arbitrary coordinates. 
The following formula is used for this: +# Pairwise distances in the presence of missing values +def masked_euclidean_distances(X, Y=None, squared=False, + missing_values="NaN", copy=True, **kwargs): + """ + Considering the rows of X (and Y=X) as vectors, compute the + distance matrix between each pair of vectors. Similarly, if + Y is not X, then compute the distance matrix between each + pair of vectors (i.e., each row pair) in X and Y. + + This function computes pairwise euclidean distance for vectors + in dense matrices X and Y with missing values in arbitrary + coordinates. The following formula is used for this: dist(X, Y) = (X.shape[1] * 1 / ((dot(NX, NYT)))) * (dot((X * X), NYT) - 2 * (dot(X, Y.T)) + dot(NX, (Y.T * Y.T))) where NX and NYT represent the logical-not of the missing masks of - X and Y.T, respectively.This formulation zero-weights coordinates with - missing value in either vector in the pair and up-weights the remaining - coordinates. + X and Y.T, respectively.This formulation zero-weights feature coordinates + with missing value in either vector in the pair and up-weights the + remaining coordinates. + Formula derived by: Shreya Bhattarai + + Breakdown of euclidean distance calculation between a vector pair x,y: + + weight = Total # of coordinates / # of non-missing coordinates + dist(x,y) = sqrt(weight * sq. distance from non-missing coordinates) + + This of course implies that if all coordinates are missing in either + vector in the pair then NaN is returned for that pair. Read more in the :ref:`User Guide `. @@ -214,25 +302,14 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, Y : {array-like, sparse matrix}, shape (n_samples_2, n_features) - Y_norm_squared : array-like, shape (n_samples_2, ), optional - Pre-computed dot-products of vectors in Y (e.g., - ``(Y**2).sum(axis=1)``) - squared : boolean, optional Return squared Euclidean distances. 
- X_norm_squared : array-like, shape = [n_samples_1], optional - Pre-computed dot-products of vectors in X (e.g., - ``(X**2).sum(axis=1)``) - - kill_missing : boolean, optional - Allow missing values (e.g., NaN) - - missing_values : String, optional - String representation of missing value + missing_values : "NaN" or integer, optional (default=”NaN”) + Representation of missing value copy : boolean, optional - Make and use a deep copy of X and Y (if it exists) + Make and use a deep copy of X and Y (if Y exists) Returns ------- @@ -240,14 +317,15 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, Examples -------- - >>> from sklearn.metrics.pairwise import euclidean_distances - >>> X = [[0, 1], [1, 1]] + >>> from sklearn.metrics.pairwise import masked_euclidean_distances + >>> nan = float("NaN") + >>> X = [[0, 1], [1, nan]] >>> # distance between rows of X - >>> euclidean_distances(X, X) - array([[ 0., 1.], - [ 1., 0.]]) + >>> masked_euclidean_distances(X, X) + array([[ 0., 1.41421356], + [ 1.41421356, 0.]]) >>> # get distance to origin - >>> euclidean_distances(X, [[0, 0]]) + >>> masked_euclidean_distances(X, [[0, 0]]) array([[ 1. ], [ 1.41421356]]) @@ -255,89 +333,86 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, -------- paired_distances : distances betweens pairs of elements of X and Y. 
""" + # Check and except sparse matrices + if issparse(X) or (Y is not None and issparse(Y)): + raise ValueError( + "Missing value support for sparse matrices not added yet") # NOTE: force_all_finite=False allows not only NaN but also inf/-inf X, Y = check_pairwise_arrays(X, Y, - force_all_finite=kill_missing, copy=copy) - if kill_missing is False and \ - (np.any(np.isinf(X.data)) or - (Y is not None and np.any(np.isinf(Y.data)))): + force_all_finite=False, copy=copy) + if (np.any(np.isinf(X.data)) or + (Y is not None and np.any(np.isinf(Y.data)))): raise ValueError( "+/- Infinite values are not allowed.") - if kill_missing: - if X_norm_squared is not None: - XX = check_array(X_norm_squared) - if XX.shape == (1, X.shape[0]): - XX = XX.T - elif XX.shape != (X.shape[0], 1): - raise ValueError( - "Incompatible dimensions for X and X_norm_squared") - else: - XX = row_norms(X, squared=True)[:, np.newaxis] - - if X is Y: # shortcut in the common case euclidean_distances(X, X) - YY = XX.T - elif Y_norm_squared is not None: - YY = np.atleast_2d(Y_norm_squared) - - if YY.shape != (1, Y.shape[0]): - raise ValueError( - "Incompatible dimensions for Y and Y_norm_squared") - else: - YY = row_norms(Y, squared=True)[np.newaxis, :] - - distances = safe_sparse_dot(X, Y.T, dense_output=True) - distances *= -2 - distances += XX - distances += YY - np.maximum(distances, 0, out=distances) + # Check if any rows have only missing value + if np.any(_get_mask(X, missing_values).sum(axis=1) == X.shape[1])\ + or (Y is not None and np.any(_get_mask(Y, missing_values).sum( + axis=1) == Y.shape[1])): + raise ValueError("One or more rows only contain missing values.") + # + # if kill_missing: + # if X_norm_squared is not None: + # XX = check_array(X_norm_squared) + # if XX.shape == (1, X.shape[0]): + # XX = XX.T + # elif XX.shape != (X.shape[0], 1): + # raise ValueError( + # "Incompatible dimensions for X and X_norm_squared") + # else: + # XX = row_norms(X, squared=True)[:, np.newaxis] + # + # 
if X is Y: # shortcut in the common case euclidean_distances(X, X) + # YY = XX.T + # elif Y_norm_squared is not None: + # YY = np.atleast_2d(Y_norm_squared) + # + # if YY.shape != (1, Y.shape[0]): + # raise ValueError( + # "Incompatible dimensions for Y and Y_norm_squared") + # else: + # YY = row_norms(Y, squared=True)[np.newaxis, :] + # + # distances = safe_sparse_dot(X, Y.T, dense_output=True) + # distances *= -2 + # distances += XX + # distances += YY + # np.maximum(distances, 0, out=distances) + + # else: + if missing_values != "NaN" and \ + (np.any(_get_mask(X.data, "NaN")) or + np.any(_get_mask(Y.data, "NaN"))): + raise ValueError( + "NaN values present but missing_value = {0}".format( + missing_values)) - else: - if missing_values != "NaN" and \ - (np.any(_get_mask(X.data, "NaN")) or - np.any(_get_mask(Y.data, "NaN"))): - raise ValueError( - "NaN values present but missing_value = {0}". - format(missing_values)) - - # ValueError if X and Y have incompatible dimensions - # if X.shape[1] != Y.shape[1]: - # raise ValueError("The search dimension of the matrices " - # "are not equal: [{0}] versus [{1}]". - # format(X.shape[1], Y.shape[1])) - - # Get missing mask for X - mask_X = _get_mask(X, missing_values) - - # Get Y.T mask and anti-mask and set Y.T's missing to zero - YT = Y.T - mask_YT = _get_mask(YT, missing_values) - NYT = (~mask_YT).astype(np.int8) - # NYT = (~mask_YT) - YT[mask_YT] = 0 - - # Get X anti-mask and set X's missing to zero - NX = (~mask_X).astype(np.int8) - # NX = (~mask_X) - X[mask_X] = 0 - - # Matrix formula to calculate pair-wise distance between all vectors - # in a matrix X to vectors in matrix Y. It zero-weights coordinates - # with missing value in either vector in the pair and up-weights the - # remaining coordinates. 
- # Formula derived by: Shreya Bhattarai - - # distances = (X.shape[1] * 1 / ((np.dot(NX, NYT)))) * \ - # (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + - # np.dot(NX, (YT * YT))) - - # Above is faster but following for Python 2.x support - distances = np.multiply(np.multiply(X.shape[1], - (1.0 / np.dot(NX, NYT))), - (np.dot(np.multiply(X, X), NYT) - - (2.0 * (np.dot(X, YT))) + - np.dot(NX, (np.multiply(YT, YT))))) + # Get missing mask for X + mask_X = _get_mask(X, missing_values) + + # Get Y.T mask and anti-mask and set Y.T's missing to zero + YT = Y.T + mask_YT = _get_mask(YT, missing_values) + NYT = (~mask_YT).astype(np.int8) + YT[mask_YT] = 0 + + # Get X anti-mask and set X's missing to zero + NX = (~mask_X).astype(np.int8) + X[mask_X] = 0 + + # Calculate distances + + # distances = (X.shape[1] * 1 / ((np.dot(NX, NYT)))) * \ + # (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + + # np.dot(NX, (YT * YT))) + + # Above is faster but following for Python 2.x support + distances = np.multiply(np.multiply(X.shape[1], + (1.0 / np.dot(NX, NYT))), + (np.dot(np.multiply(X, X), NYT) - + (2.0 * (np.dot(X, YT))) + + np.dot(NX, (np.multiply(YT, YT))))) if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. @@ -1133,6 +1208,11 @@ def chi2_kernel(X, Y=None, gamma=1.): 'precomputed': None, # HACK: precomputed is always allowed, never called } +# Helper functions with missing value support - distance +MASKED_PAIRWISE_DISTANCE_FUNCTIONS = { + 'euclidean': masked_euclidean_distances, +} + def distance_metrics(): """Valid metrics for pairwise_distances. 
@@ -1221,7 +1301,7 @@ def _pairwise_callable(X, Y, metric, **kwds): 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"] -_MISSING_SUPPORTED_METRICS = ['euclidean'] +_MASKED_SUPPORTED_METRICS = ['euclidean'] def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): @@ -1309,8 +1389,13 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): "Valid metrics are %s, or 'precomputed', or a " "callable" % (metric, _VALID_METRICS)) - if (kwds.get("kill_missing") is False): - if (metric not in _MISSING_SUPPORTED_METRICS): + # To handle kill_missing = False + kill_missing = kwds.get("kill_missing") + if not kill_missing and kill_missing is not None: + missing_values = kwds.get("missing_values") if kwds.get( + "missing_values") is not None else np.nan + + if (metric not in _MASKED_SUPPORTED_METRICS): raise ValueError( "Metric {0} does not have missing value support ".format( metric) @@ -1318,18 +1403,22 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): if issparse(X) or (Y is not None and issparse(Y)): raise ValueError( "Missing value support for sparse matrices not added yet") - if (kwds.get("missing_values") is None): - raise ValueError("Missing value is not defined") - if(np.any(_get_mask(X.data, kwds.get("missing_values")).sum( - axis=1) == X.data.shape[1])): + # if (kwds.get("missing_values") is None): + # raise ValueError("Missing value is not defined") + if(np.any(_get_mask(X, missing_values).sum(axis=1) == X.shape[1])): raise ValueError( "One or more samples(s) only have missing values.") + # if type(metric) is str: + # metric = "masked_" + metric if metric == "precomputed": X, _ = check_pairwise_arrays(X, Y, precomputed=True) return X + elif kill_missing is False and metric in \ + MASKED_PAIRWISE_DISTANCE_FUNCTIONS: + func = MASKED_PAIRWISE_DISTANCE_FUNCTIONS[metric] elif metric in PAIRWISE_DISTANCE_FUNCTIONS: - func = PAIRWISE_DISTANCE_FUNCTIONS[metric] + func = 
PAIRWISE_DISTANCE_FUNCTIONS[metric] elif callable(metric): func = partial(_pairwise_callable, metric=metric, **kwds) else: diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index ddb4e795dd1bc..8732bf5e6d70a 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -17,6 +17,7 @@ from sklearn.externals.six import iteritems from sklearn.metrics.pairwise import euclidean_distances +from sklearn.metrics.pairwise import masked_euclidean_distances from sklearn.metrics.pairwise import manhattan_distances from sklearn.metrics.pairwise import linear_kernel from sklearn.metrics.pairwise import chi2_kernel, additive_chi2_kernel @@ -56,6 +57,17 @@ def test_pairwise_distances(): S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) + # Euclidean dist. (masked) should be equivalent to calling the function. + X = rng.random_sample((5, 4)) + S = pairwise_distances(X, metric="euclidean", kill_missing=False) + S2 = masked_euclidean_distances(X) + assert_array_almost_equal(S, S2) + # Euclidean distance, with Y != X. 
+ Y = rng.random_sample((2, 4)) + S = pairwise_distances(X, Y, metric="euclidean", + kill_missing=False) + S2 = masked_euclidean_distances(X, Y) + assert_array_almost_equal(S, S2) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) @@ -407,13 +419,13 @@ def test_euclidean_distances(): assert_greater(np.max(np.abs(wrong_D - D1)), .01) -def test_euclidean_distances_with_missing(): +def test_masked_euclidean_distances(): # first check that we get right answer with missing values for X X = np.array([[1., 5., 7., 5., 10.], [8., 2., 4., np.nan, 8.], [5., np.nan, 5., np.nan, 1.], [8., np.nan, np.nan, np.nan, np.nan]]) - D1 = euclidean_distances(X, kill_missing=False, missing_values="NaN") + D1 = masked_euclidean_distances(X, missing_values="NaN") D2 = np.array([[0., 9.42072184, 12.97433364, 15.65247584], [9.42072184, 0., 9.91631652, 0.], @@ -435,7 +447,7 @@ def test_euclidean_distances_with_missing(): [5., 5., 6.70820393], [2.23606798, 13.41640786, 8.94427191]]) - D4 = euclidean_distances(X, Y, kill_missing=False, missing_values="NaN") + D4 = masked_euclidean_distances(X, Y, missing_values="NaN") assert_array_almost_equal(D3, D4) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index e4fa7ffe6db03..b21c058202ecf 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -17,6 +17,7 @@ from ..base import BaseEstimator from ..metrics import pairwise_distances from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS +from ..metrics.pairwise import _MASKED_SUPPORTED_METRICS from ..utils import check_X_y, check_array, _get_n_jobs, gen_even_slices from ..utils.multiclass import check_classification_targets from ..externals import six @@ -158,6 +159,25 @@ def _init_params(self, n_neighbors=None, radius=None, self._fit_method = None def _fit(self, X, kill_missing=True): + if not kill_missing: + if self.metric not in _MASKED_SUPPORTED_METRICS: + raise 
ValueError( + "Metric {0} is currently not supported for " + "data containing missing values.".format(self.metric) + ) + + _MASKED_SUPPORTED_ALGORITHMS = ["brute"] + if self.algorithm not in _MASKED_SUPPORTED_ALGORITHMS: + if self.algorithm == "auto": + pass + else: + warnings.warn( + "{0} algorithm is currently not supported for " + "data containing missing values. " + "Reverting to a supported algorithm.". + format(self.algorithm)) + self.algorithm = _MASKED_SUPPORTED_ALGORITHMS[0] + if self.metric_params is None: self.effective_metric_params_ = {} else: @@ -203,23 +223,30 @@ def _fit(self, X, kill_missing=True): # # copy=True if missing accepted as they will be replaced by 0 # copy = True if kill_missing is False else False - X = check_array(X, accept_sparse='csr', force_all_finite=kill_missing) + X = check_array(X, accept_sparse='csr', + force_all_finite=kill_missing) n_samples = X.shape[0] if n_samples == 0: raise ValueError("n_samples must be greater than 0") if issparse(X): - if self.algorithm not in ('auto', 'brute'): - warnings.warn("cannot use tree with sparse input: " - "using brute force") - if self.effective_metric_ not in VALID_METRICS_SPARSE['brute']: - raise ValueError("metric '%s' not valid for sparse input" - % self.effective_metric_) - self._fit_X = X.copy() - self._tree = None - self._fit_method = 'brute' - return self + if not kill_missing: + raise ValueError( + "Nearest neighbor algorithm does not currently support" + "the use of sparse matrices." 
+ ) + else: + if self.algorithm not in ('auto', 'brute'): + warnings.warn("cannot use tree with sparse input: " + "using brute force") + if self.effective_metric_ not in VALID_METRICS_SPARSE['brute']: + raise ValueError("metric '%s' not valid for sparse input" + % self.effective_metric_) + self._fit_X = X.copy() + self._tree = None + self._fit_method = 'brute' + return self self._fit_method = self.algorithm self._fit_X = X @@ -272,8 +299,7 @@ def _pairwise(self): class KNeighborsMixin(object): """Mixin for k-neighbors searches""" - def kneighbors(self, X=None, n_neighbors=None, return_distance=True, - kill_missing=True, missing_values="NaN", copy=None): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): """Finds the K-neighbors of a point. Returns indices of and distances to the neighbors of each point. @@ -293,15 +319,6 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True, return_distance : boolean, optional. Defaults to True. If False, distances will not be returned - kill_missing : boolean, optional - Allow missing values (e.g., NaN) - - missing_values : String, optional - String representation of missing value - - copy : boolean, optional - Make and use a deep copy of X - Returns ------- dist : array @@ -345,8 +362,7 @@ class from an array representing our data set and ask who's query_is_train = False # copy=True if missing accepted as they will be replaced by 0 # copy = True if kill_missing is False else False - X = check_array(X, accept_sparse='csr', - force_all_finite=kill_missing) + X = check_array(X, accept_sparse='csr') else: query_is_train = True X = self._fit_X @@ -364,19 +380,12 @@ class from an array representing our data set and ask who's n_samples, _ = X.shape sample_range = np.arange(n_samples)[:, None] - # copy=True if missing accepted and copy is None - if copy is None: - copy = True if kill_missing is False else False - n_jobs = _get_n_jobs(self.n_jobs) if self._fit_method == 'brute': # for efficiency, use 
squared euclidean distances if self.effective_metric_ == 'euclidean': dist = pairwise_distances(X, self._fit_X, 'euclidean', - n_jobs=n_jobs, squared=True, - kill_missing=kill_missing, - missing_values=missing_values, - copy=copy) + n_jobs=n_jobs, squared=True) else: dist = pairwise_distances( X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, @@ -443,6 +452,171 @@ class from an array representing our data set and ask who's return dist, neigh_ind return neigh_ind + def masked_kneighbors(self, X=None, n_neighbors=None, return_distance=True, + missing_values="NaN", copy=True): + """Finds the K-neighbors of a point, even when they contain NaN values. + + Returns indices of and distances to the neighbors of each point. + + Parameters + ---------- + X : array-like, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + n_neighbors : int + Number of neighbors to get (default is the value + passed to the constructor). + + return_distance : boolean, optional. Defaults to True. + If False, distances will not be returned + + missing_values : "NaN" or integer, optional. Default is "NaN". + Representation of missing value + + copy : boolean, optional. Default is True. + Create and use a deep copy of X + + Returns + ------- + dist : array + Array representing the lengths to points, only present if + return_distance=True + + ind : array + Indices of the nearest points in the population matrix. + + Examples + -------- + In the following example, we construct a NeighborsClassifier + class from an array representing our data set and ask who's + the closest point to [0, nan, 1], where "nan" represents a + missing value. 
+ >>> nan = float("nan") + >>> samples = [[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=2, metric="euclidean") + >>> neigh.fit(samples, kill_missing=False) # doctest: +ELLIPSIS + NearestNeighbors(algorithm='auto', leaf_size=30,...) + >>> print(neigh.masked_kneighbors(n_neighbors=2, + >>> return_distance=False)) # doctest: +ELLIPSIS + (array([[3, 1], [3, 2], [3, 1], [2, 1]])...) + + >>> X = [[0, nan, 1]] + >>> neigh.masked_kneighbors([[0, nan, 1]], 2, + >>> return_distance=False) # doctest: +ELLIPSIS + (array([[1, 3]])...) + """ + if self._fit_method is None: + raise NotFittedError("Must fit neighbors before querying.") + + if n_neighbors is None: + n_neighbors = self.n_neighbors + + if X is not None: + query_is_train = False + X = check_array(X, accept_sparse='csr', + force_all_finite=False, copy=copy) + else: + query_is_train = True + X = self._fit_X + # Include an extra neighbor to account for the sample itself being + # returned, which is removed later + n_neighbors += 1 + + train_size = self._fit_X.shape[0] + if n_neighbors > train_size: + raise ValueError( + "Expected n_neighbors <= n_samples, " + " but n_samples = %d, n_neighbors = %d" % + (train_size, n_neighbors) + ) + n_samples, _ = X.shape + sample_range = np.arange(n_samples)[:, None] + + n_jobs = _get_n_jobs(self.n_jobs) + if self._fit_method == 'brute': + # for efficiency, use squared euclidean distances + if self.effective_metric_ == 'euclidean': + dist = pairwise_distances(X, self._fit_X, 'euclidean', + n_jobs=n_jobs, squared=True, + kill_missing=False, + missing_values=missing_values, + copy=copy) + else: + # dist = pairwise_distances( + # X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, + # **self.effective_metric_params_) + raise ValueError( + "Only the following metrics are currently supported for " + "data with missing values:{0}". 
+ format(_MASKED_SUPPORTED_METRICS) + ) + neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1) + neigh_ind = neigh_ind[:, :n_neighbors] + # argpartition doesn't guarantee sorted order, so we sort again + neigh_ind = neigh_ind[ + sample_range, np.argsort(dist[sample_range, neigh_ind])] + + if return_distance: + if self.effective_metric_ == 'euclidean': + result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind + else: + result = dist[sample_range, neigh_ind], neigh_ind + else: + result = neigh_ind + + # elif self._fit_method in ['ball_tree', 'kd_tree']: + # if issparse(X): + # raise ValueError( + # "%s does not work with sparse matrices." + # "Densify the data, " + # "or set algorithm='brute'" % self._fit_method) + # result = Parallel(n_jobs, backend='threading')( + # delayed(self._tree.query, check_pickle=False)( + # X[s], n_neighbors, return_distance) + # for s in gen_even_slices(X.shape[0], n_jobs) + # ) + # if return_distance: + # dist, neigh_ind = tuple(zip(*result)) + # result = np.vstack(dist), np.vstack(neigh_ind) + # else: + # result = np.vstack(result) + else: + raise ValueError("internal: _fit_method not recognized for data " + "containing missing") + + if not query_is_train: + return result + else: + # If the query data is the same as the indexed data, we would like + # to ignore the first nearest neighbor of every sample, i.e + # the sample itself. + if return_distance: + dist, neigh_ind = result + else: + neigh_ind = result + + sample_mask = neigh_ind != sample_range + + # Corner case: When the number of duplicates are more + # than the number of neighbors, the first NN will not + # be the sample, but a duplicate. + # In that case mask the first duplicate. 
+ dup_gr_nbrs = np.all(sample_mask, axis=1) + sample_mask[:, 0][dup_gr_nbrs] = False + + neigh_ind = np.reshape( + neigh_ind[sample_mask], (n_samples, n_neighbors - 1)) + + if return_distance: + dist = np.reshape( + dist[sample_mask], (n_samples, n_neighbors - 1)) + return dist, neigh_ind + return neigh_ind + def kneighbors_graph(self, X=None, n_neighbors=None, mode='connectivity'): """Computes the (weighted) graph of k-Neighbors for points in X diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 052c83c71d2e7..f772e8b963e14 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -88,6 +88,66 @@ def test_unsupervised_kneighbors(n_samples=20, n_features=5, assert_array_almost_equal(results[i][1], results[i + 1][1]) +def test_masked_unsupervised_kneighbors(): + # Test 1 + X = np.array([[np.nan, 3., 7., np.nan], + [6., 3., 7., 2.], + [7., 3., 4., 4.], + [2., 7., 7., 1.], + [np.nan, 2., np.nan, 4.]], dtype=np.float32) + + Y = np.array([[3., 1., 7., np.nan], + [1., 3., 1., 6.], + [np.nan, 1., np.nan, 5.], + [3., 1., 3., 3.], + [2., 3., 1., 9.]], dtype=np.float32) + + neigh = neighbors.NearestNeighbors(2, metric="euclidean") + neigh.fit(X, kill_missing=False) + X_neigh = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) + XY_neigh = neigh.masked_kneighbors(Y, 2, return_distance=False) + + # Expected outcome + N1 = np.array( + [[1, 4], + [0, 4], + [4, 1], + [0, 1], + [2, 0]]) + + N2 = np.array( + [[4, 0], + [4, 2], + [4, 2], + [4, 2], + [4, 2]]) + + assert_array_equal(X_neigh, N1) + assert_array_equal(XY_neigh, N2) + + # Test 2 + nan = float("nan") + samples = [[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]] + neigh = neighbors.NearestNeighbors(n_neighbors=2, metric="euclidean") + + neigh.fit(samples, kill_missing=False) + X2_neigh = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) + + XY2_neigh = neigh.masked_kneighbors([[0, nan, 1]], 2, + 
return_distance=False) + + # Expected outcome + N3 = np.array( + [[3, 1], + [3, 2], + [3, 1], + [2, 1]]) + N4 = np.array([[1, 3]]) + + assert_array_equal(X2_neigh, N3) + assert_array_equal(XY2_neigh, N4) + + def test_unsupervised_inputs(): # test the types of valid input into NearestNeighbors X = rng.random_sample((10, 3)) From e4f8612ffa61d176059f52f1beca42a674ffb94b Mon Sep 17 00:00:00 2001 From: harke Date: Wed, 19 Jul 2017 07:12:43 -0500 Subject: [PATCH 08/97] Docstring example issues --- sklearn/metrics/pairwise.py | 4 +++- sklearn/neighbors/base.py | 9 +++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index f01f4fd36ab33..aa8d3c4140bd9 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -222,9 +222,10 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, >>> euclidean_distances(X, [[0, 0]]) array([[ 1. ], [ 1.41421356]]) + See also -------- - paired_distances : distances betweens pairs of elements of X and Y. + paired_distances : distances between pairs of elements of X and Y. """ X, Y = check_pairwise_arrays(X, Y) @@ -324,6 +325,7 @@ def masked_euclidean_distances(X, Y=None, squared=False, >>> masked_euclidean_distances(X, X) array([[ 0., 1.41421356], [ 1.41421356, 0.]]) + >>> # get distance to origin >>> masked_euclidean_distances(X, [[0, 0]]) array([[ 1. ], diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index b21c058202ecf..cf3189afac42a 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -500,14 +500,15 @@ class from an array representing our data set and ask who's >>> neigh = NearestNeighbors(n_neighbors=2, metric="euclidean") >>> neigh.fit(samples, kill_missing=False) # doctest: +ELLIPSIS NearestNeighbors(algorithm='auto', leaf_size=30,...) 
- >>> print(neigh.masked_kneighbors(n_neighbors=2, - >>> return_distance=False)) # doctest: +ELLIPSIS + >>> N = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) + >>> print(N) # doctest: +ELLIPSIS (array([[3, 1], [3, 2], [3, 1], [2, 1]])...) >>> X = [[0, nan, 1]] - >>> neigh.masked_kneighbors([[0, nan, 1]], 2, - >>> return_distance=False) # doctest: +ELLIPSIS + >>> N2 = neigh.masked_kneighbors(X, 2, return_distance=False) + >>> print(N2) # doctest: +ELLIPSIS (array([[1, 3]])...) + """ if self._fit_method is None: raise NotFittedError("Must fit neighbors before querying.") From daf247f7db0f3e4e8fca798cb18a070a78bcdc2a Mon Sep 17 00:00:00 2001 From: harke Date: Wed, 19 Jul 2017 14:05:09 -0500 Subject: [PATCH 09/97] Formatting fixes on docstring --- sklearn/metrics/pairwise.py | 4 ++-- sklearn/neighbors/base.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index aa8d3c4140bd9..423a082085648 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -323,8 +323,8 @@ def masked_euclidean_distances(X, Y=None, squared=False, >>> X = [[0, 1], [1, nan]] >>> # distance between rows of X >>> masked_euclidean_distances(X, X) - array([[ 0., 1.41421356], - [ 1.41421356, 0.]]) + array([[ 0. , 1.41421356], + [ 1.41421356, 0. ]]) >>> # get distance to origin >>> masked_euclidean_distances(X, [[0, 0]]) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index cf3189afac42a..79bda40e4f1c4 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -502,12 +502,15 @@ class from an array representing our data set and ask who's NearestNeighbors(algorithm='auto', leaf_size=30,...) >>> N = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) >>> print(N) # doctest: +ELLIPSIS - (array([[3, 1], [3, 2], [3, 1], [2, 1]])...) 
+ [[3 1] + [3 2] + [3 1] + [2 1]] >>> X = [[0, nan, 1]] >>> N2 = neigh.masked_kneighbors(X, 2, return_distance=False) >>> print(N2) # doctest: +ELLIPSIS - (array([[1, 3]])...) + [[1 3]] """ if self._fit_method is None: From 10f5adb99a5ae7b681203e19d58888eeb055f6cf Mon Sep 17 00:00:00 2001 From: harke Date: Wed, 19 Jul 2017 15:25:44 -0500 Subject: [PATCH 10/97] And yet more fixes --- sklearn/neighbors/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 79bda40e4f1c4..0a1abc006697e 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -503,9 +503,9 @@ class from an array representing our data set and ask who's >>> N = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) >>> print(N) # doctest: +ELLIPSIS [[3 1] - [3 2] - [3 1] - [2 1]] + [3 2] + [3 1] + [2 1]] >>> X = [[0, nan, 1]] >>> N2 = neigh.masked_kneighbors(X, 2, return_distance=False) From 22cf9ef72b2a5a317f4e754d22bad6c89962c055 Mon Sep 17 00:00:00 2001 From: harke Date: Sun, 23 Jul 2017 00:07:15 -0500 Subject: [PATCH 11/97] Addressed review comments (Part 2) --- sklearn/metrics/pairwise.py | 190 +++++++++++----------- sklearn/metrics/tests/test_pairwise.py | 5 +- sklearn/neighbors/base.py | 61 +++---- sklearn/neighbors/tests/test_neighbors.py | 12 +- 4 files changed, 135 insertions(+), 133 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 423a082085648..05fe4bd3dbb23 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -8,7 +8,7 @@ # Lars Buitinck # Joel Nothman # License: BSD 3 clause - +from __future__ import division import itertools from functools import partial @@ -64,7 +64,8 @@ def _return_float_dtype(X, Y): def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, - copy=False, force_all_finite=True): + accept_sparse='csr', force_all_finite=True, + copy=False): """ Set X and Y appropriately and checks inputs If Y is 
None, it is set as a pointer to X (i.e. not a copy). @@ -94,12 +95,24 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, .. versionadded:: 0.18 - copy : bool - Create and return a deep copy of X and Y (if Y exists) + accept_sparse : string, boolean or list/tuple of strings + String[s] representing allowed sparse matrix formats, such as 'csc', + 'csr', etc. If the input is sparse but not in the allowed format, + it will be converted to the first listed format. True allows the input + to be any format. False means that a sparse matrix input will + raise an error. + + .. deprecated:: 0.19 + Passing 'None' to parameter ``accept_sparse`` in methods is + deprecated in version 0.19 "and will be removed in 0.21. Use + ``accept_sparse=False`` instead. force_all_finite : bool - Throw a ValueError exception if either X or Y (if Y exists) - contains any NaN or +/- inf values + Whether to raise an error on np.inf and np.nan in X (or Y if it exists) + + copy : bool + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. 
Returns ------- @@ -119,14 +132,14 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, dtype = dtype_float if Y is X or Y is None: - X = Y = check_array(X, accept_sparse='csr', dtype=dtype, + X = Y = check_array(X, accept_sparse=accept_sparse, dtype=dtype, copy=copy, force_all_finite=force_all_finite, warn_on_dtype=warn_on_dtype, estimator=estimator) else: - X = check_array(X, accept_sparse='csr', dtype=dtype, + X = check_array(X, accept_sparse=accept_sparse, dtype=dtype, copy=copy, force_all_finite=force_all_finite, warn_on_dtype=warn_on_dtype, estimator=estimator) - Y = check_array(Y, accept_sparse='csr', dtype=dtype, + Y = check_array(Y, accept_sparse=accept_sparse, dtype=dtype, copy=copy, force_all_finite=force_all_finite, warn_on_dtype=warn_on_dtype, estimator=estimator) @@ -266,8 +279,9 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, # Pairwise distances in the presence of missing values def masked_euclidean_distances(X, Y=None, squared=False, - missing_values="NaN", copy=True, **kwargs): - """ + missing_values="NaN", copy=True): + """Calculates euclidean distances in the presence of missing values + Considering the rows of X (and Y=X) as vectors, compute the distance matrix between each pair of vectors. Similarly, if Y is not X, then compute the distance matrix between each @@ -275,25 +289,47 @@ def masked_euclidean_distances(X, Y=None, squared=False, This function computes pairwise euclidean distance for vectors in dense matrices X and Y with missing values in arbitrary - coordinates. The following formula is used for this: + coordinates. 
+ + The following formula is used for this: - dist(X, Y) = (X.shape[1] * 1 / ((dot(NX, NYT)))) * + dist(X, Y) = (X.shape[1] / ((dot(NX, NYT)))) * (dot((X * X), NYT) - 2 * (dot(X, Y.T)) + dot(NX, (Y.T * Y.T))) where NX and NYT represent the logical-not of the missing masks of - X and Y.T, respectively.This formulation zero-weights feature coordinates - with missing value in either vector in the pair and up-weights the - remaining coordinates. - Formula derived by: Shreya Bhattarai + X and Y.T, respectively. + Formula in matrix form derived by: + Shreya Bhattarai + + This formulation zero-weights feature coordinates with missing value in + either vector in the pair and up-weights the remaining coordinates. + For instance, say we have two sample points (x1, y1) and (x2, NaN): + + To calculate the euclidean distance between these, first the square + "distance" is calculated based only on the first feature coordinate + as the second coordinate is missing in one of the samples, + i.e., we have (x2-x1)**2. This squared distance is scaled-up by the ratio + of total number of coordinates to the number of available coordinates, + which in this case is 2/1 = 2. Now, we are left with 2*((x2-x1)**2). + Finally, if squared=False then the square root of this is evaluated + and returned otherwise the value is returned as is. Breakdown of euclidean distance calculation between a vector pair x,y: weight = Total # of coordinates / # of non-missing coordinates dist(x,y) = sqrt(weight * sq. distance from non-missing coordinates) - This of course implies that if all coordinates are missing in either - vector in the pair then NaN is returned for that pair. + This formulation implies that if all coordinates are missing in either + vector in the pair or if there are no common non-missing coordinates then + NaN is returned for that pair. + + References + ---------- + John K. 
Dixon, "Pattern Recognition with Partly Missing Data", + IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue: + 10, pp. 617 - 621, Oct. 1979. + http://ieeexplore.ieee.org/abstract/document/4310090/ Read more in the :ref:`User Guide `. @@ -306,7 +342,7 @@ def masked_euclidean_distances(X, Y=None, squared=False, squared : boolean, optional Return squared Euclidean distances. - missing_values : "NaN" or integer, optional (default=”NaN”) + missing_values : "NaN" or integer, optional Representation of missing value copy : boolean, optional @@ -335,13 +371,9 @@ def masked_euclidean_distances(X, Y=None, squared=False, -------- paired_distances : distances betweens pairs of elements of X and Y. """ - # Check and except sparse matrices - if issparse(X) or (Y is not None and issparse(Y)): - raise ValueError( - "Missing value support for sparse matrices not added yet") - # NOTE: force_all_finite=False allows not only NaN but also inf/-inf - X, Y = check_pairwise_arrays(X, Y, + # NOTE: force_all_finite=False allows not only NaN but also +/- inf + X, Y = check_pairwise_arrays(X, Y, accept_sparse=False, force_all_finite=False, copy=copy) if (np.any(np.isinf(X.data)) or (Y is not None and np.any(np.isinf(Y.data)))): @@ -353,34 +385,6 @@ def masked_euclidean_distances(X, Y=None, squared=False, or (Y is not None and np.any(_get_mask(Y, missing_values).sum( axis=1) == Y.shape[1])): raise ValueError("One or more rows only contain missing values.") - # - # if kill_missing: - # if X_norm_squared is not None: - # XX = check_array(X_norm_squared) - # if XX.shape == (1, X.shape[0]): - # XX = XX.T - # elif XX.shape != (X.shape[0], 1): - # raise ValueError( - # "Incompatible dimensions for X and X_norm_squared") - # else: - # XX = row_norms(X, squared=True)[:, np.newaxis] - # - # if X is Y: # shortcut in the common case euclidean_distances(X, X) - # YY = XX.T - # elif Y_norm_squared is not None: - # YY = np.atleast_2d(Y_norm_squared) - # - # if YY.shape != (1, Y.shape[0]): 
- # raise ValueError( - # "Incompatible dimensions for Y and Y_norm_squared") - # else: - # YY = row_norms(Y, squared=True)[np.newaxis, :] - # - # distances = safe_sparse_dot(X, Y.T, dense_output=True) - # distances *= -2 - # distances += XX - # distances += YY - # np.maximum(distances, 0, out=distances) # else: if missing_values != "NaN" and \ @@ -405,16 +409,9 @@ def masked_euclidean_distances(X, Y=None, squared=False, # Calculate distances - # distances = (X.shape[1] * 1 / ((np.dot(NX, NYT)))) * \ - # (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + - # np.dot(NX, (YT * YT))) - - # Above is faster but following for Python 2.x support - distances = np.multiply(np.multiply(X.shape[1], - (1.0 / np.dot(NX, NYT))), - (np.dot(np.multiply(X, X), NYT) - - (2.0 * (np.dot(X, YT))) + - np.dot(NX, (np.multiply(YT, YT))))) + distances = (X.shape[1] / ((np.dot(NX, NYT)))) * \ + (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + + np.dot(NX, (YT * YT))) if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. 
@@ -1208,12 +1205,13 @@ def chi2_kernel(X, Y=None, gamma=1.): 'l1': manhattan_distances, 'manhattan': manhattan_distances, 'precomputed': None, # HACK: precomputed is always allowed, never called + 'masked_euclidean': masked_euclidean_distances, } # Helper functions with missing value support - distance -MASKED_PAIRWISE_DISTANCE_FUNCTIONS = { - 'euclidean': masked_euclidean_distances, -} +# MASKED_PAIRWISE_DISTANCE_FUNCTIONS = { +# 'euclidean': masked_euclidean_distances, +# } def distance_metrics(): @@ -1225,16 +1223,17 @@ def distance_metrics(): The valid distance metrics, and the function they map to, are: - ============ ==================================== - metric Function - ============ ==================================== - 'cityblock' metrics.pairwise.manhattan_distances - 'cosine' metrics.pairwise.cosine_distances - 'euclidean' metrics.pairwise.euclidean_distances - 'l1' metrics.pairwise.manhattan_distances - 'l2' metrics.pairwise.euclidean_distances - 'manhattan' metrics.pairwise.manhattan_distances - ============ ==================================== + ============ ==================================== + metric Function + ============ ==================================== + 'cityblock' metrics.pairwise.manhattan_distances + 'cosine' metrics.pairwise.cosine_distances + 'euclidean' metrics.pairwise.euclidean_distances + 'l1' metrics.pairwise.manhattan_distances + 'l2' metrics.pairwise.euclidean_distances + 'manhattan' metrics.pairwise.manhattan_distances + 'masked_euclidean' metrics.pairwise.masked_euclidean_distances + ============ ==================================== Read more in the :ref:`User Guide `. 
@@ -1301,9 +1300,10 @@ def _pairwise_callable(X, Y, metric, **kwds): 'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', - 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"] + 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski", + 'masked_euclidean'] -_MASKED_SUPPORTED_METRICS = ['euclidean'] +_MASKED_SUPPORTED_METRICS = ['masked_euclidean'] def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): @@ -1323,7 +1323,9 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): Valid values for metric are: - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', - 'manhattan']. These metrics support sparse matrix inputs. + 'manhattan']. These metrics support sparse matrix + inputs. + Also, ['masked_euclidean'] but it does not yet support sparse matrices. - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', @@ -1392,33 +1394,27 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): "callable" % (metric, _VALID_METRICS)) # To handle kill_missing = False - kill_missing = kwds.get("kill_missing") - if not kill_missing and kill_missing is not None: + # kill_missing = kwds.get("kill_missing") + # if not kill_missing and kill_missing is not None: + if metric in _MASKED_SUPPORTED_METRICS: missing_values = kwds.get("missing_values") if kwds.get( "missing_values") is not None else np.nan - if (metric not in _MASKED_SUPPORTED_METRICS): - raise ValueError( - "Metric {0} does not have missing value support ".format( - metric) - ) - if issparse(X) or (Y is not None and issparse(Y)): - raise ValueError( - "Missing value support for sparse matrices not added yet") - # if (kwds.get("missing_values") is None): - # raise ValueError("Missing value is not defined") + # if (metric not in _MASKED_SUPPORTED_METRICS): + # raise 
ValueError( + # "Metric {0} does not have missing value support ".format( + # metric) + # ) if(np.any(_get_mask(X, missing_values).sum(axis=1) == X.shape[1])): raise ValueError( "One or more samples(s) only have missing values.") - # if type(metric) is str: - # metric = "masked_" + metric if metric == "precomputed": X, _ = check_pairwise_arrays(X, Y, precomputed=True) return X - elif kill_missing is False and metric in \ - MASKED_PAIRWISE_DISTANCE_FUNCTIONS: - func = MASKED_PAIRWISE_DISTANCE_FUNCTIONS[metric] + # elif kill_missing is False and metric in \ + # MASKED_PAIRWISE_DISTANCE_FUNCTIONS: + # func = MASKED_PAIRWISE_DISTANCE_FUNCTIONS[metric] elif metric in PAIRWISE_DISTANCE_FUNCTIONS: func = PAIRWISE_DISTANCE_FUNCTIONS[metric] elif callable(metric): diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 8732bf5e6d70a..774bf66957935 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -59,13 +59,12 @@ def test_pairwise_distances(): assert_array_almost_equal(S, S2) # Euclidean dist. (masked) should be equivalent to calling the function. X = rng.random_sample((5, 4)) - S = pairwise_distances(X, metric="euclidean", kill_missing=False) + S = pairwise_distances(X, metric="masked_euclidean") S2 = masked_euclidean_distances(X) assert_array_almost_equal(S, S2) # Euclidean distance, with Y != X. 
Y = rng.random_sample((2, 4)) - S = pairwise_distances(X, Y, metric="euclidean", - kill_missing=False) + S = pairwise_distances(X, Y, metric="masked_euclidean") S2 = masked_euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Test with tuples as X and Y diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 0a1abc006697e..78f319716ec85 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -158,25 +158,29 @@ def _init_params(self, n_neighbors=None, radius=None, self._tree = None self._fit_method = None - def _fit(self, X, kill_missing=True): - if not kill_missing: - if self.metric not in _MASKED_SUPPORTED_METRICS: - raise ValueError( - "Metric {0} is currently not supported for " - "data containing missing values.".format(self.metric) - ) - - _MASKED_SUPPORTED_ALGORITHMS = ["brute"] - if self.algorithm not in _MASKED_SUPPORTED_ALGORITHMS: - if self.algorithm == "auto": - pass - else: - warnings.warn( - "{0} algorithm is currently not supported for " - "data containing missing values. " - "Reverting to a supported algorithm.". - format(self.algorithm)) - self.algorithm = _MASKED_SUPPORTED_ALGORITHMS[0] + def _fit(self, X): + if self.metric in _MASKED_SUPPORTED_METRICS: + kill_missing = False + # if not kill_missing: + # if self.metric not in _MASKED_SUPPORTED_METRICS: + # raise ValueError( + # "Metric {0} is currently not supported for " + # "data containing missing values.".format(self.metric) + # ) + # + # _MASKED_SUPPORTED_ALGORITHMS = ["brute"] + # if self.algorithm not in _MASKED_SUPPORTED_ALGORITHMS: + # if self.algorithm == "auto": + # pass + # else: + # warnings.warn( + # "{0} algorithm is currently not supported for " + # "data containing missing values. " + # "Reverting to a supported algorithm.". 
+ # format(self.algorithm)) + # self.algorithm = _MASKED_SUPPORTED_ALGORITHMS[0] + else: + kill_missing = True if self.metric_params is None: self.effective_metric_params_ = {} @@ -360,8 +364,6 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - # copy=True if missing accepted as they will be replaced by 0 - # copy = True if kill_missing is False else False X = check_array(X, accept_sparse='csr') else: query_is_train = True @@ -497,8 +499,8 @@ class from an array representing our data set and ask who's >>> nan = float("nan") >>> samples = [[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]] >>> from sklearn.neighbors import NearestNeighbors - >>> neigh = NearestNeighbors(n_neighbors=2, metric="euclidean") - >>> neigh.fit(samples, kill_missing=False) # doctest: +ELLIPSIS + >>> neigh = NearestNeighbors(n_neighbors=2, metric="masked_euclidean") + >>> neigh.fit(samples) # doctest: +ELLIPSIS NearestNeighbors(algorithm='auto', leaf_size=30,...) 
>>> N = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) >>> print(N) # doctest: +ELLIPSIS @@ -543,10 +545,9 @@ class from an array representing our data set and ask who's n_jobs = _get_n_jobs(self.n_jobs) if self._fit_method == 'brute': # for efficiency, use squared euclidean distances - if self.effective_metric_ == 'euclidean': - dist = pairwise_distances(X, self._fit_X, 'euclidean', + if self.effective_metric_ == 'masked_euclidean': + dist = pairwise_distances(X, self._fit_X, 'masked_euclidean', n_jobs=n_jobs, squared=True, - kill_missing=False, missing_values=missing_values, copy=copy) else: @@ -565,7 +566,7 @@ class from an array representing our data set and ask who's sample_range, np.argsort(dist[sample_range, neigh_ind])] if return_distance: - if self.effective_metric_ == 'euclidean': + if self.effective_metric_ == 'masked_euclidean': result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind else: result = dist[sample_range, neigh_ind], neigh_ind @@ -991,7 +992,8 @@ def fit(self, X, y): class UnsupervisedMixin(object): - def fit(self, X, y=None, kill_missing=True): + # def fit(self, X, y=None, kill_missing=True): + def fit(self, X, y=None): """Fit the model using X as training data Parameters @@ -1000,4 +1002,5 @@ def fit(self, X, y=None, kill_missing=True): Training data. If array or matrix, shape [n_samples, n_features], or [n_samples, n_samples] if metric='precomputed'. 
""" - return self._fit(X, kill_missing) + # return self._fit(X, kill_missing) + return self._fit(X) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index f772e8b963e14..be457fdcf4b5a 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -102,8 +102,8 @@ def test_masked_unsupervised_kneighbors(): [3., 1., 3., 3.], [2., 3., 1., 9.]], dtype=np.float32) - neigh = neighbors.NearestNeighbors(2, metric="euclidean") - neigh.fit(X, kill_missing=False) + neigh = neighbors.NearestNeighbors(2, metric="masked_euclidean") + neigh.fit(X) X_neigh = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) XY_neigh = neigh.masked_kneighbors(Y, 2, return_distance=False) @@ -128,9 +128,10 @@ def test_masked_unsupervised_kneighbors(): # Test 2 nan = float("nan") samples = [[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]] - neigh = neighbors.NearestNeighbors(n_neighbors=2, metric="euclidean") + neigh = neighbors.NearestNeighbors(n_neighbors=2, + metric="masked_euclidean") - neigh.fit(samples, kill_missing=False) + neigh.fit(samples) X2_neigh = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) XY2_neigh = neigh.masked_kneighbors([[0, nan, 1]], 2, @@ -1079,6 +1080,9 @@ def test_valid_brute_metric_for_auto_algorithm(): nb_p.kneighbors(DYX) for metric in VALID_METRICS_SPARSE['brute']: + # TODO: Remove after adding sparse support for masked_euclidean + if metric == "masked_euclidean": + continue if metric != 'precomputed' and metric not in require_params: nn = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto', metric=metric).fit(Xcsr) From 2482c8ad3a7dd9acbf6c3a4921c4eb9c3807d830 Mon Sep 17 00:00:00 2001 From: harke Date: Sun, 23 Jul 2017 00:38:04 -0500 Subject: [PATCH 12/97] Changed nan-mask from int8 to int32 --- sklearn/metrics/pairwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py 
b/sklearn/metrics/pairwise.py index 05fe4bd3dbb23..a34ddb56d7d9f 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -400,11 +400,11 @@ def masked_euclidean_distances(X, Y=None, squared=False, # Get Y.T mask and anti-mask and set Y.T's missing to zero YT = Y.T mask_YT = _get_mask(YT, missing_values) - NYT = (~mask_YT).astype(np.int8) + NYT = (~mask_YT).astype(np.int32) YT[mask_YT] = 0 # Get X anti-mask and set X's missing to zero - NX = (~mask_X).astype(np.int8) + NX = (~mask_X).astype(np.int32) X[mask_X] = 0 # Calculate distances From 66527cd81f4da7a1c02c7bf98a774d9733fbf76d Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 24 Jul 2017 00:52:24 -0500 Subject: [PATCH 13/97] Addressed review comments (#3) --- sklearn/metrics/pairwise.py | 42 ++--- sklearn/metrics/tests/test_pairwise.py | 53 +++--- sklearn/neighbors/base.py | 201 ++-------------------- sklearn/neighbors/tests/test_neighbors.py | 11 +- 4 files changed, 56 insertions(+), 251 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index a34ddb56d7d9f..ebbdb1ad7e8ae 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -324,13 +324,6 @@ def masked_euclidean_distances(X, Y=None, squared=False, vector in the pair or if there are no common non-missing coordinates then NaN is returned for that pair. - References - ---------- - John K. Dixon, "Pattern Recognition with Partly Missing Data", - IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue: - 10, pp. 617 - 621, Oct. 1979. - http://ieeexplore.ieee.org/abstract/document/4310090/ - Read more in the :ref:`User Guide `. Parameters @@ -367,6 +360,13 @@ def masked_euclidean_distances(X, Y=None, squared=False, array([[ 1. ], [ 1.41421356]]) + References + ---------- + * John K. Dixon, "Pattern Recognition with Partly Missing Data", + IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue: + 10, pp. 617 - 621, Oct. 1979. 
+ http://ieeexplore.ieee.org/abstract/document/4310090/ + See also -------- paired_distances : distances betweens pairs of elements of X and Y. @@ -409,9 +409,9 @@ def masked_euclidean_distances(X, Y=None, squared=False, # Calculate distances - distances = (X.shape[1] / ((np.dot(NX, NYT)))) * \ - (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + - np.dot(NX, (YT * YT))) + distances = (X.shape[1] / (np.dot(NX, NYT))) * \ + (np.dot(X * X, NYT) - 2 * (np.dot(X, YT)) + + np.dot(NX, YT * YT)) if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. @@ -1208,11 +1208,6 @@ def chi2_kernel(X, Y=None, gamma=1.): 'masked_euclidean': masked_euclidean_distances, } -# Helper functions with missing value support - distance -# MASKED_PAIRWISE_DISTANCE_FUNCTIONS = { -# 'euclidean': masked_euclidean_distances, -# } - def distance_metrics(): """Valid metrics for pairwise_distances. @@ -1223,9 +1218,9 @@ def distance_metrics(): The valid distance metrics, and the function they map to, are: - ============ ==================================== + =================== ============================================ metric Function - ============ ==================================== + =================== ============================================ 'cityblock' metrics.pairwise.manhattan_distances 'cosine' metrics.pairwise.cosine_distances 'euclidean' metrics.pairwise.euclidean_distances @@ -1233,7 +1228,7 @@ def distance_metrics(): 'l2' metrics.pairwise.euclidean_distances 'manhattan' metrics.pairwise.manhattan_distances 'masked_euclidean' metrics.pairwise.masked_euclidean_distances - ============ ==================================== + =================== ============================================ Read more in the :ref:`User Guide `. 
@@ -1393,18 +1388,10 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): "Valid metrics are %s, or 'precomputed', or a " "callable" % (metric, _VALID_METRICS)) - # To handle kill_missing = False - # kill_missing = kwds.get("kill_missing") - # if not kill_missing and kill_missing is not None: if metric in _MASKED_SUPPORTED_METRICS: missing_values = kwds.get("missing_values") if kwds.get( "missing_values") is not None else np.nan - # if (metric not in _MASKED_SUPPORTED_METRICS): - # raise ValueError( - # "Metric {0} does not have missing value support ".format( - # metric) - # ) if(np.any(_get_mask(X, missing_values).sum(axis=1) == X.shape[1])): raise ValueError( "One or more samples(s) only have missing values.") @@ -1412,9 +1399,6 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): if metric == "precomputed": X, _ = check_pairwise_arrays(X, Y, precomputed=True) return X - # elif kill_missing is False and metric in \ - # MASKED_PAIRWISE_DISTANCE_FUNCTIONS: - # func = MASKED_PAIRWISE_DISTANCE_FUNCTIONS[metric] elif metric in PAIRWISE_DISTANCE_FUNCTIONS: func = PAIRWISE_DISTANCE_FUNCTIONS[metric] elif callable(metric): diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 774bf66957935..c92c41532fca5 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -58,15 +58,13 @@ def test_pairwise_distances(): S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Euclidean dist. (masked) should be equivalent to calling the function. - X = rng.random_sample((5, 4)) - S = pairwise_distances(X, metric="masked_euclidean") - S2 = masked_euclidean_distances(X) - assert_array_almost_equal(S, S2) - # Euclidean distance, with Y != X. 
- Y = rng.random_sample((2, 4)) - S = pairwise_distances(X, Y, metric="masked_euclidean") - S2 = masked_euclidean_distances(X, Y) - assert_array_almost_equal(S, S2) + X_masked = rng.random_sample((5, 4)) + Y_masked = rng.random_sample((2, 4)) + X_masked[0, 0] = np.nan + Y_masked[0, 0] = np.nan + S_masked = pairwise_distances(X_masked, Y_masked, metric="masked_euclidean") + S2_masked = masked_euclidean_distances(X_masked, Y_masked) + assert_array_almost_equal(S_masked, S2_masked) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) @@ -419,20 +417,6 @@ def test_euclidean_distances(): def test_masked_euclidean_distances(): - # first check that we get right answer with missing values for X - X = np.array([[1., 5., 7., 5., 10.], - [8., 2., 4., np.nan, 8.], - [5., np.nan, 5., np.nan, 1.], - [8., np.nan, np.nan, np.nan, np.nan]]) - D1 = masked_euclidean_distances(X, missing_values="NaN") - - D2 = np.array([[0., 9.42072184, 12.97433364, 15.65247584], - [9.42072184, 0., 9.91631652, 0.], - [12.97433364, 9.91631652, 0., 6.70820393], - [15.65247584, 0., 6.70820393, 0.]]) - - assert_array_almost_equal(D1, D2) - # check with pairs of matrices with missing values X = np.array([[1., np.nan, 3., 4., 2.], [np.nan, 4., 6., 1., np.nan], @@ -442,14 +426,33 @@ def test_masked_euclidean_distances(): [np.nan, np.nan, 5., 4., 7.], [np.nan, np.nan, np.nan, 4., 5.]]) - D3 = np.array([[6.32455532, 6.95221787, 4.74341649], + D1 = np.array([[6.32455532, 6.95221787, 4.74341649], [5., 5., 6.70820393], [2.23606798, 13.41640786, 8.94427191]]) - D4 = masked_euclidean_distances(X, Y, missing_values="NaN") + D2 = masked_euclidean_distances(X, Y, missing_values="NaN") + + assert_array_almost_equal(D1, D2) + + # check when squared = True + D3 = np.array( + [[40., 48.33333331, 22.5], + [25., 25., 45.], + [5., 180., 80.]]) + D4 = masked_euclidean_distances(X, Y, squared=True, missing_values="NaN") 
assert_array_almost_equal(D3, D4) + # Check with explicit formula and square=True + assert_array_almost_equal( + masked_euclidean_distances(X[:1], Y[:1], squared=True), + [[5.0/2.0 * ((7-3)**2 + (2-2)**2)]]) + + # Check when Y = X is explicitly passed + D5 = masked_euclidean_distances(X, missing_values="NaN") + D6 = masked_euclidean_distances(X, X, missing_values="NaN") + assert_array_almost_equal(D5, D6) + def test_cosine_distances(): # Check the pairwise Cosine distances computation diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 78f319716ec85..e173868f19040 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -161,24 +161,6 @@ def _init_params(self, n_neighbors=None, radius=None, def _fit(self, X): if self.metric in _MASKED_SUPPORTED_METRICS: kill_missing = False - # if not kill_missing: - # if self.metric not in _MASKED_SUPPORTED_METRICS: - # raise ValueError( - # "Metric {0} is currently not supported for " - # "data containing missing values.".format(self.metric) - # ) - # - # _MASKED_SUPPORTED_ALGORITHMS = ["brute"] - # if self.algorithm not in _MASKED_SUPPORTED_ALGORITHMS: - # if self.algorithm == "auto": - # pass - # else: - # warnings.warn( - # "{0} algorithm is currently not supported for " - # "data containing missing values. " - # "Reverting to a supported algorithm.". 
- # format(self.algorithm)) - # self.algorithm = _MASKED_SUPPORTED_ALGORITHMS[0] else: kill_missing = True @@ -225,8 +207,6 @@ def _fit(self, X): self._fit_method = 'kd_tree' return self - # # copy=True if missing accepted as they will be replaced by 0 - # copy = True if kill_missing is False else False X = check_array(X, accept_sparse='csr', force_all_finite=kill_missing) @@ -364,7 +344,11 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - X = check_array(X, accept_sparse='csr') + if self.effective_metric_ in _MASKED_SUPPORTED_METRICS: + X = check_array(X, accept_sparse='csr', + force_all_finite=False) + else: + X = check_array(X, accept_sparse='csr') else: query_is_train = True X = self._fit_X @@ -388,6 +372,10 @@ class from an array representing our data set and ask who's if self.effective_metric_ == 'euclidean': dist = pairwise_distances(X, self._fit_X, 'euclidean', n_jobs=n_jobs, squared=True) + elif self.effective_metric_ == 'masked_euclidean': + dist = pairwise_distances(X, self._fit_X, + 'masked_euclidean', + n_jobs=n_jobs, squared=True) else: dist = pairwise_distances( X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, @@ -400,7 +388,8 @@ class from an array representing our data set and ask who's sample_range, np.argsort(dist[sample_range, neigh_ind])] if return_distance: - if self.effective_metric_ == 'euclidean': + if self.effective_metric_ == 'euclidean' or self.\ + effective_metric_ == 'masked_euclidean': result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind else: result = dist[sample_range, neigh_ind], neigh_ind @@ -454,174 +443,6 @@ class from an array representing our data set and ask who's return dist, neigh_ind return neigh_ind - def masked_kneighbors(self, X=None, n_neighbors=None, return_distance=True, - missing_values="NaN", copy=True): - """Finds the K-neighbors of a point, even when they contain NaN values. - - Returns indices of and distances to the neighbors of each point. 
- - Parameters - ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' - The query point or points. - If not provided, neighbors of each indexed point are returned. - In this case, the query point is not considered its own neighbor. - - n_neighbors : int - Number of neighbors to get (default is the value - passed to the constructor). - - return_distance : boolean, optional. Defaults to True. - If False, distances will not be returned - - missing_values : "NaN" or integer, optional. Default is "NaN". - Representation of missing value - - copy : boolean, optional. Default is True. - Create and use a deep copy of X - - Returns - ------- - dist : array - Array representing the lengths to points, only present if - return_distance=True - - ind : array - Indices of the nearest points in the population matrix. - - Examples - -------- - In the following example, we construct a NeighborsClassifier - class from an array representing our data set and ask who's - the closest point to [0, nan, 1], where "nan" represents a - missing value. - >>> nan = float("nan") - >>> samples = [[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]] - >>> from sklearn.neighbors import NearestNeighbors - >>> neigh = NearestNeighbors(n_neighbors=2, metric="masked_euclidean") - >>> neigh.fit(samples) # doctest: +ELLIPSIS - NearestNeighbors(algorithm='auto', leaf_size=30,...) 
- >>> N = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) - >>> print(N) # doctest: +ELLIPSIS - [[3 1] - [3 2] - [3 1] - [2 1]] - - >>> X = [[0, nan, 1]] - >>> N2 = neigh.masked_kneighbors(X, 2, return_distance=False) - >>> print(N2) # doctest: +ELLIPSIS - [[1 3]] - - """ - if self._fit_method is None: - raise NotFittedError("Must fit neighbors before querying.") - - if n_neighbors is None: - n_neighbors = self.n_neighbors - - if X is not None: - query_is_train = False - X = check_array(X, accept_sparse='csr', - force_all_finite=False, copy=copy) - else: - query_is_train = True - X = self._fit_X - # Include an extra neighbor to account for the sample itself being - # returned, which is removed later - n_neighbors += 1 - - train_size = self._fit_X.shape[0] - if n_neighbors > train_size: - raise ValueError( - "Expected n_neighbors <= n_samples, " - " but n_samples = %d, n_neighbors = %d" % - (train_size, n_neighbors) - ) - n_samples, _ = X.shape - sample_range = np.arange(n_samples)[:, None] - - n_jobs = _get_n_jobs(self.n_jobs) - if self._fit_method == 'brute': - # for efficiency, use squared euclidean distances - if self.effective_metric_ == 'masked_euclidean': - dist = pairwise_distances(X, self._fit_X, 'masked_euclidean', - n_jobs=n_jobs, squared=True, - missing_values=missing_values, - copy=copy) - else: - # dist = pairwise_distances( - # X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, - # **self.effective_metric_params_) - raise ValueError( - "Only the following metrics are currently supported for " - "data with missing values:{0}". 
- format(_MASKED_SUPPORTED_METRICS) - ) - neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1) - neigh_ind = neigh_ind[:, :n_neighbors] - # argpartition doesn't guarantee sorted order, so we sort again - neigh_ind = neigh_ind[ - sample_range, np.argsort(dist[sample_range, neigh_ind])] - - if return_distance: - if self.effective_metric_ == 'masked_euclidean': - result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind - else: - result = dist[sample_range, neigh_ind], neigh_ind - else: - result = neigh_ind - - # elif self._fit_method in ['ball_tree', 'kd_tree']: - # if issparse(X): - # raise ValueError( - # "%s does not work with sparse matrices." - # "Densify the data, " - # "or set algorithm='brute'" % self._fit_method) - # result = Parallel(n_jobs, backend='threading')( - # delayed(self._tree.query, check_pickle=False)( - # X[s], n_neighbors, return_distance) - # for s in gen_even_slices(X.shape[0], n_jobs) - # ) - # if return_distance: - # dist, neigh_ind = tuple(zip(*result)) - # result = np.vstack(dist), np.vstack(neigh_ind) - # else: - # result = np.vstack(result) - else: - raise ValueError("internal: _fit_method not recognized for data " - "containing missing") - - if not query_is_train: - return result - else: - # If the query data is the same as the indexed data, we would like - # to ignore the first nearest neighbor of every sample, i.e - # the sample itself. - if return_distance: - dist, neigh_ind = result - else: - neigh_ind = result - - sample_mask = neigh_ind != sample_range - - # Corner case: When the number of duplicates are more - # than the number of neighbors, the first NN will not - # be the sample, but a duplicate. - # In that case mask the first duplicate. 
- dup_gr_nbrs = np.all(sample_mask, axis=1) - sample_mask[:, 0][dup_gr_nbrs] = False - - neigh_ind = np.reshape( - neigh_ind[sample_mask], (n_samples, n_neighbors - 1)) - - if return_distance: - dist = np.reshape( - dist[sample_mask], (n_samples, n_neighbors - 1)) - return dist, neigh_ind - return neigh_ind - def kneighbors_graph(self, X=None, n_neighbors=None, mode='connectivity'): """Computes the (weighted) graph of k-Neighbors for points in X diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index be457fdcf4b5a..b9390883f9903 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -104,9 +104,8 @@ def test_masked_unsupervised_kneighbors(): neigh = neighbors.NearestNeighbors(2, metric="masked_euclidean") neigh.fit(X) - X_neigh = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) - XY_neigh = neigh.masked_kneighbors(Y, 2, return_distance=False) - + X_neigh = neigh.kneighbors(n_neighbors=2, return_distance=False) + XY_neigh = neigh.kneighbors(Y, 2, return_distance=False) # Expected outcome N1 = np.array( [[1, 4], @@ -130,12 +129,10 @@ def test_masked_unsupervised_kneighbors(): samples = [[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]] neigh = neighbors.NearestNeighbors(n_neighbors=2, metric="masked_euclidean") - neigh.fit(samples) - X2_neigh = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) - XY2_neigh = neigh.masked_kneighbors([[0, nan, 1]], 2, - return_distance=False) + X2_neigh = neigh.kneighbors(n_neighbors=2, return_distance=False) + XY2_neigh = neigh.kneighbors([[0, nan, 1]], 2, return_distance=False) # Expected outcome N3 = np.array( From a968b1e27af4b71990d8a7e117f4f56b67dd7833 Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 24 Jul 2017 00:57:34 -0500 Subject: [PATCH 14/97] Pep8 fix --- sklearn/metrics/tests/test_pairwise.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise.py 
b/sklearn/metrics/tests/test_pairwise.py index c92c41532fca5..3247862726a01 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -62,7 +62,8 @@ def test_pairwise_distances(): Y_masked = rng.random_sample((2, 4)) X_masked[0, 0] = np.nan Y_masked[0, 0] = np.nan - S_masked = pairwise_distances(X_masked, Y_masked, metric="masked_euclidean") + S_masked = pairwise_distances(X_masked, Y_masked, + metric="masked_euclidean") S2_masked = masked_euclidean_distances(X_masked, Y_masked) assert_array_almost_equal(S_masked, S2_masked) # Test with tuples as X and Y From 356c8e86278f03d02bbe3288520a25c55f6c9762 Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 24 Jul 2017 02:35:10 -0500 Subject: [PATCH 15/97] Comment edit on test_pairwise --- sklearn/metrics/tests/test_pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 3247862726a01..ee0020ee4e1d0 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -57,7 +57,7 @@ def test_pairwise_distances(): S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) - # Euclidean dist. (masked) should be equivalent to calling the function. + # Check to ensure NaNs work with pairwise_distances. 
X_masked = rng.random_sample((5, 4)) Y_masked = rng.random_sample((2, 4)) X_masked[0, 0] = np.nan From d6aeaf31f8524dde4bd82a18c4e42ea79d96f1cf Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 25 Jul 2017 01:15:46 -0500 Subject: [PATCH 16/97] Addressed review comments #4 --- sklearn/metrics/pairwise.py | 40 ++++++++++++------------ sklearn/metrics/tests/test_pairwise.py | 11 +++---- sklearn/neighbors/base.py | 43 +++++++++++--------------- 3 files changed, 43 insertions(+), 51 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index ebbdb1ad7e8ae..58dd5cde301e0 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -197,32 +197,44 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, """ Considering the rows of X (and Y=X) as vectors, compute the distance matrix between each pair of vectors. + For efficiency reasons, the euclidean distance between a pair of row vector x and y is computed as:: + dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) + This formulation has two advantages over other ways of computing distances. First, it is computationally efficient when dealing with sparse data. Second, if one argument varies but the other remains unchanged, then `dot(x, x)` and/or `dot(y, y)` can be pre-computed. + However, this is not the most precise way of doing this computation, and the distance matrix returned by this function may not be exactly symmetric as required by, e.g., ``scipy.spatial.distance`` functions. + Read more in the :ref:`User Guide `. + Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples_1, n_features) + Y : {array-like, sparse matrix}, shape (n_samples_2, n_features) + Y_norm_squared : array-like, shape (n_samples_2, ), optional Pre-computed dot-products of vectors in Y (e.g., ``(Y**2).sum(axis=1)``) + squared : boolean, optional Return squared Euclidean distances. 
+ X_norm_squared : array-like, shape = [n_samples_1], optional Pre-computed dot-products of vectors in X (e.g., ``(X**2).sum(axis=1)``) + Returns ------- distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2) + Examples -------- >>> from sklearn.metrics.pairwise import euclidean_distances @@ -277,7 +289,6 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, return distances if squared else np.sqrt(distances, out=distances) -# Pairwise distances in the presence of missing values def masked_euclidean_distances(X, Y=None, squared=False, missing_values="NaN", copy=True): """Calculates euclidean distances in the presence of missing values @@ -302,27 +313,18 @@ def masked_euclidean_distances(X, Y=None, squared=False, Formula in matrix form derived by: Shreya Bhattarai - This formulation zero-weights feature coordinates with missing value in - either vector in the pair and up-weights the remaining coordinates. - For instance, say we have two sample points (x1, y1) and (x2, NaN): + When calculating the distance between a pair of samples, this formulation + essentially zero-weights feature coordinates with a missing value in either + sample and scales up the weight of the remaining coordinates: - To calculate the euclidean distance between these, first the square - "distance" is calculated based only on the first feature coordinate - as the second coordinate is missing in one of the samples, - i.e., we have (x2-x1)**2. This squared distance is scaled-up by the ratio - of total number of coordinates to the number of available coordinates, - which in this case is 2/1 = 2. Now, we are left with 2*((x2-x1)**2). - Finally, if squared=False then the square root of this is evaluated - and returned otherwise the value is returned as is. - - Breakdown of euclidean distance calculation between a vector pair x,y: - - weight = Total # of coordinates / # of non-missing coordinates dist(x,y) = sqrt(weight * sq. 
distance from non-missing coordinates) + where, + weight = Total # of coordinates / # of non-missing coordinates - This formulation implies that if all coordinates are missing in either - vector in the pair or if there are no common non-missing coordinates then - NaN is returned for that pair. + For instance, the distance between sample points (x1, y1) and (x2, NaN) + would result in sqrt(2*((x2-x1)**2). Note that if all the coordinates are + missing or if there are no common non-missing coordinates then NaN is + returned for that pair. Read more in the :ref:`User Guide `. diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index ee0020ee4e1d0..812d35eace9cd 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -427,13 +427,10 @@ def test_masked_euclidean_distances(): [np.nan, np.nan, 5., 4., 7.], [np.nan, np.nan, np.nan, 4., 5.]]) - D1 = np.array([[6.32455532, 6.95221787, 4.74341649], - [5., 5., 6.70820393], - [2.23606798, 13.41640786, 8.94427191]]) + D1 = masked_euclidean_distances(X, Y, missing_values="NaN") + D2 = masked_euclidean_distances(X, Y, squared=True, missing_values="NaN") - D2 = masked_euclidean_distances(X, Y, missing_values="NaN") - - assert_array_almost_equal(D1, D2) + assert_array_almost_equal(D1**2, D2) # check when squared = True D3 = np.array( @@ -444,7 +441,7 @@ def test_masked_euclidean_distances(): assert_array_almost_equal(D3, D4) - # Check with explicit formula and square=True + # Check with explicit formula and squared=True assert_array_almost_equal( masked_euclidean_distances(X[:1], Y[:1], squared=True), [[5.0/2.0 * ((7-3)**2 + (2-2)**2)]]) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index e173868f19040..a6ffbff630c2a 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -159,10 +159,8 @@ def _init_params(self, n_neighbors=None, radius=None, self._fit_method = None def _fit(self, X): - if self.metric in 
_MASKED_SUPPORTED_METRICS: - kill_missing = False - else: - kill_missing = True + allow_nans = True if self.\ + metric in _MASKED_SUPPORTED_METRICS else False if self.metric_params is None: self.effective_metric_params_ = {} @@ -208,29 +206,28 @@ def _fit(self, X): return self X = check_array(X, accept_sparse='csr', - force_all_finite=kill_missing) + force_all_finite=not allow_nans) n_samples = X.shape[0] if n_samples == 0: raise ValueError("n_samples must be greater than 0") if issparse(X): - if not kill_missing: + if allow_nans: raise ValueError( "Nearest neighbor algorithm does not currently support" - "the use of sparse matrices." + "the use of sparse matrices for missing values." ) - else: - if self.algorithm not in ('auto', 'brute'): - warnings.warn("cannot use tree with sparse input: " - "using brute force") - if self.effective_metric_ not in VALID_METRICS_SPARSE['brute']: - raise ValueError("metric '%s' not valid for sparse input" - % self.effective_metric_) - self._fit_X = X.copy() - self._tree = None - self._fit_method = 'brute' - return self + if self.algorithm not in ('auto', 'brute'): + warnings.warn("cannot use tree with sparse input: " + "using brute force") + if self.effective_metric_ not in VALID_METRICS_SPARSE['brute']: + raise ValueError("metric '%s' not valid for sparse input" + % self.effective_metric_) + self._fit_X = X.copy() + self._tree = None + self._fit_method = 'brute' + return self self._fit_method = self.algorithm self._fit_X = X @@ -369,12 +366,10 @@ class from an array representing our data set and ask who's n_jobs = _get_n_jobs(self.n_jobs) if self._fit_method == 'brute': # for efficiency, use squared euclidean distances - if self.effective_metric_ == 'euclidean': - dist = pairwise_distances(X, self._fit_X, 'euclidean', - n_jobs=n_jobs, squared=True) - elif self.effective_metric_ == 'masked_euclidean': + if self.effective_metric_ == 'euclidean' or self.\ + effective_metric_ == 'masked_euclidean': dist = pairwise_distances(X, 
self._fit_X, - 'masked_euclidean', + self.effective_metric_, n_jobs=n_jobs, squared=True) else: dist = pairwise_distances( @@ -813,7 +808,6 @@ def fit(self, X, y): class UnsupervisedMixin(object): - # def fit(self, X, y=None, kill_missing=True): def fit(self, X, y=None): """Fit the model using X as training data @@ -823,5 +817,4 @@ def fit(self, X, y=None): Training data. If array or matrix, shape [n_samples, n_features], or [n_samples, n_samples] if metric='precomputed'. """ - # return self._fit(X, kill_missing) return self._fit(X) From e8ccdee096add356c55e6d20986a2f625bfbc57f Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 25 Jul 2017 02:37:42 -0500 Subject: [PATCH 17/97] replaced or with in --- sklearn/neighbors/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index a6ffbff630c2a..00f3537f81354 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -366,8 +366,7 @@ class from an array representing our data set and ask who's n_jobs = _get_n_jobs(self.n_jobs) if self._fit_method == 'brute': # for efficiency, use squared euclidean distances - if self.effective_metric_ == 'euclidean' or self.\ - effective_metric_ == 'masked_euclidean': + if self.effective_metric_ in ['euclidean', 'masked_euclidean']: dist = pairwise_distances(X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, squared=True) From 4a8309b6605fdf19bc72303785f15e6d04699146 Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 25 Jul 2017 02:46:05 -0500 Subject: [PATCH 18/97] Changed allow_nans assignment --- sklearn/neighbors/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 00f3537f81354..3e77b51aad740 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -159,8 +159,7 @@ def _init_params(self, n_neighbors=None, radius=None, self._fit_method = None def _fit(self, X): - allow_nans = True if self.\ - 
metric in _MASKED_SUPPORTED_METRICS else False + allow_nans = self.metric in _MASKED_SUPPORTED_METRICS if self.metric_params is None: self.effective_metric_params_ = {} From 5cbc156a4663621f9555f150a0f1d1c50bf2b988 Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 25 Jul 2017 02:50:11 -0500 Subject: [PATCH 19/97] One more or to in --- sklearn/neighbors/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 3e77b51aad740..66dbc87fe39f9 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -381,8 +381,7 @@ class from an array representing our data set and ask who's sample_range, np.argsort(dist[sample_range, neigh_ind])] if return_distance: - if self.effective_metric_ == 'euclidean' or self.\ - effective_metric_ == 'masked_euclidean': + if self.effective_metric_ in ['euclidean', 'masked_euclidean']: result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind else: result = dist[sample_range, neigh_ind], neigh_ind From a31c43a595c9a594f072930cc0b0b672983e689e Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 31 Jul 2017 00:54:13 -0500 Subject: [PATCH 20/97] Addressed review comments #5 --- sklearn/metrics/pairwise.py | 61 +++++++++----------------- sklearn/metrics/tests/test_pairwise.py | 31 +++++++++++++ 2 files changed, 51 insertions(+), 41 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 58dd5cde301e0..c00b6847e379d 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -102,11 +102,6 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, to be any format. False means that a sparse matrix input will raise an error. - .. deprecated:: 0.19 - Passing 'None' to parameter ``accept_sparse`` in methods is - deprecated in version 0.19 "and will be removed in 0.21. Use - ``accept_sparse=False`` instead. 
- force_all_finite : bool Whether to raise an error on np.inf and np.nan in X (or Y if it exists) @@ -293,25 +288,9 @@ def masked_euclidean_distances(X, Y=None, squared=False, missing_values="NaN", copy=True): """Calculates euclidean distances in the presence of missing values - Considering the rows of X (and Y=X) as vectors, compute the - distance matrix between each pair of vectors. Similarly, if - Y is not X, then compute the distance matrix between each - pair of vectors (i.e., each row pair) in X and Y. - - This function computes pairwise euclidean distance for vectors - in dense matrices X and Y with missing values in arbitrary - coordinates. - - The following formula is used for this: - - dist(X, Y) = (X.shape[1] / ((dot(NX, NYT)))) * - (dot((X * X), NYT) - 2 * (dot(X, Y.T)) + - dot(NX, (Y.T * Y.T))) - - where NX and NYT represent the logical-not of the missing masks of - X and Y.T, respectively. - Formula in matrix form derived by: - Shreya Bhattarai + Considering the rows of X (and Y=X) as samples, compute the distance matrix + between each pair of samples. Similarly, if Y is not X, then compute the + distance matrix between each sample pair (i.e., each row pair) in X and Y. When calculating the distance between a pair of samples, this formulation essentially zero-weights feature coordinates with a missing value in either @@ -321,10 +300,8 @@ def masked_euclidean_distances(X, Y=None, squared=False, where, weight = Total # of coordinates / # of non-missing coordinates - For instance, the distance between sample points (x1, y1) and (x2, NaN) - would result in sqrt(2*((x2-x1)**2). Note that if all the coordinates are - missing or if there are no common non-missing coordinates then NaN is - returned for that pair. + Note that if all the coordinates are missing or if there are no common + non-missing coordinates then NaN is returned for that pair. Read more in the :ref:`User Guide `. 
@@ -377,31 +354,31 @@ def masked_euclidean_distances(X, Y=None, squared=False, # NOTE: force_all_finite=False allows not only NaN but also +/- inf X, Y = check_pairwise_arrays(X, Y, accept_sparse=False, force_all_finite=False, copy=copy) - if (np.any(np.isinf(X.data)) or - (Y is not None and np.any(np.isinf(Y.data)))): + if (np.any(np.isinf(X)) or + (Y is not X and np.any(np.isinf(Y)))): raise ValueError( "+/- Infinite values are not allowed.") + # Get missing mask for X and Y.T + mask_X = _get_mask(X, missing_values) + + YT = Y.T + mask_YT = _get_mask(YT, missing_values) + # Check if any rows have only missing value - if np.any(_get_mask(X, missing_values).sum(axis=1) == X.shape[1])\ - or (Y is not None and np.any(_get_mask(Y, missing_values).sum( - axis=1) == Y.shape[1])): + if np.any(mask_X.sum(axis=1) == X.shape[1])\ + or (Y is not X and np.any(mask_YT.sum(axis=0) == Y.shape[1])): raise ValueError("One or more rows only contain missing values.") # else: if missing_values != "NaN" and \ - (np.any(_get_mask(X.data, "NaN")) or - np.any(_get_mask(Y.data, "NaN"))): + (np.any(np.isnan(X)) or + (Y is not X and np.any(np.isnan(Y)))): raise ValueError( "NaN values present but missing_value = {0}".format( missing_values)) - # Get missing mask for X - mask_X = _get_mask(X, missing_values) - - # Get Y.T mask and anti-mask and set Y.T's missing to zero - YT = Y.T - mask_YT = _get_mask(YT, missing_values) + # Get anti-mask and set Y.T's missing to zero NYT = (~mask_YT).astype(np.int32) YT[mask_YT] = 0 @@ -410,6 +387,8 @@ def masked_euclidean_distances(X, Y=None, squared=False, X[mask_X] = 0 # Calculate distances + # The following formula was derived in matrix form by: + # Shreya Bhattarai distances = (X.shape[1] / (np.dot(NX, NYT))) * \ (np.dot(X * X, NYT) - 2 * (np.dot(X, YT)) + diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 812d35eace9cd..6312aa7a2590b 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ 
b/sklearn/metrics/tests/test_pairwise.py @@ -451,6 +451,37 @@ def test_masked_euclidean_distances(): D6 = masked_euclidean_distances(X, X, missing_values="NaN") assert_array_almost_equal(D5, D6) + # Check with missing_value = 1 while NaN is present + assert_raises(ValueError, masked_euclidean_distances, X, Y, + missing_values=1) + # Check with inf present + X_inf = np.array([ + [np.inf, np.nan, 3., 4., 2.], + [np.nan, 4., 6., 1., np.nan], + [3., np.nan, np.nan, np.nan, 1.]]) + + assert_raises(ValueError, masked_euclidean_distances, X_inf, Y) + + # Check with a row containing all NaNs + X_nan_row = np.array([ + [1., np.nan, 3., 4., 2.], + [np.nan, 4., 6., 1., np.nan], + [np.nan, np.nan, np.nan, np.nan, np.nan]]) + + Y_nan_row = np.array([ + [np.nan, 7., 7., np.nan, 2.], + [np.nan, np.nan, 5., 4., 7.], + [np.nan, np.nan, np.nan, np.nan, np.nan]]) + + assert_raises(ValueError, masked_euclidean_distances, X_nan_row, Y) + assert_raises(ValueError, masked_euclidean_distances, X, Y_nan_row) + + # Check copy = True against copy = False + # Note: This test will alter X and Y + D7 = masked_euclidean_distances(X, Y, copy=True) + D8 = masked_euclidean_distances(X, Y, copy=False) + assert_array_almost_equal(D7, D8) + def test_cosine_distances(): # Check the pairwise Cosine distances computation From eacb19d5740ea435b22182f14a67f6b91383808a Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 31 Jul 2017 01:30:10 -0500 Subject: [PATCH 21/97] Edited comments --- sklearn/metrics/tests/test_pairwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 6312aa7a2590b..30668d01bd418 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -418,7 +418,7 @@ def test_euclidean_distances(): def test_masked_euclidean_distances(): - # check with pairs of matrices with missing values + # Check with pairs of matrices with missing values X = 
np.array([[1., np.nan, 3., 4., 2.], [np.nan, 4., 6., 1., np.nan], [3., np.nan, np.nan, np.nan, 1.]]) @@ -432,7 +432,7 @@ def test_masked_euclidean_distances(): assert_array_almost_equal(D1**2, D2) - # check when squared = True + # Check when squared = True D3 = np.array( [[40., 48.33333331, 22.5], [25., 25., 45.], From cfb7c97dfe157c11504abf078fc92d5686286ea3 Mon Sep 17 00:00:00 2001 From: harke Date: Wed, 2 Aug 2017 22:58:11 -0500 Subject: [PATCH 22/97] KNN Imputation with masked_euclidean and sklearn.neighbors --- sklearn/preprocessing/imputation.py | 243 +++--------------------- sklearn/preprocessing/knn_imputation.py | 222 ++++++++++++++++++++++ 2 files changed, 245 insertions(+), 220 deletions(-) create mode 100644 sklearn/preprocessing/knn_imputation.py diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index db4364fa12731..12d5425fbf604 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -59,95 +59,6 @@ def _most_frequent(array, extra_value, n_repeat): else: return extra_value -def _get_knn(X, value_to_mask="NaN", n_neighbors=10, Y=None): - """Returns the k(=n_neighbors) nearest neighbors of vectors in a - given matrix in euclidean space. If two matrices are passed, - then k-Nearest Neighbors of vectors in X in the matrix Y is returned.""" - - # Setup missing mask - mask_X = _get_mask(X, value_to_mask) - - if Y is None: - # Setup the anti-mask and change missing to zero - mask_X = _get_mask(X, value_to_mask) - XT = np.transpose(X) - N = (~mask_X) * 1 - NT = np.transpose(N) - X[mask_X] = 0 - - # Matrix formula to calculate pair-wise distance between all vectors in a - # matrix with missing values. It zero-weights coordinates with missing value - # in either vector in the pair and up-weights the remaining coordinates. 
- # Matrix formula derived by: Shreya Bhattarai - - """ - Store np.dot(N, (XT * XT)) and add its transpose rather than - redoing a matrix product - dist = np.sqrt((X.shape[1] * 1 / ((np.dot(N, NT)))) * ( - np.dot(N, (XT * XT)) - 2 * (np.dot(X, XT)) + - np.dot((X * X), NT))) - - N_dot_XT2 = np.dot(N, (XT * XT)) - N_dot_XT2_T = np.transpose(N_dot_XT2) - """ - - N_dot_XT2 = np.dot(N, (XT * XT)) - N_dot_XT2_T = np.transpose(N_dot_XT2) - - dist = np.sqrt((X.shape[1] * 1 / ((np.dot(N, NT)))) * ( - N_dot_XT2 - 2 * (np.dot(X, XT)) + - N_dot_XT2_T)) - - # Set distance with self to np.inf - np.fill_diagonal(dist, np.inf) - - else: - # ValueError if X and Y have incompatible dimensions - if X.shape[1] != Y.shape[1]: - raise ValueError("The search dimension of the matrices " - "are not equal: [{0}] versus [{1}]". - format(X.shape[1], Y.shape[1])) - - mask_Y = _get_mask(Y, value_to_mask) - NY = (~mask_Y) * 1 - YT = np.transpose(Y) - mask_YT = _get_mask(YT, value_to_mask) - NYT = np.transpose(NY) - YT[mask_YT] = 0 - - NX = (~mask_X) * 1 - X[mask_X] = 0 - - # Matrix formula to calculate pair-wise distance between all vectors in a - # matrix X to vectors in matrix Y. It handles missing values the same way - # as for a single matrix. - # Matrix formula derived by: Shreya Bhattarai - - dist = np.sqrt((X.shape[1] * 1 / ((np.dot(NX, NYT)))) * - (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + np.dot(NX, (YT * YT)))) - - # Ensure enough candidate neighbors are available - n_candidates = dist.shape[1] if Y is not None else dist.shape[1] - 1 - if n_candidates < n_neighbors: - raise ValueError("There are only %d candidate neighbors, " - "but n_neighbors=%d." 
- % (dist.shape[1] - 1, n_neighbors)) - - # Missing locations and counts - row_missing_sum_X = mask_X.sum(axis=1) - # is_row_missing_X = np.any(mask_X, axis=1) - # is_col_missing_X = np.any(mask_X, axis=0) - col_missing_index_X = np.where(mask_X)[1] - - # Arg-partition (quasi-argsort) of n_neighbors and retrieve them - nbors_index = np.argpartition(dist, n_neighbors - 1, axis=1) - knn_row_index = nbors_index[:, :n_neighbors] - knn_row_index = np.vsplit(knn_row_index, knn_row_index.shape[0]) - knn_row_index = np.repeat(knn_row_index, row_missing_sum_X, axis=0) - knn_row_index = knn_row_index.ravel() - # This assumes columns in X and Y are in the same order; maybe change this? - knn_col_index = np.repeat(col_missing_index_X, n_neighbors) - return knn_row_index, knn_col_index class Imputer(BaseEstimator, TransformerMixin): """Imputation transformer for completing missing values. @@ -204,16 +115,12 @@ class Imputer(BaseEstimator, TransformerMixin): contain missing values). """ def __init__(self, missing_values="NaN", strategy="mean", - axis=0, verbose=0, copy=True, n_neighbors=10, - row_max_missing=0.5, col_max_missing=0.8): + axis=0, verbose=0, copy=True): self.missing_values = missing_values self.strategy = strategy self.axis = axis self.verbose = verbose self.copy = copy - self.n_neighbors = n_neighbors - self.row_max_missing = row_max_missing - self.col_max_missing = col_max_missing def fit(self, X, y=None): """Fit the imputer on X. @@ -230,7 +137,7 @@ def fit(self, X, y=None): Returns self. 
""" # Check parameters - allowed_strategies = ["mean", "median", "most_frequent", "knn"] + allowed_strategies = ["mean", "median", "most_frequent"] if self.strategy not in allowed_strategies: raise ValueError("Can only use these strategies: {0} " " got strategy={1}".format(allowed_strategies, @@ -340,11 +247,6 @@ def _sparse_fit(self, X, strategy, missing_values, axis): return most_frequent - # KNN - elif strategy == "knn": - raise ValueError("strategy='knn' does not support sparse " - "matrix input yet.") - def _dense_fit(self, X, strategy, missing_values, axis): """Fit the transformer on dense data.""" X = check_array(X, force_all_finite=False) @@ -396,80 +298,6 @@ def _dense_fit(self, X, strategy, missing_values, axis): return most_frequent - # KNN - elif strategy == "knn": - if self.copy: - X = np.copy(X) - - if axis == 1: - X = X.transpose() - mask = mask.transpose() - - #Get dimensions and missing count - n_rows, n_cols = X.shape - row_missing_sum = mask.sum(axis=1) - - #ValueError if % missing in any column > self.col_max_missing - if np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)): - raise ValueError("The following axis position(s) have, " - "more than {0}% missing values: {1}" - .format(self.col_max_missing*100,np.where(mask.sum(axis=0) > - (X.shape[0] * self.col_max_missing)))) - - if X.shape[0] < self.n_neighbors: - raise ValueError("There are only %d samples, " - "but n_neighbors=%d." - % (X.shape[0], self.n_neighbors)) - - #Fit to data - - # Check for excessive missingness in rows - bad_rows = row_missing_sum > (mask.shape[1] * self.row_max_missing) - X_bad = X[bad_rows, :] - - if np.any(bad_rows): - X = X[~bad_rows, :] - mask = _get_mask(X, missing_values) - - #Get the k nearest neighbors and impute - if hasattr(self, 'statistics_'): - Y = self.statistics_.data - knnrows_index, knncols_index = _get_knn(X, - n_neighbors=self.n_neighbors, Y=Y) - X[mask] = np.nan - imputed = np.nanmean((Y[(knnrows_index, knncols_index)]). 
- reshape((-1, self.n_neighbors)), axis=1) - else: - knnrows_index, knncols_index = _get_knn(X, - n_neighbors=self.n_neighbors) - X[mask] = np.nan - imputed = np.nanmean((X[(knnrows_index, knncols_index)]). - reshape((-1, self.n_neighbors)), axis=1) - X[mask] = imputed - - #Merge bad rows to X and mean impute any leftover missing - if np.any(bad_rows): - X_merged = np.empty((n_rows, n_cols)) - X_merged[bad_rows, :] = X_bad - X_merged[~bad_rows, :] = X - X = X_merged - - #Impute bad_rows and leftover missing with column means - mask_after_knn = _get_mask(X, self.missing_values) - if np.any(mask_after_knn): - missing_index = np.where(mask_after_knn) - X_col_means = masked_X.mean(axis=0).data - X[missing_index] = np.take(X_col_means, missing_index[1]) - - # Transpose back - if axis == 1: - X = X.transpose() - - #The mask is used to compare this imputed matrix with - #input matrix in transform(), so return X as a masked array. - X = np.ma.array(X,mask=masked_X.mask) - return X - def transform(self, X): """Impute all missing values in X. 
@@ -483,8 +311,7 @@ def transform(self, X): X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, force_all_finite=False, copy=self.copy) statistics = self.statistics_ - #> Added knn exception below - if self.strategy != "knn" and X.shape[1] != statistics.shape[0]: + if X.shape[1] != statistics.shape[0]: raise ValueError("X has %d features per sample, expected %d" % (X.shape[1], self.statistics_.shape[0])) @@ -507,22 +334,21 @@ def transform(self, X): self.missing_values, self.axis) - if self.strategy != "knn": - # Delete the invalid rows/columns - invalid_mask = np.isnan(statistics) - valid_mask = np.logical_not(invalid_mask) - valid_statistics = statistics[valid_mask] - valid_statistics_indexes = np.where(valid_mask)[0] - missing = np.arange(X.shape[not self.axis])[invalid_mask] - - if self.axis == 0 and invalid_mask.any(): - if self.verbose: - warnings.warn("Deleting features without " - "observed values: %s" % missing) - X = X[:, valid_statistics_indexes] - elif self.axis == 1 and invalid_mask.any(): - raise ValueError("Some rows only contain " - "missing values: %s" % missing) + # Delete the invalid rows/columns + invalid_mask = np.isnan(statistics) + valid_mask = np.logical_not(invalid_mask) + valid_statistics = statistics[valid_mask] + valid_statistics_indexes = np.where(valid_mask)[0] + missing = np.arange(X.shape[not self.axis])[invalid_mask] + + if self.axis == 0 and invalid_mask.any(): + if self.verbose: + warnings.warn("Deleting features without " + "observed values: %s" % missing) + X = X[:, valid_statistics_indexes] + elif self.axis == 1 and invalid_mask.any(): + raise ValueError("Some rows only contain " + "missing values: %s" % missing) # Do actual imputation if sparse.issparse(X) and self.missing_values != 0: @@ -538,36 +364,13 @@ def transform(self, X): mask = _get_mask(X, self.missing_values) n_missing = np.sum(mask, axis=self.axis) + values = np.repeat(valid_statistics, n_missing) - if self.strategy == 'knn': - if self.axis == 1: - X = 
X.transpose() - mask = mask.transpose() - statistics = statistics.transpose() - - #Check if the masks and the unmasked values are equal - mask_fitted = statistics.mask - masked_X = np.ma.array(X, mask=mask) - if np.array_equal(mask, mask_fitted)\ - and np.ma.allequal(masked_X, statistics): - X = statistics.data - else: - X = self._dense_fit(X, - self.strategy, - self.missing_values, - self.axis).data - - if self.axis == 1: - X = X.transpose() - + if self.axis == 0: + coordinates = np.where(mask.transpose())[::-1] else: - values = np.repeat(valid_statistics, n_missing) - - if self.axis == 0: - coordinates = np.where(mask.transpose())[::-1] - else: - coordinates = mask + coordinates = mask - X[coordinates] = values + X[coordinates] = values return X diff --git a/sklearn/preprocessing/knn_imputation.py b/sklearn/preprocessing/knn_imputation.py new file mode 100644 index 0000000000000..3887169d37ad4 --- /dev/null +++ b/sklearn/preprocessing/knn_imputation.py @@ -0,0 +1,222 @@ +# Authors: Ashim Bhattarai +# License: BSD 3 clause + +from __future__ import division +import warnings +import numpy as np + +from ..base import BaseEstimator, TransformerMixin +from ..utils import check_array +from ..utils.validation import check_is_fitted +from ..utils.validation import FLOAT_DTYPES +from ..neighbors import NearestNeighbors + +__all__ = [ + 'KNNImputer', +] + + +def _get_mask(X, value_to_mask): + """Compute the boolean mask X == missing_values.""" + if value_to_mask == "NaN" or np.isnan(value_to_mask): + return np.isnan(X) + else: + return X == value_to_mask + + +class KNNImputer(BaseEstimator, TransformerMixin): + """Imputation transformer for completing missing values. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + missing_values : integer or "NaN", optional (default="NaN") + The placeholder for the missing values. All occurrences of + `missing_values` will be imputed. For missing values encoded as np.nan, + use the string value "NaN". 
+ + n_neighbors : int, optional (default = 10) + Number of neighbors to get. + + weighted : bool, optional (default = True) + Set the imputed value as a distance-weighted score of the neighbors + + metric : string or callable, optional (default = 'masked_euclidean') + metric to use for distance computation. + + row_max_missing: float, optional (default = 0.5) + The maximum percentage of columns (i.e. features) that can be missing + before the sample is excluded from nearest neighbor imputation. It + means that such rows will not be considered a potential donor in fit() + and in transform() their missing feature values will be imputed to be + the column mean for the entire dataset. + + col_max_missing: float, optional (default = 0.8) + The maximum percentage of rows (or samples) that can be missing + for a given feature beyond which an error is raised. + + copy : boolean, optional (default=True) + If True, a copy of X will be created. If False, imputation will + be done in-place whenever possible. Note that, if metric is + "masked_euclidean" and copy=False then missing_values in the + input matrix X will be overwritten with zeros. + + Attributes + ---------- + statistics_ : array of shape (n_features,) + A tuple whose first element if the fitted NearestNeighbors object + and second element is the column means using available values. + + Notes + ----- + """ + + def __init__(self, missing_values="NaN", n_neighbors=10, + weighted=True, metric="masked_euclidean", + row_max_missing=0.5, col_max_missing=0.8, copy=True): + + self.missing_values = missing_values + self.n_neighbors = n_neighbors + self.weighted = weighted + self.metric = metric + self.row_max_missing = row_max_missing + self.col_max_missing = col_max_missing + self.copy = copy + + def fit(self, X, y=None): + """Fit the imputer on X. 
+ + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Input data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + + Returns + ------- + self : object + Returns self. + """ + # Check parameters + X = check_array(X, accept_sparse=False, dtype=np.float64, + force_all_finite=False, copy=self.copy) + mask = _get_mask(X, self.missing_values) + + # Check if % missing in any column > col_max_missing + if np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)): + raise ValueError("The following columns have, " + "more than {0}% missing values: {1}" + .format(self.col_max_missing*100, np.where( + mask.sum(axis=0) > (X.shape[0] * + self.col_max_missing)))) + # X_masked = np.ma.array(X, mask=mask) + X_col_means = X.mean(axis=0) + + # Check if % missing in any row > col_max_missing + bad_rows = mask.sum(axis=1) > (mask.shape[1] * self.row_max_missing) + if np.any(bad_rows): + warnings.warn( + "The following rows have more than {0}% missing values and " + "are not included as nearest neighbors: {1}" + .format(self.row_max_missing*100, np.where(bad_rows))) + + # Remove rows that have more than row_max_missing % missing + X = X[~bad_rows, :] + mask = _get_mask(X, self.missing_values) + # X_masked = np.ma.array(X, mask=mask) + + if X.shape[0] < self.n_neighbors: + raise ValueError("There are only %d samples, " + "but n_neighbors=%d." + % (X.shape[0], self.n_neighbors)) + + # Instantiate NN object, get column means, and store in statistics_ + neigh = NearestNeighbors(n_neighbors=self.n_neighbors, + metric=self.metric) + self.statistics_ = (neigh.fit(X), X_col_means) + + return self + + def transform(self, X): + """Impute all missing values in X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape = [n_samples, n_features] + The input data to complete. 
+ """ + check_is_fitted(self, 'statistics_') + X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES, + force_all_finite=False, copy=self.copy) + mask = _get_mask(X, self.missing_values) + n_rows_X, n_cols_X = X.shape + row_total_missing = mask.sum(axis=1) + + # Get fitted objects + fitted_NNObj, fitted_col_means = self.statistics_ + fitted_X = fitted_NNObj._fit_X + fitted_mask = _get_mask(fitted_X, self.missing_values) + + # Check for excessive missingness in rows + bad_rows = row_total_missing > (mask.shape[1] * self.row_max_missing) + if np.any(bad_rows): + warnings.warn( + "The following rows have more than {0}% missing values and " + "are imputed with column means: {1}" + .format(self.row_max_missing*100, np.where(bad_rows))) + X_bad = X[bad_rows, :] + X = X[~bad_rows, :] + mask = _get_mask(X, self.missing_values) + row_total_missing = mask.sum(axis=1) + + # Check if the X in fit() and transform() are the same + if np.ma.allequal(np.ma.array(X, mask=mask), + np.ma.array(fitted_X, mask=fitted_mask)) and \ + np.array_equal(mask, fitted_mask): + + # Get the k nearest neighbors from fitted matrix + neighbors = fitted_NNObj.kneighbors(n_neighbors=self.n_neighbors, + return_distance=self.weighted) + else: + neighbors = fitted_NNObj.kneighbors(X, + n_neighbors=self.n_neighbors, + return_distance=self.weighted) + + # Get row index and distance (if weighted) of donors + if self.weighted: + knn_row_index = neighbors[1] + knn_distances = neighbors[0] + else: + knn_row_index = neighbors + # knn_distances = np.ones_like(neighbors) + + knn_row_index = np.vsplit(knn_row_index, knn_row_index.shape[0]) + knn_row_index = np.repeat(knn_row_index, row_total_missing, axis=0) + knn_row_index = knn_row_index.ravel() + + # Get column index of donors + # NOTE: Following assumes columns in X and _fit_X are in the same order + col_missing_index = np.where(mask)[1] + knn_col_index = np.repeat(col_missing_index, self.n_neighbors) + + # Calculate kNN score and impute + imputed = 
np.nanmean( + (fitted_NNObj._fit_X[(knn_row_index, knn_col_index)]). + reshape((-1, self.n_neighbors)), axis=1) + X[mask] = imputed + + # Merge bad rows to X and mean impute any leftover missing + if np.any(bad_rows): + X_merged = np.empty((n_rows_X, n_cols_X)) + X_merged[bad_rows, :] = X_bad + X_merged[~bad_rows, :] = X + X = X_merged + + # Impute bad_rows and leftover missing with column means + mask_after_knn = _get_mask(X, self.missing_values) + if np.any(mask_after_knn): + missing_index = np.where(mask_after_knn) + X[missing_index] = np.take(fitted_col_means, missing_index[1]) + + return X \ No newline at end of file From aa8547a18bc59135d8dc56aa0a05a70f9bd81f4c Mon Sep 17 00:00:00 2001 From: harke Date: Thu, 3 Aug 2017 18:07:46 -0500 Subject: [PATCH 23/97] fixed array base check --- sklearn/preprocessing/knn_imputation.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/knn_imputation.py b/sklearn/preprocessing/knn_imputation.py index 3887169d37ad4..1bc098c68c564 100644 --- a/sklearn/preprocessing/knn_imputation.py +++ b/sklearn/preprocessing/knn_imputation.py @@ -156,7 +156,6 @@ def transform(self, X): # Get fitted objects fitted_NNObj, fitted_col_means = self.statistics_ fitted_X = fitted_NNObj._fit_X - fitted_mask = _get_mask(fitted_X, self.missing_values) # Check for excessive missingness in rows bad_rows = row_total_missing > (mask.shape[1] * self.row_max_missing) @@ -171,11 +170,7 @@ def transform(self, X): row_total_missing = mask.sum(axis=1) # Check if the X in fit() and transform() are the same - if np.ma.allequal(np.ma.array(X, mask=mask), - np.ma.array(fitted_X, mask=fitted_mask)) and \ - np.array_equal(mask, fitted_mask): - - # Get the k nearest neighbors from fitted matrix + if X.base is fitted_X.base: neighbors = fitted_NNObj.kneighbors(n_neighbors=self.n_neighbors, return_distance=self.weighted) else: @@ -186,7 +181,7 @@ def transform(self, X): # Get row index and distance (if weighted) of 
donors if self.weighted: knn_row_index = neighbors[1] - knn_distances = neighbors[0] + # knn_distances = neighbors[0] else: knn_row_index = neighbors # knn_distances = np.ones_like(neighbors) @@ -203,7 +198,7 @@ def transform(self, X): # Calculate kNN score and impute imputed = np.nanmean( (fitted_NNObj._fit_X[(knn_row_index, knn_col_index)]). - reshape((-1, self.n_neighbors)), axis=1) + reshape((-1, self.n_neighbors)), axis=1) X[mask] = imputed # Merge bad rows to X and mean impute any leftover missing @@ -219,4 +214,4 @@ def transform(self, X): missing_index = np.where(mask_after_knn) X[missing_index] = np.take(fitted_col_means, missing_index[1]) - return X \ No newline at end of file + return X From 009efa9eda98e90c6df2f0a1b25664ba68bf19ef Mon Sep 17 00:00:00 2001 From: harke Date: Thu, 3 Aug 2017 18:59:34 -0500 Subject: [PATCH 24/97] Fix column mean to nanmean --- sklearn/preprocessing/knn_imputation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/knn_imputation.py b/sklearn/preprocessing/knn_imputation.py index 1bc098c68c564..59d294b51b3b7 100644 --- a/sklearn/preprocessing/knn_imputation.py +++ b/sklearn/preprocessing/knn_imputation.py @@ -110,22 +110,22 @@ def fit(self, X, y=None): .format(self.col_max_missing*100, np.where( mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)))) - # X_masked = np.ma.array(X, mask=mask) - X_col_means = X.mean(axis=0) + X_col_means = np.nanmean(X, axis=0) # Check if % missing in any row > col_max_missing bad_rows = mask.sum(axis=1) > (mask.shape[1] * self.row_max_missing) if np.any(bad_rows): warnings.warn( "The following rows have more than {0}% missing values and " - "are not included as nearest neighbors: {1}" + "are not included as donor neighbors: {1}" .format(self.row_max_missing*100, np.where(bad_rows))) # Remove rows that have more than row_max_missing % missing X = X[~bad_rows, :] - mask = _get_mask(X, self.missing_values) + # mask = _get_mask(X, 
self.missing_values) # X_masked = np.ma.array(X, mask=mask) + # Check if sufficient neighboring samples available if X.shape[0] < self.n_neighbors: raise ValueError("There are only %d samples, " "but n_neighbors=%d." From 70f294ad530f4bdb3a2147deaeb632d6647bf0a8 Mon Sep 17 00:00:00 2001 From: harke Date: Sat, 5 Aug 2017 23:27:13 -0500 Subject: [PATCH 25/97] Added weight support and cleaned the code --- sklearn/preprocessing/knn_imputation.py | 70 ++++++++++++++----------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/sklearn/preprocessing/knn_imputation.py b/sklearn/preprocessing/knn_imputation.py index 59d294b51b3b7..ec6ca3bea4364 100644 --- a/sklearn/preprocessing/knn_imputation.py +++ b/sklearn/preprocessing/knn_imputation.py @@ -10,7 +10,7 @@ from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES from ..neighbors import NearestNeighbors - +from ..neighbors.base import _get_weights, _check_weights __all__ = [ 'KNNImputer', ] @@ -36,11 +36,22 @@ class KNNImputer(BaseEstimator, TransformerMixin): `missing_values` will be imputed. For missing values encoded as np.nan, use the string value "NaN". - n_neighbors : int, optional (default = 10) + n_neighbors : int, optional (default = 5) Number of neighbors to get. - weighted : bool, optional (default = True) - Set the imputed value as a distance-weighted score of the neighbors + weights : str or callable + weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + Uniform weights are used by default. 
metric : string or callable, optional (default = 'masked_euclidean') metric to use for distance computation. @@ -65,20 +76,20 @@ class KNNImputer(BaseEstimator, TransformerMixin): Attributes ---------- statistics_ : array of shape (n_features,) - A tuple whose first element if the fitted NearestNeighbors object - and second element is the column means using available values. + A tuple whose first element is the fitted NearestNeighbors object + and the second element is the column means using available values. Notes ----- """ - def __init__(self, missing_values="NaN", n_neighbors=10, - weighted=True, metric="masked_euclidean", + def __init__(self, missing_values="NaN", n_neighbors=5, + weights="uniform", metric="masked_euclidean", row_max_missing=0.5, col_max_missing=0.8, copy=True): self.missing_values = missing_values self.n_neighbors = n_neighbors - self.weighted = weighted + self.weights = _check_weights(weights) self.metric = metric self.row_max_missing = row_max_missing self.col_max_missing = col_max_missing @@ -89,7 +100,7 @@ def fit(self, X, y=None): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like}, shape (n_samples, n_features) Input data, where ``n_samples`` is the number of samples and ``n_features`` is the number of features. 
@@ -110,7 +121,7 @@ def fit(self, X, y=None): .format(self.col_max_missing*100, np.where( mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)))) - X_col_means = np.nanmean(X, axis=0) + X_col_means = np.ma.array(X, mask=mask).mean(axis=0).data # Check if % missing in any row > col_max_missing bad_rows = mask.sum(axis=1) > (mask.shape[1] * self.row_max_missing) @@ -122,8 +133,6 @@ def fit(self, X, y=None): # Remove rows that have more than row_max_missing % missing X = X[~bad_rows, :] - # mask = _get_mask(X, self.missing_values) - # X_masked = np.ma.array(X, mask=mask) # Check if sufficient neighboring samples available if X.shape[0] < self.n_neighbors: @@ -143,7 +152,7 @@ def transform(self, X): Parameters ---------- - X : {array-like, sparse matrix}, shape = [n_samples, n_features] + X : {array-like}, shape = [n_samples, n_features] The input data to complete. """ check_is_fitted(self, 'statistics_') @@ -170,25 +179,20 @@ def transform(self, X): row_total_missing = mask.sum(axis=1) # Check if the X in fit() and transform() are the same - if X.base is fitted_X.base: - neighbors = fitted_NNObj.kneighbors(n_neighbors=self.n_neighbors, - return_distance=self.weighted) + if X is fitted_X or X.base is fitted_X.base: + neighbors = fitted_NNObj.kneighbors(n_neighbors=self.n_neighbors) else: neighbors = fitted_NNObj.kneighbors(X, - n_neighbors=self.n_neighbors, - return_distance=self.weighted) + n_neighbors=self.n_neighbors) - # Get row index and distance (if weighted) of donors - if self.weighted: - knn_row_index = neighbors[1] - # knn_distances = neighbors[0] - else: - knn_row_index = neighbors - # knn_distances = np.ones_like(neighbors) + # Get row index, distance, and weights of donors + knn_distances, knn_row_index = neighbors + weights = _get_weights(knn_distances[row_total_missing.astype( + np.bool), ], self.weights) knn_row_index = np.vsplit(knn_row_index, knn_row_index.shape[0]) - knn_row_index = np.repeat(knn_row_index, row_total_missing, axis=0) - 
knn_row_index = knn_row_index.ravel() + knn_row_index = np.repeat(knn_row_index, + row_total_missing, axis=0).ravel() # Get column index of donors # NOTE: Following assumes columns in X and _fit_X are in the same order @@ -196,10 +200,12 @@ def transform(self, X): knn_col_index = np.repeat(col_missing_index, self.n_neighbors) # Calculate kNN score and impute - imputed = np.nanmean( - (fitted_NNObj._fit_X[(knn_row_index, knn_col_index)]). - reshape((-1, self.n_neighbors)), axis=1) - X[mask] = imputed + donors = fitted_X[ + (knn_row_index, knn_col_index)].reshape((-1, self.n_neighbors)) + donors = np.ma.array( + donors, mask=_get_mask(donors, self.missing_values)) + imputed = np.ma.average(donors, axis=1, weights=weights) + X[mask] = imputed.data # Merge bad rows to X and mean impute any leftover missing if np.any(bad_rows): From a54c162109b583ff4e9f681c143b1a949e59f67f Mon Sep 17 00:00:00 2001 From: harke Date: Sun, 6 Aug 2017 01:59:51 -0500 Subject: [PATCH 26/97] Added inf check --- sklearn/preprocessing/knn_imputation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/knn_imputation.py b/sklearn/preprocessing/knn_imputation.py index ec6ca3bea4364..2575b1df0e635 100644 --- a/sklearn/preprocessing/knn_imputation.py +++ b/sklearn/preprocessing/knn_imputation.py @@ -112,9 +112,12 @@ def fit(self, X, y=None): # Check parameters X = check_array(X, accept_sparse=False, dtype=np.float64, force_all_finite=False, copy=self.copy) - mask = _get_mask(X, self.missing_values) + # Check for +/- inf + if (np.any(np.isinf(X))): + raise ValueError("+/- Infinite values are not allowed.") # Check if % missing in any column > col_max_missing + mask = _get_mask(X, self.missing_values) if np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)): raise ValueError("The following columns have, " "more than {0}% missing values: {1}" From c412e3baf4486a55973bfe2eb2f28a438d9756af Mon Sep 17 00:00:00 2001 From: harke Date: Sun, 6 Aug 2017 
04:13:56 -0500 Subject: [PATCH 27/97] Changed error message --- sklearn/preprocessing/knn_imputation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/knn_imputation.py b/sklearn/preprocessing/knn_imputation.py index 2575b1df0e635..659438be09ae6 100644 --- a/sklearn/preprocessing/knn_imputation.py +++ b/sklearn/preprocessing/knn_imputation.py @@ -114,7 +114,8 @@ def fit(self, X, y=None): force_all_finite=False, copy=self.copy) # Check for +/- inf if (np.any(np.isinf(X))): - raise ValueError("+/- Infinite values are not allowed.") + raise ValueError("+/- inf values are not allowed even though NaN " + "values are allowed.") # Check if % missing in any column > col_max_missing mask = _get_mask(X, self.missing_values) From ffe677456035473253de3b2de8a8a2da04e7273a Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 8 Aug 2017 00:30:57 -0500 Subject: [PATCH 28/97] Added test suite and example. Expanded docstring description --- sklearn/preprocessing/knn_imputation.py | 48 +++- .../tests/test_knn_imputation.py | 220 ++++++++++++++++++ 2 files changed, 256 insertions(+), 12 deletions(-) create mode 100644 sklearn/preprocessing/tests/test_knn_imputation.py diff --git a/sklearn/preprocessing/knn_imputation.py b/sklearn/preprocessing/knn_imputation.py index 659438be09ae6..a4f2970468842 100644 --- a/sklearn/preprocessing/knn_imputation.py +++ b/sklearn/preprocessing/knn_imputation.py @@ -25,9 +25,9 @@ def _get_mask(X, value_to_mask): class KNNImputer(BaseEstimator, TransformerMixin): - """Imputation transformer for completing missing values. - - Read more in the :ref:`User Guide `. + """Imputation transformer for completing missing values using Nearest + Neighbors. Broadly speaking, the imputation is performed using either + the weighted or the unweighted mean of the desired number of neighbors. 
Parameters ---------- @@ -75,12 +75,33 @@ class KNNImputer(BaseEstimator, TransformerMixin): Attributes ---------- - statistics_ : array of shape (n_features,) - A tuple whose first element is the fitted NearestNeighbors object - and the second element is the column means using available values. + statistics_ : {tuple} + A tuple whose first element is the input dataset used to fit the + KNNImputer object and the second element is the column means of that + dataset using observed (i.e. non-missing) values. - Notes - ----- + References + ---------- + * Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor + Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing + value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17 + no. 6, 2001 Pages 520-525. + + Examples + -------- + >>> from sklearn.preprocessing import knn_imputation + >>> nan = float("NaN") + >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]] + >>> imputer = knn_imputation.KNNImputer(n_neighbors=2, weights="uniform") + >>> imputer.fit(X) + KNNImputer(col_max_missing=0.8, copy=True, metric='masked_euclidean', + missing_values='NaN', n_neighbors=2, row_max_missing=0.5, + weights='uniform') + >>> imputer.transform(X) + array([[ 1. , 2. , 4. ], + [ 3. , 4. , 3. ], + [ 5.5, 6. , 5. ], + [ 8. , 8. , 7. ]]) """ def __init__(self, missing_values="NaN", n_neighbors=5, @@ -110,12 +131,13 @@ def fit(self, X, y=None): Returns self. 
""" # Check parameters + force_all_finite = False if self.missing_values in ["NaN", + np.nan] else True X = check_array(X, accept_sparse=False, dtype=np.float64, - force_all_finite=False, copy=self.copy) + force_all_finite=force_all_finite, copy=self.copy) # Check for +/- inf if (np.any(np.isinf(X))): - raise ValueError("+/- inf values are not allowed even though NaN " - "values are allowed.") + raise ValueError("+/- inf values are not allowed.") # Check if % missing in any column > col_max_missing mask = _get_mask(X, self.missing_values) @@ -159,9 +181,11 @@ def transform(self, X): X : {array-like}, shape = [n_samples, n_features] The input data to complete. """ + force_all_finite = False if self.missing_values in ["NaN", + np.nan] else True check_is_fitted(self, 'statistics_') X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES, - force_all_finite=False, copy=self.copy) + force_all_finite=force_all_finite, copy=self.copy) mask = _get_mask(X, self.missing_values) n_rows_X, n_cols_X = X.shape row_total_missing = mask.sum(axis=1) diff --git a/sklearn/preprocessing/tests/test_knn_imputation.py b/sklearn/preprocessing/tests/test_knn_imputation.py new file mode 100644 index 0000000000000..1998ca199b828 --- /dev/null +++ b/sklearn/preprocessing/tests/test_knn_imputation.py @@ -0,0 +1,220 @@ +from __future__ import division +import numpy as np + +from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_array_almost_equal +from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_false + +from sklearn.preprocessing.knn_imputation import KNNImputer +from sklearn.random_projection import sparse_random_matrix + + +def test_knn_imputation_shape(): + # Verify the shapes of the imputed matrix for different weights and + # number of neighbors. 
+ n_rows = 10 + n_cols = 2 + X = np.random.rand(n_rows, n_cols) + X[0, 0] = np.nan + + for weights in ['uniform', 'distance']: + for n_neighbors in range(1, 6): + imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights) + X_imputed = imputer.fit_transform(X) + assert_equal(X_imputed.shape, (n_rows, n_cols)) + + +def test_knn_imputation_zero(): + # Test imputation when missing_values == 0 + missing_values = 0 + n_neighbors = 2 + imputer = KNNImputer(missing_values=missing_values, + n_neighbors=n_neighbors, + weights="uniform") + + # Test with missing_values=0 when NaN present + X = np.array([ + [np.nan, 0, 0, 0, 5], + [np.nan, 1, 0, np.nan, 3], + [np.nan, 2, 0, 0, 0], + [np.nan, 6, 0, 5, 13], + ]) + assert_raises(ValueError, imputer.fit, X) + + # Test with % zeros in column > col_max_missing + X = np.array([ + [1, 0, 0, 0, 5], + [2, 1, 0, 2, 3], + [3, 2, 0, 0, 0], + [4, 6, 0, 5, 13], + ]) + assert_raises(ValueError, imputer.fit, X) + + # Test with an imputable matrix + X = np.array([ + [1, 0, 1, 0, 5], + [2, 1, 2, 2, 3], + [3, 2, 3, 0, 0], + [6, 6, 0, 5, 13], + ]) + + statistics_mean = [3, 3, 2, 3.5, 7] + X_imputed = np.array([ + [1, 1.5, 1, 2, 5], + [2, 1, 2, 2, 3], + [3, 2, 3, 2, 4], + [6, 6, 1.5, 5, 13], + ]) + + assert_array_equal(imputer.fit(X).transform(X), X_imputed) + assert_array_equal(imputer.statistics_[1], statistics_mean) + + +def test_knn_imputation_default(): + # Test imputation with default values + # imputer = KNNImputer() + + # Test with % missing in a column > col_max_missing + X = np.array([ + [np.nan, 0, 0, 0, 5], + [np.nan, 1, 0, np.nan, 3], + [np.nan, 2, 0, 0, 0], + [np.nan, 6, 0, 5, 13], + [np.nan, 7, 0, 7, 8], + [np.nan, 8, 0, 8, 9], + ]) + assert_raises(ValueError, KNNImputer().fit, X) + + # Test with insufficient number of neighbors + imputer = KNNImputer() + X = np.array([ + [1, 1, 1, 2, np.nan], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [6, 6, 2, 5, 13], + ]) + assert_raises(ValueError, KNNImputer().fit, X) + + # Test with inf 
present + X = np.array([ + [np.inf, 1, 1, 2, np.nan], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [np.nan, 6, 0, 5, 13], + [np.nan, 7, 0, 7, 8], + [6, 6, 2, 5, 7], + ]) + assert_raises(ValueError, KNNImputer().fit, X) + + # Test with an imputable matrix + X = np.array([ + [1, 0, 0, 1], + [2, 1, 2, np.nan], + [3, 2, 3, np.nan], + [np.nan, 4, 5, 5], + [6, np.nan, 6, 7], + [8, 8, 8, 8], + [16, 15, 18, 19], + ]) + + statistics_mean = [6, 5, 6, 8] + + X_imputed = np.array([ + [1, 0, 0, 1], + [2, 1, 2, 5.25], + [3, 2, 3, 5.25], + [4, 4, 5, 5], + [6, 3, 6, 7], + [8, 8, 8, 8], + [16, 15, 18, 19], + ]) + + imputer = KNNImputer() + assert_array_equal(imputer.fit(X).transform(X), X_imputed) + assert_array_equal(imputer.statistics_[1], statistics_mean) + + # Test with % missing in row > row_max_missing + X = np.array([ + [1, 0, 0, 1], + [2, 1, 2, np.nan], + [3, 2, 3, np.nan], + [np.nan, 4, 5, 5], + [6, np.nan, 6, 7], + [8, 8, 8, 8], + [np.nan, np.nan, np.nan, 19], + ]) + + statistics_mean = [4, 3, 4, 8] + X_imputed = np.array([ + [1, 0, 0, 1], + [2, 1, 2, 5.25], + [3, 2, 3, 5.25], + [4, 4, 5, 5], + [6, 3, 6, 7], + [8, 8, 8, 8], + [4, 3, 4, 19], + ]) + + imputer = KNNImputer() + assert_array_equal(imputer.fit(X).transform(X), X_imputed) + assert_array_equal(imputer.statistics_[1], statistics_mean) + + # Test with weights = "distance" + X = np.array([ + [np.nan, 0, 0], + [2, 1, 2], + [3, 2, 3], + [4, 5, 5], + ]) + + statistics_mean = [3, 2, 2.5] + + X_imputed = np.array([ + [2.3828, 0, 0], + [2, 1, 2], + [3, 2, 3], + [4, 5, 5], + ]) + + imputer = KNNImputer(n_neighbors=2, weights="distance") + assert_array_almost_equal(imputer.fit(X).transform(X), X_imputed, + decimal=4) + assert_array_equal(imputer.statistics_[1], statistics_mean) + + +def test_imputation_pickle(): + # Test for pickling imputers. 
+ import pickle + + l = 100 + X = np.random.rand(l, l+1) + + imputer = KNNImputer() + imputer.fit(X) + + imputer_pickled = pickle.loads(pickle.dumps(imputer)) + + assert_array_equal(imputer.transform(X.copy()), + imputer_pickled.transform(X.copy()), + "Fail to transform the data after pickling ") + + +def test_imputation_copy(): + # Test imputation with copy + X_orig = sparse_random_matrix(10, 10, density=0.75, random_state=0) + + # copy=True, dense => copy + X = X_orig.copy().toarray() + imputer = KNNImputer(missing_values=0, copy=True) + Xt = imputer.fit(X).transform(X) + Xt[0, 0] = -1 + assert_false(np.all(X == Xt)) + + # copy=False, dense => no copy + X = X_orig.copy().toarray() + imputer = KNNImputer(missing_values=0, copy=False) + Xt = imputer.fit(X).transform(X) + Xt[0, 0] = -1 + assert_array_equal(X, Xt) From c2d6a6c3b5fe09a93c8e055b0b01d94a83d89c43 Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 8 Aug 2017 01:44:07 -0500 Subject: [PATCH 29/97] Changes to preprocessing __init__ --- sklearn/preprocessing/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 2b105709ffe08..620b542f6024d 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -31,6 +31,7 @@ from .label import MultiLabelBinarizer from .imputation import Imputer +from .knn_imputation import KNNImputer __all__ = [ @@ -38,6 +39,7 @@ 'FunctionTransformer', 'Imputer', 'KernelCenterer', + 'KNNImputer', 'LabelBinarizer', 'LabelEncoder', 'MultiLabelBinarizer', From 9a1967748948339154edd11c21650fa1d32c615c Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 8 Aug 2017 02:45:00 -0500 Subject: [PATCH 30/97] Added KNNImputer exception for NaN and inf in estimator_checks --- sklearn/utils/estimator_checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 4760253a5a43e..acaf781fd7993 100644 --- 
a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -87,7 +87,7 @@ def _yield_non_meta_checks(name, estimator): # cross-decomposition's "transform" returns X and Y yield check_pipeline_consistency - if name not in ['Imputer']: + if name not in ['Imputer', 'KNNImputer']: # Test that all estimators check their input for NaN's and infs yield check_estimators_nan_inf From a6a0a2f3ad0420b22889873a1e65f06d58524a7b Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 8 Aug 2017 23:20:29 -0500 Subject: [PATCH 31/97] Moved _check_weights() to fit() --- sklearn/preprocessing/knn_imputation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/knn_imputation.py b/sklearn/preprocessing/knn_imputation.py index a4f2970468842..0202476b92e63 100644 --- a/sklearn/preprocessing/knn_imputation.py +++ b/sklearn/preprocessing/knn_imputation.py @@ -110,7 +110,7 @@ def __init__(self, missing_values="NaN", n_neighbors=5, self.missing_values = missing_values self.n_neighbors = n_neighbors - self.weights = _check_weights(weights) + self.weights = weights self.metric = metric self.row_max_missing = row_max_missing self.col_max_missing = col_max_missing @@ -135,6 +135,8 @@ def fit(self, X, y=None): np.nan] else True X = check_array(X, accept_sparse=False, dtype=np.float64, force_all_finite=force_all_finite, copy=self.copy) + self.weights = _check_weights(self.weights) + # Check for +/- inf if (np.any(np.isinf(X))): raise ValueError("+/- inf values are not allowed.") From 4fbbe4039da09b3944555d7dc9d90062b3d2ccd1 Mon Sep 17 00:00:00 2001 From: harke Date: Thu, 17 Aug 2017 21:25:29 -0500 Subject: [PATCH 32/97] Addressed review comments - 1 --- examples/plot_missing_values.py | 18 +- sklearn/preprocessing/__init__.py | 2 - sklearn/preprocessing/imputation.py | 293 +++++++++++++++++- sklearn/preprocessing/knn_imputation.py | 253 --------------- .../tests/test_knn_imputation.py | 66 ++-- 5 files changed, 347 insertions(+), 285 
deletions(-) delete mode 100644 sklearn/preprocessing/knn_imputation.py diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index 17a256fa4fa2f..cad10a95051dd 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -9,16 +9,22 @@ cross-validation. Sometimes dropping rows or using marker values is more effective. +Imputer: Missing values can be replaced by the mean, the median or the most frequent value using the ``strategy`` hyper-parameter. The median is a more robust estimator for data with high magnitude variables which could dominate results (otherwise known as a 'long tail'). +KNNImputer: +Missing values can be imputed using the weighted or unweighted mean of the +desired number of nearest neighbors. + Script output:: Score with the entire dataset = 0.56 Score without the samples containing missing values = 0.48 Score after imputation of the missing values = 0.55 + Score after knn-imputation of the missing values = 0.55 In this case, imputing helps the classifier get close to the original score. 
@@ -28,7 +34,7 @@ from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import Pipeline -from sklearn.preprocessing import Imputer +from sklearn.preprocessing.imputation import Imputer, KNNImputer from sklearn.model_selection import cross_val_score rng = np.random.RandomState(0) @@ -71,3 +77,13 @@ n_estimators=100))]) score = cross_val_score(estimator, X_missing, y_missing).mean() print("Score after imputation of the missing values = %.2f" % score) + +# Estimate the score after kNN-imputation of the missing values +X_missing = X_full.copy() +X_missing[np.where(missing_samples)[0], missing_features] = np.nan +y_missing = y_full.copy() +knn_estimator = Pipeline([("knnimputer", KNNImputer(n_neighbors=10)), + ("forest", RandomForestRegressor(random_state=0, + n_estimators=100))]) +knn_score = cross_val_score(knn_estimator, X_missing, y_missing).mean() +print("Score after knn-imputation of the missing values = %.2f" % knn_score) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 620b542f6024d..2b105709ffe08 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -31,7 +31,6 @@ from .label import MultiLabelBinarizer from .imputation import Imputer -from .knn_imputation import KNNImputer __all__ = [ @@ -39,7 +38,6 @@ 'FunctionTransformer', 'Imputer', 'KernelCenterer', - 'KNNImputer', 'LabelBinarizer', 'LabelEncoder', 'MultiLabelBinarizer', diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 12d5425fbf604..1a943fceae7de 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -1,6 +1,6 @@ # Authors: Nicolas Tresegnie # License: BSD 3 clause - +from __future__ import division import warnings import numpy as np @@ -13,6 +13,8 @@ from ..utils.sparsefuncs import _get_median from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES +from 
..neighbors import NearestNeighbors +from ..neighbors.base import _get_weights, _check_weights from ..externals import six @@ -21,6 +23,7 @@ __all__ = [ 'Imputer', + 'KNNImputer' ] @@ -374,3 +377,291 @@ def transform(self, X): X[coordinates] = values return X + + +class KNNImputer(BaseEstimator, TransformerMixin): + """Imputation for completing missing values using Nearest Neighbors. + + Broadly speaking, the imputation is performed using either + the weighted or the unweighted mean of the desired number of neighbors. + + Parameters + ---------- + missing_values : integer or "NaN", optional (default="NaN") + The placeholder for the missing values. All occurrences of + `missing_values` will be imputed. For missing values encoded as np.nan, + use the string value "NaN". + + n_neighbors : int, optional (default = 5) + Maximum number of neighboring samples to use for imputation. When + any of the neighbors themselves have the feature value missing then + the remaining n_neighbors-1 neighbors are used and, if need be, + the process repeats until a single neighbor remains. If all + the neighbors have the feature value missing, then the overall feature + mean is used for imputation. + + weights : str or callable + weight function used in prediction. Possible values: + + - 'uniform' : uniform weights. All points in each neighborhood + are weighted equally. + - 'distance' : weight points by the inverse of their distance. + in this case, closer neighbors of a query point will have a + greater influence than neighbors which are further away. + - [callable] : a user-defined function which accepts an + array of distances, and returns an array of the same shape + containing the weights. + + Uniform weights are used by default. + + metric : string or callable, optional (default = 'masked_euclidean') + metric to use for distance computation. + + row_max_missing : float, optional (default = 0.5) + The maximum percentage of columns (i.e. 
features) that can be missing + before the sample is excluded from nearest neighbor imputation. It + means that such rows will not be considered a potential donor in fit() + and in transform() their missing feature values will be imputed to be + the column mean for the entire dataset. + + col_max_missing : float, optional (default = 0.8) + The maximum percentage of rows (or samples) that can be missing + for a given feature beyond which an error is raised. + + copy : boolean, optional (default=True) + If True, a copy of X will be created. If False, imputation will + be done in-place whenever possible. Note that, if metric is + "masked_euclidean" and copy=False then missing_values in the + input matrix X will be overwritten with zeros. + + Attributes + ---------- + statistics_ : {tuple} + A tuple whose first element is the input dataset used to fit the + KNNImputer object and the second element is the column means of that + dataset using observed (i.e. non-missing) values. + + References + ---------- + * Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor + Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing + value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17 + no. 6, 2001 Pages 520-525. + + Examples + -------- + >>> from sklearn.preprocessing.imputation import KNNImputer + >>> nan = float("NaN") + >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]] + >>> imputer = KNNImputer(n_neighbors=2, weights="uniform") + >>> imputer.fit_transform(X) + array([[ 1. , 2. , 4. ], + [ 3. , 4. , 3. ], + [ 5.5, 6. , 5. ], + [ 8. , 8. , 7. 
]]) + """ + + def __init__(self, missing_values="NaN", n_neighbors=5, + weights="uniform", metric="masked_euclidean", + row_max_missing=0.5, col_max_missing=0.8, copy=True): + + self.missing_values = missing_values + self.n_neighbors = n_neighbors + self.weights = weights + self.metric = metric + self.row_max_missing = row_max_missing + self.col_max_missing = col_max_missing + self.copy = copy + + def fit(self, X, y=None): + """Fit the imputer on X. + + Parameters + ---------- + X : {array-like}, shape (n_samples, n_features) + Input data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + + Returns + ------- + self : object + Returns self. + """ + # Check parameters + force_all_finite = False if self.missing_values in ["NaN", + np.nan] else True + X = check_array(X, accept_sparse=False, dtype=np.float64, + force_all_finite=force_all_finite, copy=self.copy) + self.weights = _check_weights(self.weights) + + # Check for +/- inf + if (np.any(np.isinf(X))): + raise ValueError("+/- inf values are not allowed.") + + # Check if % missing in any column > col_max_missing + mask = _get_mask(X, self.missing_values) + if np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)): + raise ValueError("The following columns have, " + "more than {0}% missing values: {1}" + .format(self.col_max_missing*100, np.where( + mask.sum(axis=0) > (X.shape[0] * + self.col_max_missing)))) + X_col_means = np.ma.array(X, mask=mask).mean(axis=0).data + + # Check if % missing in any row > col_max_missing + bad_rows = mask.sum(axis=1) > (mask.shape[1] * self.row_max_missing) + if np.any(bad_rows): + warnings.warn( + "The following rows have more than {0}% missing values and " + "are not included as donor neighbors: {1}" + .format(self.row_max_missing*100, np.where(bad_rows))) + + # Remove rows that have more than row_max_missing % missing + X = X[~bad_rows, :] + + # Check if sufficient neighboring samples available + if X.shape[0] < self.n_neighbors: 
+ raise ValueError("There are only %d samples, " + "but n_neighbors=%d." + % (X.shape[0], self.n_neighbors)) + + # Instantiate NN object, get column means, and store in statistics_ + neigh = NearestNeighbors(n_neighbors=self.n_neighbors, + metric=self.metric) + self._fitted_neighbors = neigh.fit(X) + self.statistics_ = X_col_means + + return self + + def _transform(self, X, n_neighbors_new): + """Impute all missing values in X. + + Parameters + ---------- + X : {array-like}, shape = [n_samples, n_features] + The input data to complete. + + n_neighbors_new : int + Indicates whether to pass n_neighbors or n_neighbors+1 to + _tranform(). + Calling transform() automatically sets this to self.n_neighbors + while fit_transform() sets it to self.n_neighbors + 1. + """ + check_is_fitted(self, 'statistics_') + force_all_finite = False if self.missing_values in ["NaN", + np.nan] else True + X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES, + force_all_finite=force_all_finite, copy=self.copy) + # Check for +/- inf + if (np.any(np.isinf(X))): + raise ValueError("+/- inf values are not allowed.") + + # Get fitted data and ensure correct dimension + fitted_X = self._fitted_neighbors._fit_X + if X.shape[1] != fitted_X.shape[1]: + raise ValueError("Incompatible dimension between the fitted " + "dataset and the one to be transformed.") + mask = _get_mask(X, self.missing_values) + n_rows_X, n_cols_X = X.shape + row_total_missing = mask.sum(axis=1) + if not np.any(row_total_missing > 0): + return X + # row_has_missing = row_total_missing.astype(np.bool) + + # Check for excessive missingness in rows + bad_rows = row_total_missing > (mask.shape[1] * self.row_max_missing) + if np.any(bad_rows): + warnings.warn( + "The following rows have more than {0}% missing values and " + "are imputed with column means: {1}" + .format(self.row_max_missing*100, np.where(bad_rows))) + X_bad = X[bad_rows, :] + X = X[~bad_rows, :] + mask = _get_mask(X, self.missing_values) + row_total_missing = 
mask.sum(axis=1) + row_has_missing = row_total_missing.astype(np.bool) + # Check if the X in fit() and transform() are the same + # if X is fitted_X or X.base is fitted_X.base: + # neighbors = self._fitted_neighbors.kneighbors( + # n_neighbors=self.n_neighbors) + # else: + # neighbors = self._fitted_neighbors.kneighbors( + # X[row_has_missing, :], n_neighbors=self.n_neighbors) + if np.any(row_has_missing): + neighbors = self._fitted_neighbors.kneighbors( + X[row_has_missing, :], n_neighbors=n_neighbors_new) + + # Get row index, distance, and weights of donors + knn_distances, knn_row_index = neighbors + if n_neighbors_new > self.n_neighbors: + knn_distances = knn_distances[:, 1:] + knn_row_index = knn_row_index[:, 1:] + weights = _get_weights(knn_distances, self.weights) + + knn_row_index = np.vsplit(knn_row_index, knn_row_index.shape[0]) + row_repeats = row_total_missing[row_total_missing != 0] + knn_row_index = np.repeat( + knn_row_index, row_repeats, axis=0).ravel() + + # Get column index of donors + # NOTE: Following assumes columns in X and _fit_X are in the same order + row_missing_index, col_missing_index = np.where(mask) + knn_col_index = np.repeat(col_missing_index, self.n_neighbors) + + # Calculate kNN score and impute + donors = fitted_X[ + (knn_row_index, knn_col_index)].reshape((-1, self.n_neighbors)) + donors_mask = _get_mask(donors, self.missing_values) + donors = np.ma.array( + donors, mask=donors_mask) + imputed = np.ma.average(donors, axis=1, weights=weights) + X[mask] = imputed.data + unimputed_index = np.where(donors_mask.sum(axis=1) == self.n_neighbors) + # imputed_mask = _get_mask(imputed.data, self.missing_values) + if len(unimputed_index[0]) > 0: + # unimputed_loc = np.where(imputed_mask) + # unimputed_rows, unimputed_cols = np.where(mask) + unimputed_rows = row_missing_index[unimputed_index] + unimputed_cols = col_missing_index[unimputed_index] + X[(unimputed_rows, unimputed_cols)] = np.take(self.statistics_, + unimputed_cols) + + # Merge 
bad rows to X and mean impute any leftover missing + if np.any(bad_rows): + bad_missing_index = np.where(_get_mask(X_bad, self.missing_values)) + X_bad[bad_missing_index] = np.take(self.statistics_, + bad_missing_index[1]) + X_merged = np.empty((n_rows_X, n_cols_X)) + X_merged[bad_rows, :] = X_bad + X_merged[~bad_rows, :] = X + X = X_merged + + # Impute bad_rows and leftover missing with column means + # mask_after_knn = _get_mask(X[row_has_missing, :], self.missing_values) + # if np.any(mask_after_knn): + # missing_index = np.where(mask_after_knn) + # X[row_has_missing, :][missing_index] = np.take(self.statistics_, + # missing_index[1]) + + return X + + def fit_transform(self, X, y=None, **fit_params): + """Impute all missing values in X. + + Parameters + ---------- + X : {array-like}, shape = [n_samples, n_features] + The input data to complete. + """ + return self.fit(X)._transform(X, n_neighbors_new=self.n_neighbors + 1) + + def transform(self, X): + """Impute all missing values in X. + + Parameters + ---------- + X : {array-like}, shape = [n_samples, n_features] + The input data to complete. 
+ """ + check_is_fitted(self, 'statistics_') + return self._transform(X, n_neighbors_new=self.n_neighbors) \ No newline at end of file diff --git a/sklearn/preprocessing/knn_imputation.py b/sklearn/preprocessing/knn_imputation.py deleted file mode 100644 index 0202476b92e63..0000000000000 --- a/sklearn/preprocessing/knn_imputation.py +++ /dev/null @@ -1,253 +0,0 @@ -# Authors: Ashim Bhattarai -# License: BSD 3 clause - -from __future__ import division -import warnings -import numpy as np - -from ..base import BaseEstimator, TransformerMixin -from ..utils import check_array -from ..utils.validation import check_is_fitted -from ..utils.validation import FLOAT_DTYPES -from ..neighbors import NearestNeighbors -from ..neighbors.base import _get_weights, _check_weights -__all__ = [ - 'KNNImputer', -] - - -def _get_mask(X, value_to_mask): - """Compute the boolean mask X == missing_values.""" - if value_to_mask == "NaN" or np.isnan(value_to_mask): - return np.isnan(X) - else: - return X == value_to_mask - - -class KNNImputer(BaseEstimator, TransformerMixin): - """Imputation transformer for completing missing values using Nearest - Neighbors. Broadly speaking, the imputation is performed using either - the weighted or the unweighted mean of the desired number of neighbors. - - Parameters - ---------- - missing_values : integer or "NaN", optional (default="NaN") - The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. For missing values encoded as np.nan, - use the string value "NaN". - - n_neighbors : int, optional (default = 5) - Number of neighbors to get. - - weights : str or callable - weight function used in prediction. Possible values: - - - 'uniform' : uniform weights. All points in each neighborhood - are weighted equally. - - 'distance' : weight points by the inverse of their distance. - in this case, closer neighbors of a query point will have a - greater influence than neighbors which are further away. 
- - [callable] : a user-defined function which accepts an - array of distances, and returns an array of the same shape - containing the weights. - - Uniform weights are used by default. - - metric : string or callable, optional (default = 'masked_euclidean') - metric to use for distance computation. - - row_max_missing: float, optional (default = 0.5) - The maximum percentage of columns (i.e. features) that can be missing - before the sample is excluded from nearest neighbor imputation. It - means that such rows will not be considered a potential donor in fit() - and in transform() their missing feature values will be imputed to be - the column mean for the entire dataset. - - col_max_missing: float, optional (default = 0.8) - The maximum percentage of rows (or samples) that can be missing - for a given feature beyond which an error is raised. - - copy : boolean, optional (default=True) - If True, a copy of X will be created. If False, imputation will - be done in-place whenever possible. Note that, if metric is - "masked_euclidean" and copy=False then missing_values in the - input matrix X will be overwritten with zeros. - - Attributes - ---------- - statistics_ : {tuple} - A tuple whose first element is the input dataset used to fit the - KNNImputer object and the second element is the column means of that - dataset using observed (i.e. non-missing) values. - - References - ---------- - * Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor - Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing - value estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17 - no. 6, 2001 Pages 520-525. 
- - Examples - -------- - >>> from sklearn.preprocessing import knn_imputation - >>> nan = float("NaN") - >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]] - >>> imputer = knn_imputation.KNNImputer(n_neighbors=2, weights="uniform") - >>> imputer.fit(X) - KNNImputer(col_max_missing=0.8, copy=True, metric='masked_euclidean', - missing_values='NaN', n_neighbors=2, row_max_missing=0.5, - weights='uniform') - >>> imputer.transform(X) - array([[ 1. , 2. , 4. ], - [ 3. , 4. , 3. ], - [ 5.5, 6. , 5. ], - [ 8. , 8. , 7. ]]) - """ - - def __init__(self, missing_values="NaN", n_neighbors=5, - weights="uniform", metric="masked_euclidean", - row_max_missing=0.5, col_max_missing=0.8, copy=True): - - self.missing_values = missing_values - self.n_neighbors = n_neighbors - self.weights = weights - self.metric = metric - self.row_max_missing = row_max_missing - self.col_max_missing = col_max_missing - self.copy = copy - - def fit(self, X, y=None): - """Fit the imputer on X. - - Parameters - ---------- - X : {array-like}, shape (n_samples, n_features) - Input data, where ``n_samples`` is the number of samples and - ``n_features`` is the number of features. - - Returns - ------- - self : object - Returns self. 
- """ - # Check parameters - force_all_finite = False if self.missing_values in ["NaN", - np.nan] else True - X = check_array(X, accept_sparse=False, dtype=np.float64, - force_all_finite=force_all_finite, copy=self.copy) - self.weights = _check_weights(self.weights) - - # Check for +/- inf - if (np.any(np.isinf(X))): - raise ValueError("+/- inf values are not allowed.") - - # Check if % missing in any column > col_max_missing - mask = _get_mask(X, self.missing_values) - if np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)): - raise ValueError("The following columns have, " - "more than {0}% missing values: {1}" - .format(self.col_max_missing*100, np.where( - mask.sum(axis=0) > (X.shape[0] * - self.col_max_missing)))) - X_col_means = np.ma.array(X, mask=mask).mean(axis=0).data - - # Check if % missing in any row > col_max_missing - bad_rows = mask.sum(axis=1) > (mask.shape[1] * self.row_max_missing) - if np.any(bad_rows): - warnings.warn( - "The following rows have more than {0}% missing values and " - "are not included as donor neighbors: {1}" - .format(self.row_max_missing*100, np.where(bad_rows))) - - # Remove rows that have more than row_max_missing % missing - X = X[~bad_rows, :] - - # Check if sufficient neighboring samples available - if X.shape[0] < self.n_neighbors: - raise ValueError("There are only %d samples, " - "but n_neighbors=%d." - % (X.shape[0], self.n_neighbors)) - - # Instantiate NN object, get column means, and store in statistics_ - neigh = NearestNeighbors(n_neighbors=self.n_neighbors, - metric=self.metric) - self.statistics_ = (neigh.fit(X), X_col_means) - - return self - - def transform(self, X): - """Impute all missing values in X. - - Parameters - ---------- - X : {array-like}, shape = [n_samples, n_features] - The input data to complete. 
- """ - force_all_finite = False if self.missing_values in ["NaN", - np.nan] else True - check_is_fitted(self, 'statistics_') - X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES, - force_all_finite=force_all_finite, copy=self.copy) - mask = _get_mask(X, self.missing_values) - n_rows_X, n_cols_X = X.shape - row_total_missing = mask.sum(axis=1) - - # Get fitted objects - fitted_NNObj, fitted_col_means = self.statistics_ - fitted_X = fitted_NNObj._fit_X - - # Check for excessive missingness in rows - bad_rows = row_total_missing > (mask.shape[1] * self.row_max_missing) - if np.any(bad_rows): - warnings.warn( - "The following rows have more than {0}% missing values and " - "are imputed with column means: {1}" - .format(self.row_max_missing*100, np.where(bad_rows))) - X_bad = X[bad_rows, :] - X = X[~bad_rows, :] - mask = _get_mask(X, self.missing_values) - row_total_missing = mask.sum(axis=1) - - # Check if the X in fit() and transform() are the same - if X is fitted_X or X.base is fitted_X.base: - neighbors = fitted_NNObj.kneighbors(n_neighbors=self.n_neighbors) - else: - neighbors = fitted_NNObj.kneighbors(X, - n_neighbors=self.n_neighbors) - - # Get row index, distance, and weights of donors - knn_distances, knn_row_index = neighbors - weights = _get_weights(knn_distances[row_total_missing.astype( - np.bool), ], self.weights) - - knn_row_index = np.vsplit(knn_row_index, knn_row_index.shape[0]) - knn_row_index = np.repeat(knn_row_index, - row_total_missing, axis=0).ravel() - - # Get column index of donors - # NOTE: Following assumes columns in X and _fit_X are in the same order - col_missing_index = np.where(mask)[1] - knn_col_index = np.repeat(col_missing_index, self.n_neighbors) - - # Calculate kNN score and impute - donors = fitted_X[ - (knn_row_index, knn_col_index)].reshape((-1, self.n_neighbors)) - donors = np.ma.array( - donors, mask=_get_mask(donors, self.missing_values)) - imputed = np.ma.average(donors, axis=1, weights=weights) - X[mask] = 
imputed.data - - # Merge bad rows to X and mean impute any leftover missing - if np.any(bad_rows): - X_merged = np.empty((n_rows_X, n_cols_X)) - X_merged[bad_rows, :] = X_bad - X_merged[~bad_rows, :] = X - X = X_merged - - # Impute bad_rows and leftover missing with column means - mask_after_knn = _get_mask(X, self.missing_values) - if np.any(mask_after_knn): - missing_index = np.where(mask_after_knn) - X[missing_index] = np.take(fitted_col_means, missing_index[1]) - - return X diff --git a/sklearn/preprocessing/tests/test_knn_imputation.py b/sklearn/preprocessing/tests/test_knn_imputation.py index 1998ca199b828..f8dcd56321d70 100644 --- a/sklearn/preprocessing/tests/test_knn_imputation.py +++ b/sklearn/preprocessing/tests/test_knn_imputation.py @@ -7,7 +7,7 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_false -from sklearn.preprocessing.knn_imputation import KNNImputer +from sklearn.preprocessing.imputation import KNNImputer from sklearn.random_projection import sparse_random_matrix @@ -68,8 +68,8 @@ def test_knn_imputation_zero(): [6, 6, 1.5, 5, 13], ]) - assert_array_equal(imputer.fit(X).transform(X), X_imputed) - assert_array_equal(imputer.statistics_[1], statistics_mean) + assert_array_equal(imputer.fit_transform(X), X_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) def test_knn_imputation_default(): @@ -132,8 +132,8 @@ def test_knn_imputation_default(): ]) imputer = KNNImputer() - assert_array_equal(imputer.fit(X).transform(X), X_imputed) - assert_array_equal(imputer.statistics_[1], statistics_mean) + assert_array_equal(imputer.fit_transform(X), X_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) # Test with % missing in row > row_max_missing X = np.array([ @@ -158,8 +158,35 @@ def test_knn_imputation_default(): ]) imputer = KNNImputer() - assert_array_equal(imputer.fit(X).transform(X), X_imputed) - assert_array_equal(imputer.statistics_[1], statistics_mean) + 
assert_array_equal(imputer.fit_transform(X), X_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) + + # Test with all neighboring donors also having missing feature values + X = np.array([ + [1, 0, 0, np.nan], + [2, 1, 2, np.nan], + [3, 2, 3, np.nan], + [np.nan, 4, 5, 5], + [6, np.nan, 6, 7], + [8, 8, 8, 8], + [np.nan, np.nan, np.nan, 20], + ]) + + statistics_mean = [4, 3, 4, 10] + + X_imputed = np.array([ + [1, 0, 0, 10], + [2, 1, 2, 10], + [3, 2, 3, 5], + [4.5, 4, 5, 5], + [6, 6, 6, 7], + [8, 8, 8, 8], + [4, 3, 4, 20], + ]) + + imputer = KNNImputer(n_neighbors=2) + assert_array_equal(imputer.fit_transform(X), X_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) # Test with weights = "distance" X = np.array([ @@ -179,26 +206,9 @@ def test_knn_imputation_default(): ]) imputer = KNNImputer(n_neighbors=2, weights="distance") - assert_array_almost_equal(imputer.fit(X).transform(X), X_imputed, + assert_array_almost_equal(imputer.fit_transform(X), X_imputed, decimal=4) - assert_array_equal(imputer.statistics_[1], statistics_mean) - - -def test_imputation_pickle(): - # Test for pickling imputers. 
- import pickle - - l = 100 - X = np.random.rand(l, l+1) - - imputer = KNNImputer() - imputer.fit(X) - - imputer_pickled = pickle.loads(pickle.dumps(imputer)) - - assert_array_equal(imputer.transform(X.copy()), - imputer_pickled.transform(X.copy()), - "Fail to transform the data after pickling ") + assert_array_equal(imputer.statistics_, statistics_mean) def test_imputation_copy(): @@ -208,13 +218,13 @@ def test_imputation_copy(): # copy=True, dense => copy X = X_orig.copy().toarray() imputer = KNNImputer(missing_values=0, copy=True) - Xt = imputer.fit(X).transform(X) + Xt = imputer.fit_transform(X) Xt[0, 0] = -1 assert_false(np.all(X == Xt)) # copy=False, dense => no copy X = X_orig.copy().toarray() imputer = KNNImputer(missing_values=0, copy=False) - Xt = imputer.fit(X).transform(X) + Xt = imputer.fit_transform(X) Xt[0, 0] = -1 assert_array_equal(X, Xt) From 29bdccba82c609dc663c583c1062e2bc9f75729f Mon Sep 17 00:00:00 2001 From: harke Date: Thu, 17 Aug 2017 22:39:07 -0500 Subject: [PATCH 33/97] Make NearestNeighbor import local to fit --- sklearn/preprocessing/imputation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 1a943fceae7de..4f0af2be273e8 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -13,7 +13,6 @@ from ..utils.sparsefuncs import _get_median from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES -from ..neighbors import NearestNeighbors from ..neighbors.base import _get_weights, _check_weights from ..externals import six @@ -487,6 +486,9 @@ def fit(self, X, y=None): self : object Returns self. 
""" + # Import NearestNeighbor here to avoid circular import + from ..neighbors import NearestNeighbors + # Check parameters force_all_finite = False if self.missing_values in ["NaN", np.nan] else True From 6bb5471615c85e931170bef25edc0f144215eed0 Mon Sep 17 00:00:00 2001 From: harke Date: Thu, 17 Aug 2017 23:07:49 -0500 Subject: [PATCH 34/97] Updated doc/modules/preprocessing.rst --- doc/modules/preprocessing.rst | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index a4e1364a85ae6..03652e4af503f 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -551,6 +551,34 @@ values than observed values. :class:`Imputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. +The :class:`KNNImputer` class provides imputation for completing missing +values using the k-Nearest Neighbors approach. Broadly speaking, the +imputation is performed using either the weighted or the unweighted +statistic (ex., mean) of the missing feature value of the desired number +of neighbors. In case if all of the 'k' neighbors also have the desired +feature value missing, then the value is imputed to be the "column" mean, +that is the overall feature mean. + +The following snippet demonstrates how to replace missing values, +encoded as ``np.nan``, using the mean feature value of the two nearest +neighbors of the rows that contain the missing values:: + + >>> import numpy as np + >>> from sklearn.preprocessing.imputation import KNNImputer + >>> nan = np.nan + >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]] + >>> imputer = KNNImputer(n_neighbors=2, weights="uniform") + >>> imputer.fit_transform(X) + array([[ 1. , 2. , 4. ], + [ 3. , 4. , 3. ], + [ 5.5, 6. , 5. ], + [ 8. , 8. , 7. 
]]) + + +:class:`KNNImputer` can also be used in a Pipeline as a way to build a +composite estimator that supports imputation. +See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. + .. _polynomial_features: Generating polynomial features From e393cb068a8d4d0a5b6d2c80ebeeaf3aa595ac9a Mon Sep 17 00:00:00 2001 From: harke Date: Fri, 18 Aug 2017 00:08:57 -0500 Subject: [PATCH 35/97] More circular import fixes --- sklearn/preprocessing/imputation.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 4f0af2be273e8..b82264e1b67b8 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -13,7 +13,7 @@ from ..utils.sparsefuncs import _get_median from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES -from ..neighbors.base import _get_weights, _check_weights +# from ..neighbors.base import _get_weights, _check_weights from ..externals import six @@ -486,8 +486,9 @@ def fit(self, X, y=None): self : object Returns self. """ - # Import NearestNeighbor here to avoid circular import + # Imports here to avoid circular import from ..neighbors import NearestNeighbors + from ..neighbors.base import _check_weights # Check parameters force_all_finite = False if self.missing_values in ["NaN", @@ -549,6 +550,9 @@ def _transform(self, X, n_neighbors_new): Calling transform() automatically sets this to self.n_neighbors while fit_transform() sets it to self.n_neighbors + 1. 
""" + # Import(s) here to avoid circular import + from ..neighbors.base import _get_weights + check_is_fitted(self, 'statistics_') force_all_finite = False if self.missing_values in ["NaN", np.nan] else True From 6e5ec308918e7ac43fc62024e7755dd5e92d55a4 Mon Sep 17 00:00:00 2001 From: harke Date: Fri, 18 Aug 2017 01:25:24 -0500 Subject: [PATCH 36/97] pep8 fixes --- sklearn/preprocessing/imputation.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index b82264e1b67b8..c1c8774c1d477 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -610,7 +610,8 @@ def _transform(self, X, n_neighbors_new): knn_row_index, row_repeats, axis=0).ravel() # Get column index of donors - # NOTE: Following assumes columns in X and _fit_X are in the same order + # NOTE: Following assumes columns in X and _fit_X are in the same + # order row_missing_index, col_missing_index = np.where(mask) knn_col_index = np.repeat(col_missing_index, self.n_neighbors) @@ -622,7 +623,8 @@ def _transform(self, X, n_neighbors_new): donors, mask=donors_mask) imputed = np.ma.average(donors, axis=1, weights=weights) X[mask] = imputed.data - unimputed_index = np.where(donors_mask.sum(axis=1) == self.n_neighbors) + unimputed_index = np.where( + donors_mask.sum(axis=1) == self.n_neighbors) # imputed_mask = _get_mask(imputed.data, self.missing_values) if len(unimputed_index[0]) > 0: # unimputed_loc = np.where(imputed_mask) @@ -643,7 +645,8 @@ def _transform(self, X, n_neighbors_new): X = X_merged # Impute bad_rows and leftover missing with column means - # mask_after_knn = _get_mask(X[row_has_missing, :], self.missing_values) + # mask_after_knn = _get_mask(X[row_has_missing, :], + # self.missing_values) # if np.any(mask_after_knn): # missing_index = np.where(mask_after_knn) # X[row_has_missing, :][missing_index] = np.take(self.statistics_, @@ -670,4 +673,4 @@ def 
transform(self, X): The input data to complete. """ check_is_fitted(self, 'statistics_') - return self._transform(X, n_neighbors_new=self.n_neighbors) \ No newline at end of file + return self._transform(X, n_neighbors_new=self.n_neighbors) From dd027f9909f0503dc264ffe6fede327633d59f1d Mon Sep 17 00:00:00 2001 From: harke Date: Fri, 18 Aug 2017 13:38:18 -0500 Subject: [PATCH 37/97] Minor comment updates --- sklearn/preprocessing/imputation.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index c1c8774c1d477..8fd7471e24ba1 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -655,12 +655,18 @@ def _transform(self, X, n_neighbors_new): return X def fit_transform(self, X, y=None, **fit_params): - """Impute all missing values in X. + """Fit KNNImputer and impute all missing values in X. Parameters ---------- - X : {array-like}, shape = [n_samples, n_features] - The input data to complete. + X : {array-like}, shape (n_samples, n_features) + Input data, where ``n_samples`` is the number of samples and + ``n_features`` is the number of features. + + Returns + ------- + X : {array-like}, shape (n_samples, n_features) + Returns imputed dataset. """ return self.fit(X)._transform(X, n_neighbors_new=self.n_neighbors + 1) @@ -671,6 +677,11 @@ def transform(self, X): ---------- X : {array-like}, shape = [n_samples, n_features] The input data to complete. + + Returns + ------- + X : {array-like}, shape (n_samples, n_features) + Returns imputed dataset. 
""" check_is_fitted(self, 'statistics_') return self._transform(X, n_neighbors_new=self.n_neighbors) From f33bff48847c7c6900ea6e7cd19b5e33df09f993 Mon Sep 17 00:00:00 2001 From: harke Date: Sat, 19 Aug 2017 21:43:20 -0500 Subject: [PATCH 38/97] Addressed review comments (part 2) --- doc/modules/preprocessing.rst | 20 +- sklearn/preprocessing/imputation.py | 84 ++++---- .../tests/test_knn_imputation.py | 193 +++++++++++++++++- 3 files changed, 239 insertions(+), 58 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 03652e4af503f..b54f3ddde4987 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -552,12 +552,15 @@ values than observed values. estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. The :class:`KNNImputer` class provides imputation for completing missing -values using the k-Nearest Neighbors approach. Broadly speaking, the -imputation is performed using either the weighted or the unweighted -statistic (ex., mean) of the missing feature value of the desired number -of neighbors. In case if all of the 'k' neighbors also have the desired -feature value missing, then the value is imputed to be the "column" mean, -that is the overall feature mean. +values using the k-Nearest Neighbors approach. Each sample's missing values +are imputed from up to n_neighbors nearest neighbors found in the training set. +Each missing feature is then imputed as the average, either weighted or +unweighted, of the neighbors who have a value for it. +When any of the neighbors themselves have the feature value missing then +the remaining n_neighbors-1 neighbors are used and, if need be, +the process repeats until a single neighbor remains. Where all neighbors have +that feature value missing, the training set average for that feature is used. +For more information on the methodology, see ref. [#]_. 
The following snippet demonstrates how to replace missing values, encoded as ``np.nan``, using the mean feature value of the two nearest @@ -579,6 +582,11 @@ neighbors of the rows that contain the missing values:: composite estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. +.. [#] Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor +Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing value +estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17 no. 6, 2001 +Pages 520-525. + .. _polynomial_features: Generating polynomial features diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 8fd7471e24ba1..8715cbaea05f9 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -13,7 +13,6 @@ from ..utils.sparsefuncs import _get_median from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES -# from ..neighbors.base import _get_weights, _check_weights from ..externals import six @@ -379,10 +378,13 @@ def transform(self, X): class KNNImputer(BaseEstimator, TransformerMixin): - """Imputation for completing missing values using Nearest Neighbors. + """Imputation for completing missing values using k-Nearest Neighbors. - Broadly speaking, the imputation is performed using either - the weighted or the unweighted mean of the desired number of neighbors. + Each sample's missing values are imputed from up to "n_neighbors" nearest + neighbors found in the training set. Each missing feature is then + imputed as the average, either weighted or unweighted, of these neighbors + who have a value for it. Where all neighbors have that feature value + missing, the training set average for that feature is used for imputation. Parameters ---------- @@ -392,12 +394,10 @@ class KNNImputer(BaseEstimator, TransformerMixin): use the string value "NaN". 
n_neighbors : int, optional (default = 5) - Maximum number of neighboring samples to use for imputation. When - any of the neighbors themselves have the feature value missing then - the remaining n_neighbors-1 neighbors are used and, if need be, - the process repeats until a single neighbor remains. If all - the neighbors have the feature value missing, then the overall feature - mean is used for imputation. + Maximum number of neighboring samples to use for imputation. When any + of the neighbors themselves have the feature value missing then the + remaining n_neighbors-1 neighbors are used and, if need be, the + process repeats until a single neighbor remains. weights : str or callable weight function used in prediction. Possible values: @@ -498,7 +498,7 @@ def fit(self, X, y=None): self.weights = _check_weights(self.weights) # Check for +/- inf - if (np.any(np.isinf(X))): + if np.any(np.isinf(X)): raise ValueError("+/- inf values are not allowed.") # Check if % missing in any column > col_max_missing @@ -515,9 +515,9 @@ def fit(self, X, y=None): bad_rows = mask.sum(axis=1) > (mask.shape[1] * self.row_max_missing) if np.any(bad_rows): warnings.warn( - "The following rows have more than {0}% missing values and " - "are not included as donor neighbors: {1}" - .format(self.row_max_missing*100, np.where(bad_rows))) + "There are rows with more than {0}% missing values. These " + "rows are not included as donor neighbors." 
+ .format(self.row_max_missing*100)) # Remove rows that have more than row_max_missing % missing X = X[~bad_rows, :] @@ -559,7 +559,7 @@ def _transform(self, X, n_neighbors_new): X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES, force_all_finite=force_all_finite, copy=self.copy) # Check for +/- inf - if (np.any(np.isinf(X))): + if np.any(np.isinf(X)): raise ValueError("+/- inf values are not allowed.") # Get fitted data and ensure correct dimension @@ -572,46 +572,45 @@ def _transform(self, X, n_neighbors_new): row_total_missing = mask.sum(axis=1) if not np.any(row_total_missing > 0): return X - # row_has_missing = row_total_missing.astype(np.bool) # Check for excessive missingness in rows bad_rows = row_total_missing > (mask.shape[1] * self.row_max_missing) if np.any(bad_rows): warnings.warn( - "The following rows have more than {0}% missing values and " - "are imputed with column means: {1}" - .format(self.row_max_missing*100, np.where(bad_rows))) + "There are rows with more than {0}% missing values. The " + "missing features in these rows are imputed with column means." 
+ .format(self.row_max_missing*100)) X_bad = X[bad_rows, :] X = X[~bad_rows, :] mask = _get_mask(X, self.missing_values) row_total_missing = mask.sum(axis=1) row_has_missing = row_total_missing.astype(np.bool) - # Check if the X in fit() and transform() are the same - # if X is fitted_X or X.base is fitted_X.base: - # neighbors = self._fitted_neighbors.kneighbors( - # n_neighbors=self.n_neighbors) - # else: - # neighbors = self._fitted_neighbors.kneighbors( - # X[row_has_missing, :], n_neighbors=self.n_neighbors) + if np.any(row_has_missing): neighbors = self._fitted_neighbors.kneighbors( X[row_has_missing, :], n_neighbors=n_neighbors_new) # Get row index, distance, and weights of donors knn_distances, knn_row_index = neighbors + # Remove self from list of donors if n_neighbors_new > self.n_neighbors: - knn_distances = knn_distances[:, 1:] - knn_row_index = knn_row_index[:, 1:] + row_index = np.arange(X.shape[0]).reshape((X.shape[0], 1)) + row_index = row_index[row_has_missing, :] + not_duplicate_index = np.where(~(row_index == knn_row_index)) + knn_row_index = knn_row_index[not_duplicate_index].reshape( + (-1, self.n_neighbors)) + knn_distances = knn_distances[not_duplicate_index].reshape( + (-1, self.n_neighbors)) weights = _get_weights(knn_distances, self.weights) + # Vertically split sets of k-donor indices and repeat each set by + # missing count in the corresponding recipient row knn_row_index = np.vsplit(knn_row_index, knn_row_index.shape[0]) row_repeats = row_total_missing[row_total_missing != 0] knn_row_index = np.repeat( knn_row_index, row_repeats, axis=0).ravel() # Get column index of donors - # NOTE: Following assumes columns in X and _fit_X are in the same - # order row_missing_index, col_missing_index = np.where(mask) knn_col_index = np.repeat(col_missing_index, self.n_neighbors) @@ -625,16 +624,13 @@ def _transform(self, X, n_neighbors_new): X[mask] = imputed.data unimputed_index = np.where( donors_mask.sum(axis=1) == self.n_neighbors) - # 
imputed_mask = _get_mask(imputed.data, self.missing_values) if len(unimputed_index[0]) > 0: - # unimputed_loc = np.where(imputed_mask) - # unimputed_rows, unimputed_cols = np.where(mask) unimputed_rows = row_missing_index[unimputed_index] unimputed_cols = col_missing_index[unimputed_index] X[(unimputed_rows, unimputed_cols)] = np.take(self.statistics_, unimputed_cols) - # Merge bad rows to X and mean impute any leftover missing + # Merge bad rows to X and mean impute their missing if np.any(bad_rows): bad_missing_index = np.where(_get_mask(X_bad, self.missing_values)) X_bad[bad_missing_index] = np.take(self.statistics_, @@ -643,19 +639,12 @@ def _transform(self, X, n_neighbors_new): X_merged[bad_rows, :] = X_bad X_merged[~bad_rows, :] = X X = X_merged - - # Impute bad_rows and leftover missing with column means - # mask_after_knn = _get_mask(X[row_has_missing, :], - # self.missing_values) - # if np.any(mask_after_knn): - # missing_index = np.where(mask_after_knn) - # X[row_has_missing, :][missing_index] = np.take(self.statistics_, - # missing_index[1]) - return X def fit_transform(self, X, y=None, **fit_params): - """Fit KNNImputer and impute all missing values in X. + """Fit KNNImputer and impute all missing values in X. This method + should be used if the data to be fitted is the same as the data to + be transformed. Parameters ---------- @@ -671,7 +660,12 @@ def fit_transform(self, X, y=None, **fit_params): return self.fit(X)._transform(X, n_neighbors_new=self.n_neighbors + 1) def transform(self, X): - """Impute all missing values in X. + """Impute all missing values in X. This method should be used if the + data to be fitted is different from the data to be transformed. + + WARNING: If the same dataset is passed in fit() and transform(), + one of the returned "neighbors" maybe the sample itself. Use + fit_transform() to avoid this behavior. 
Parameters ---------- diff --git a/sklearn/preprocessing/tests/test_knn_imputation.py b/sklearn/preprocessing/tests/test_knn_imputation.py index f8dcd56321d70..6f7fc428474c2 100644 --- a/sklearn/preprocessing/tests/test_knn_imputation.py +++ b/sklearn/preprocessing/tests/test_knn_imputation.py @@ -8,6 +8,7 @@ from sklearn.utils.testing import assert_false from sklearn.preprocessing.imputation import KNNImputer +from sklearn.neighbors import NearestNeighbors from sklearn.random_projection import sparse_random_matrix @@ -33,6 +34,9 @@ def test_knn_imputation_zero(): imputer = KNNImputer(missing_values=missing_values, n_neighbors=n_neighbors, weights="uniform") + imputer_nan = KNNImputer(missing_values=np.nan, + n_neighbors=n_neighbors, + weights="uniform") # Test with missing_values=0 when NaN present X = np.array([ @@ -52,24 +56,42 @@ def test_knn_imputation_zero(): ]) assert_raises(ValueError, imputer.fit, X) - # Test with an imputable matrix + # Test with an imputable matrix and also compare with missing_values="NaN" + # X = np.array([ + # [1, 0, 1, 0, 5], + # [2, 1, 2, 2, 3], + # [3, 2, 3, 0, 0], + # [6, 6, 0, 5, 13], + # ]) + X = np.array([ - [1, 0, 1, 0, 5], + [1, 0, 1, 0, 1], [2, 1, 2, 2, 3], [3, 2, 3, 0, 0], - [6, 6, 0, 5, 13], + [6, 6, 0, 5, 17], + ]) + + X_nan = np.array([ + [1, np.nan, 1, np.nan, 1], + [2, 1, 2, 2, 3], + [3, 2, 3, np.nan, np.nan], + [6, 6, np.nan, 5, 17], ]) statistics_mean = [3, 3, 2, 3.5, 7] X_imputed = np.array([ - [1, 1.5, 1, 2, 5], + [1, 1.5, 1, 2, 1], [2, 1, 2, 2, 3], - [3, 2, 3, 2, 4], - [6, 6, 1.5, 5, 13], + [3, 2, 3, 2, 2], + [6, 6, 1.5, 5, 17], ]) assert_array_equal(imputer.fit_transform(X), X_imputed) assert_array_equal(imputer.statistics_, statistics_mean) + # The following fails at the moment as NearestNeighbors object does not + # pass missing_values=0 to pairwise_distances() + # assert_array_equal(imputer.fit_transform(X), imputer_nan.fit_transform( + # X_nan)) def test_knn_imputation_default(): @@ -188,7 +210,164 @@ def 
test_knn_imputation_default(): assert_array_equal(imputer.fit_transform(X), X_imputed) assert_array_equal(imputer.statistics_, statistics_mean) - # Test with weights = "distance" + # Test when data in fit() and transform() are different + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 16] + ]) + statistics_mean = [6, 6] + + Y = np.array([ + [1, 0], + [3, 2], + [4, np.nan] + ]) + + Y_imputed = np.array([ + [1, 0], + [3, 2], + [4, 4.8] + ]) + + imputer = KNNImputer() + assert_array_equal(imputer.fit(X).transform(Y), Y_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) + + +def test_knn_n_neighbors(): + + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, np.nan], + [7, 7], + [np.nan, 8], + [14, 13] + ]) + statistics_mean = [6, 5.5] + + # Test with 1 neighbor + X_imputed_1NN = np.array([ + [0, 0], + [4, 2], + [4, 3], + [5, 3], + [7, 7], + [7, 8], + [14, 13] + ]) + + n_neighbors = 1 + imputer = KNNImputer(n_neighbors=n_neighbors) + imputer_plus1 = KNNImputer(n_neighbors=n_neighbors+1) + + assert_array_equal(imputer.fit_transform(X), X_imputed_1NN) + assert_array_equal(imputer.statistics_, statistics_mean) + assert_array_equal(imputer.fit_transform(X), imputer_plus1.fit( + X).transform(X)) + + # Test with 6 neighbors + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, np.nan], + [7, 7], + [np.nan, 8], + [14, 13] + ]) + + X_imputed_6NN = np.array([ + [0, 0], + [6, 2], + [4, 3], + [5, 5.5], + [7, 7], + [6, 8], + [14, 13] + ]) + + n_neighbors = 6 + imputer = KNNImputer(n_neighbors=6) + imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1) + + assert_array_equal(imputer.fit_transform(X), X_imputed_6NN) + assert_array_equal(imputer.statistics_, statistics_mean) + assert_array_equal(imputer.fit_transform(X), imputer_plus1.fit( + X).transform(X)) + + +def test_weight_type(): + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + # Test with "uniform" weight (or 
unweighted) + X_imputed_uniform = np.array([ + [0, 0], + [5, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + imputer = KNNImputer(weights="uniform") + assert_array_equal(imputer.fit_transform(X), X_imputed_uniform) + + # Test with "distance" weight + nn = NearestNeighbors(metric="masked_euclidean") + nn.fit(X) + # Get distance of "n_neighbors" neighbors of row 1 + dist, index = nn.kneighbors() + dist = dist[1, :] + index = index[1, :] + weights = 1 / dist + values = X[index, 0] + imputed = np.dot(values, weights) / np.sum(weights) + + # Manual calculation + X_imputed_distance1 = np.array([ + [0, 0], + [3.850393700787402, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + # NearestNeighbor calculation + X_imputed_distance2 = np.array([ + [0, 0], + [imputed, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + imputer = KNNImputer(weights="distance") + assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance1, + decimal=6) + assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance2, + decimal=6) + + # Test with weights = "distance" and n_neighbors=2 X = np.array([ [np.nan, 0, 0], [2, 1, 2], From 2e1ea48c5e1da3874c6cc9801a27a56df6de4cc0 Mon Sep 17 00:00:00 2001 From: harke Date: Sat, 19 Aug 2017 23:11:37 -0500 Subject: [PATCH 39/97] Fixed pyflex issues --- .../preprocessing/tests/test_knn_imputation.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/tests/test_knn_imputation.py b/sklearn/preprocessing/tests/test_knn_imputation.py index 6f7fc428474c2..f9d5ea1af2483 100644 --- a/sklearn/preprocessing/tests/test_knn_imputation.py +++ b/sklearn/preprocessing/tests/test_knn_imputation.py @@ -34,9 +34,9 @@ def test_knn_imputation_zero(): imputer = KNNImputer(missing_values=missing_values, n_neighbors=n_neighbors, weights="uniform") - imputer_nan = KNNImputer(missing_values=np.nan, - n_neighbors=n_neighbors, - weights="uniform") + # imputer_nan = 
KNNImputer(missing_values=np.nan, + # n_neighbors=n_neighbors, + # weights="uniform") # Test with missing_values=0 when NaN present X = np.array([ @@ -71,12 +71,12 @@ def test_knn_imputation_zero(): [6, 6, 0, 5, 17], ]) - X_nan = np.array([ - [1, np.nan, 1, np.nan, 1], - [2, 1, 2, 2, 3], - [3, 2, 3, np.nan, np.nan], - [6, 6, np.nan, 5, 17], - ]) + # X_nan = np.array([ + # [1, np.nan, 1, np.nan, 1], + # [2, 1, 2, 2, 3], + # [3, 2, 3, np.nan, np.nan], + # [6, 6, np.nan, 5, 17], + # ]) statistics_mean = [3, 3, 2, 3.5, 7] X_imputed = np.array([ From 109849928871522fd8a97a01f526e854a2b274b4 Mon Sep 17 00:00:00 2001 From: harke Date: Sun, 3 Sep 2017 18:43:12 -0500 Subject: [PATCH 40/97] Added test for callable weights and updated comments. --- sklearn/preprocessing/imputation.py | 25 ++++++++----------- .../tests/test_knn_imputation.py | 23 ++++++++++++++++- 2 files changed, 33 insertions(+), 15 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 8715cbaea05f9..86a651dc1c02c 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -388,7 +388,7 @@ class KNNImputer(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : integer or "NaN", optional (default="NaN") + missing_values : integer or "NaN", optional (default = "NaN") The placeholder for the missing values. All occurrences of `missing_values` will be imputed. For missing values encoded as np.nan, use the string value "NaN". @@ -399,8 +399,8 @@ class KNNImputer(BaseEstimator, TransformerMixin): remaining n_neighbors-1 neighbors are used and, if need be, the process repeats until a single neighbor remains. - weights : str or callable - weight function used in prediction. Possible values: + weights : str or callable, optional (default = "uniform") + Weight function used in prediction. Possible values: - 'uniform' : uniform weights. All points in each neighborhood are weighted equally. 
@@ -411,10 +411,8 @@ class KNNImputer(BaseEstimator, TransformerMixin): array of distances, and returns an array of the same shape containing the weights. - Uniform weights are used by default. - - metric : string or callable, optional (default = 'masked_euclidean') - metric to use for distance computation. + metric : string, optional (default = 'masked_euclidean') + Metric to use for distance computation. row_max_missing : float, optional (default = 0.5) The maximum percentage of columns (i.e. features) that can be missing @@ -427,7 +425,7 @@ class KNNImputer(BaseEstimator, TransformerMixin): The maximum percentage of rows (or samples) that can be missing for a given feature beyond which an error is raised. - copy : boolean, optional (default=True) + copy : boolean, optional (default = True) If True, a copy of X will be created. If False, imputation will be done in-place whenever possible. Note that, if metric is "masked_euclidean" and copy=False then missing_values in the @@ -435,10 +433,9 @@ class KNNImputer(BaseEstimator, TransformerMixin): Attributes ---------- - statistics_ : {tuple} - A tuple whose first element is the input dataset used to fit the - KNNImputer object and the second element is the column means of that - dataset using observed (i.e. non-missing) values. + statistics_ : 1-D array of length {n_features} + The 1-D array contains the mean of each feature calculated using + observed (i.e. non-missing) values. References ---------- @@ -660,8 +657,8 @@ def fit_transform(self, X, y=None, **fit_params): return self.fit(X)._transform(X, n_neighbors_new=self.n_neighbors + 1) def transform(self, X): - """Impute all missing values in X. This method should be used if the - data to be fitted is different from the data to be transformed. + """Impute all missing values in X. This method should only be used + if the data to be fitted is different from the data to be transformed. 
WARNING: If the same dataset is passed in fit() and transform(), one of the returned "neighbors" maybe the sample itself. Use diff --git a/sklearn/preprocessing/tests/test_knn_imputation.py b/sklearn/preprocessing/tests/test_knn_imputation.py index f9d5ea1af2483..3508d1be03161 100644 --- a/sklearn/preprocessing/tests/test_knn_imputation.py +++ b/sklearn/preprocessing/tests/test_knn_imputation.py @@ -11,7 +11,6 @@ from sklearn.neighbors import NearestNeighbors from sklearn.random_projection import sparse_random_matrix - def test_knn_imputation_shape(): # Verify the shapes of the imputed matrix for different weights and # number of neighbors. @@ -328,6 +327,13 @@ def test_weight_type(): imputer = KNNImputer(weights="uniform") assert_array_equal(imputer.fit_transform(X), X_imputed_uniform) + # Test with "callable" weight + def no_weight(dist): + return None + + imputer = KNNImputer(weights=no_weight) + assert_array_equal(imputer.fit_transform(X), X_imputed_uniform) + # Test with "distance" weight nn = NearestNeighbors(metric="masked_euclidean") nn.fit(X) @@ -390,6 +396,21 @@ def test_weight_type(): assert_array_equal(imputer.statistics_, statistics_mean) +def test_metric_type(): + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + # Test with a metric type without NaN support + imputer = KNNImputer(metric="euclidean") + assert_raises(ValueError, imputer.fit, X) + def test_imputation_copy(): # Test imputation with copy X_orig = sparse_random_matrix(10, 10, density=0.75, random_state=0) From a698120dd3be4b78208cd1a51d20a4877f885a1a Mon Sep 17 00:00:00 2001 From: harke Date: Sun, 3 Sep 2017 18:59:46 -0500 Subject: [PATCH 41/97] Pep8 fixes --- sklearn/preprocessing/tests/test_knn_imputation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/preprocessing/tests/test_knn_imputation.py b/sklearn/preprocessing/tests/test_knn_imputation.py index 3508d1be03161..b8e36e2026faf 100644 --- 
a/sklearn/preprocessing/tests/test_knn_imputation.py +++ b/sklearn/preprocessing/tests/test_knn_imputation.py @@ -11,6 +11,7 @@ from sklearn.neighbors import NearestNeighbors from sklearn.random_projection import sparse_random_matrix + def test_knn_imputation_shape(): # Verify the shapes of the imputed matrix for different weights and # number of neighbors. @@ -411,6 +412,7 @@ def test_metric_type(): imputer = KNNImputer(metric="euclidean") assert_raises(ValueError, imputer.fit, X) + def test_imputation_copy(): # Test imputation with copy X_orig = sparse_random_matrix(10, 10, density=0.75, random_state=0) From 95e0f56ec74901a886b1166a6347682c140d4128 Mon Sep 17 00:00:00 2001 From: harke Date: Thu, 14 Sep 2017 20:30:33 -0500 Subject: [PATCH 42/97] Comment, doc, and pep8 fixes --- doc/modules/preprocessing.rst | 6 ++--- examples/plot_missing_values.py | 7 +++--- sklearn/metrics/pairwise.py | 2 +- sklearn/preprocessing/imputation.py | 35 +++++++++++++++-------------- 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index b54f3ddde4987..d765423aa3244 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -583,9 +583,9 @@ composite estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. .. [#] Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor -Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing value -estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17 no. 6, 2001 -Pages 520-525. + Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing value + estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17 no. 6, 2001 + Pages 520-525. .. 
_polynomial_features: diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index cad10a95051dd..2dcd068df8ebe 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -10,14 +10,15 @@ more effective. Imputer: -Missing values can be replaced by the mean, the median or the most frequent +Using Imputer, missing values can be replaced by the mean, the median or the +most frequent value using the ``strategy`` hyper-parameter. The median is a more robust estimator for data with high magnitude variables which could dominate results (otherwise known as a 'long tail'). KNNImputer: -Missing values can be imputed using the weighted or unweighted mean of the -desired number of nearest neighbors. +Using KNNImputer, missing values can be imputed using the weighted or +unweighted mean of the desired number of nearest neighbors, if available. Script output:: diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index c00b6847e379d..2afd6b60026f7 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1381,7 +1381,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): X, _ = check_pairwise_arrays(X, Y, precomputed=True) return X elif metric in PAIRWISE_DISTANCE_FUNCTIONS: - func = PAIRWISE_DISTANCE_FUNCTIONS[metric] + func = PAIRWISE_DISTANCE_FUNCTIONS[metric] elif callable(metric): func = partial(_pairwise_callable, metric=metric, **kwds) else: diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 86a651dc1c02c..a4652988f1ec3 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -396,8 +396,8 @@ class KNNImputer(BaseEstimator, TransformerMixin): n_neighbors : int, optional (default = 5) Maximum number of neighboring samples to use for imputation. 
When any of the neighbors themselves have the feature value missing then the - remaining n_neighbors-1 neighbors are used and, if need be, the - process repeats until a single neighbor remains. + remaining neighbors, if any, that have the feature value available are + used. weights : str or callable, optional (default = "uniform") Weight function used in prediction. Possible values: @@ -411,9 +411,6 @@ class KNNImputer(BaseEstimator, TransformerMixin): array of distances, and returns an array of the same shape containing the weights. - metric : string, optional (default = 'masked_euclidean') - Metric to use for distance computation. - row_max_missing : float, optional (default = 0.5) The maximum percentage of columns (i.e. features) that can be missing before the sample is excluded from nearest neighbor imputation. It @@ -514,7 +511,7 @@ def fit(self, X, y=None): warnings.warn( "There are rows with more than {0}% missing values. These " "rows are not included as donor neighbors." - .format(self.row_max_missing*100)) + .format(self.row_max_missing * 100)) # Remove rows that have more than row_max_missing % missing X = X[~bad_rows, :] @@ -533,7 +530,7 @@ def fit(self, X, y=None): return self - def _transform(self, X, n_neighbors_new): + def _transform(self, X, adjusted_n_neighbors): """Impute all missing values in X. Parameters @@ -541,7 +538,7 @@ def _transform(self, X, n_neighbors_new): X : {array-like}, shape = [n_samples, n_features] The input data to complete. - n_neighbors_new : int + adjusted_n_neighbors : int Indicates whether to pass n_neighbors or n_neighbors+1 to _tranform(). 
Calling transform() automatically sets this to self.n_neighbors @@ -585,12 +582,12 @@ def _transform(self, X, n_neighbors_new): if np.any(row_has_missing): neighbors = self._fitted_neighbors.kneighbors( - X[row_has_missing, :], n_neighbors=n_neighbors_new) + X[row_has_missing, :], n_neighbors=adjusted_n_neighbors) # Get row index, distance, and weights of donors knn_distances, knn_row_index = neighbors # Remove self from list of donors - if n_neighbors_new > self.n_neighbors: + if adjusted_n_neighbors > self.n_neighbors: row_index = np.arange(X.shape[0]).reshape((X.shape[0], 1)) row_index = row_index[row_has_missing, :] not_duplicate_index = np.where(~(row_index == knn_row_index)) @@ -639,9 +636,10 @@ def _transform(self, X, n_neighbors_new): return X def fit_transform(self, X, y=None, **fit_params): - """Fit KNNImputer and impute all missing values in X. This method - should be used if the data to be fitted is the same as the data to - be transformed. + """Fit KNNImputer and impute all missing values in X. + + This method should *only* be used if the data to be fitted is the + same as the data to be transformed. Parameters ---------- @@ -654,11 +652,14 @@ def fit_transform(self, X, y=None, **fit_params): X : {array-like}, shape (n_samples, n_features) Returns imputed dataset. """ - return self.fit(X)._transform(X, n_neighbors_new=self.n_neighbors + 1) + return self.fit(X)._transform( + X, adjusted_n_neighbors=self.n_neighbors + 1) def transform(self, X): - """Impute all missing values in X. This method should only be used - if the data to be fitted is different from the data to be transformed. + """Impute all missing values in X. + + This method should *only* be used if the data to be fitted is different + from the data to be transformed. WARNING: If the same dataset is passed in fit() and transform(), one of the returned "neighbors" maybe the sample itself. Use @@ -675,4 +676,4 @@ def transform(self, X): Returns imputed dataset. 
""" check_is_fitted(self, 'statistics_') - return self._transform(X, n_neighbors_new=self.n_neighbors) + return self._transform(X, adjusted_n_neighbors=self.n_neighbors) From 215c8c98b5447661532028ac67850beaba06f54d Mon Sep 17 00:00:00 2001 From: harke Date: Thu, 14 Sep 2017 21:14:45 -0500 Subject: [PATCH 43/97] Docstring changes --- sklearn/preprocessing/imputation.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index a4652988f1ec3..eea82fd5dd4a3 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -411,6 +411,10 @@ class KNNImputer(BaseEstimator, TransformerMixin): array of distances, and returns an array of the same shape containing the weights. + metric : str, optional (default = "masked_euclidean") + Distance metric for searching neighbors. Possible values: + - 'masked_euclidean' + row_max_missing : float, optional (default = 0.5) The maximum percentage of columns (i.e. features) that can be missing before the sample is excluded from nearest neighbor imputation. 
It From fab313b15888535b1ea01a376f3f3664ce62e00c Mon Sep 17 00:00:00 2001 From: harke Date: Fri, 15 Sep 2017 12:50:37 -0500 Subject: [PATCH 44/97] Changes to unit tests as per review comments --- sklearn/neighbors/base.py | 5 +- sklearn/neighbors/unsupervised.py | 2 +- sklearn/preprocessing/imputation.py | 14 +- .../tests/test_knn_imputation.py | 162 +++++++++--------- 4 files changed, 92 insertions(+), 91 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 66dbc87fe39f9..b3633c9717284 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -367,8 +367,9 @@ class from an array representing our data set and ask who's # for efficiency, use squared euclidean distances if self.effective_metric_ in ['euclidean', 'masked_euclidean']: dist = pairwise_distances(X, self._fit_X, - self.effective_metric_, - n_jobs=n_jobs, squared=True) + metric=self.effective_metric_, + n_jobs=n_jobs, squared=True, + **self.effective_metric_params_) else: dist = pairwise_distances( X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, diff --git a/sklearn/neighbors/unsupervised.py b/sklearn/neighbors/unsupervised.py index cf7bf82d17fbd..fd103a65cf4aa 100644 --- a/sklearn/neighbors/unsupervised.py +++ b/sklearn/neighbors/unsupervised.py @@ -60,7 +60,7 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, Valid values for metric are: - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', - 'manhattan'] + 'manhattan', 'masked_euclidean] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index eea82fd5dd4a3..61cf043ca2508 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -502,11 +502,8 @@ def fit(self, X, y=None): # Check if % missing in any column > col_max_missing mask = _get_mask(X, self.missing_values) if 
np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)): - raise ValueError("The following columns have, " - "more than {0}% missing values: {1}" - .format(self.col_max_missing*100, np.where( - mask.sum(axis=0) > (X.shape[0] * - self.col_max_missing)))) + raise ValueError("Some column(s) have more than {}% missing values" + .format(self.col_max_missing*100)) X_col_means = np.ma.array(X, mask=mask).mean(axis=0).data # Check if % missing in any row > col_max_missing @@ -522,13 +519,14 @@ def fit(self, X, y=None): # Check if sufficient neighboring samples available if X.shape[0] < self.n_neighbors: - raise ValueError("There are only %d samples, " - "but n_neighbors=%d." + raise ValueError("There are only %d samples, but n_neighbors=%d." % (X.shape[0], self.n_neighbors)) # Instantiate NN object, get column means, and store in statistics_ neigh = NearestNeighbors(n_neighbors=self.n_neighbors, - metric=self.metric) + metric=self.metric, + metric_params={"missing_values": + self.missing_values}) self._fitted_neighbors = neigh.fit(X) self.statistics_ = X_col_means diff --git a/sklearn/preprocessing/tests/test_knn_imputation.py b/sklearn/preprocessing/tests/test_knn_imputation.py index b8e36e2026faf..9a4abe9ef5a71 100644 --- a/sklearn/preprocessing/tests/test_knn_imputation.py +++ b/sklearn/preprocessing/tests/test_knn_imputation.py @@ -5,6 +5,7 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_false from sklearn.preprocessing.imputation import KNNImputer @@ -34,9 +35,9 @@ def test_knn_imputation_zero(): imputer = KNNImputer(missing_values=missing_values, n_neighbors=n_neighbors, weights="uniform") - # imputer_nan = KNNImputer(missing_values=np.nan, - # n_neighbors=n_neighbors, - # weights="uniform") + imputer_nan = KNNImputer(missing_values="NaN", + 
n_neighbors=n_neighbors, + weights="uniform") # Test with missing_values=0 when NaN present X = np.array([ @@ -45,7 +46,8 @@ def test_knn_imputation_zero(): [np.nan, 2, 0, 0, 0], [np.nan, 6, 0, 5, 13], ]) - assert_raises(ValueError, imputer.fit, X) + msg = "Input contains NaN, infinity or a value too large for %r." % X.dtype + assert_raise_message(ValueError, msg, imputer.fit, X) # Test with % zeros in column > col_max_missing X = np.array([ @@ -54,16 +56,11 @@ def test_knn_imputation_zero(): [3, 2, 0, 0, 0], [4, 6, 0, 5, 13], ]) - assert_raises(ValueError, imputer.fit, X) + msg = "Some column(s) have more than {}% missing values".format( + imputer.col_max_missing * 100) + assert_raise_message(ValueError, msg, imputer.fit, X) # Test with an imputable matrix and also compare with missing_values="NaN" - # X = np.array([ - # [1, 0, 1, 0, 5], - # [2, 1, 2, 2, 3], - # [3, 2, 3, 0, 0], - # [6, 6, 0, 5, 13], - # ]) - X = np.array([ [1, 0, 1, 0, 1], [2, 1, 2, 2, 3], @@ -71,64 +68,29 @@ def test_knn_imputation_zero(): [6, 6, 0, 5, 17], ]) - # X_nan = np.array([ - # [1, np.nan, 1, np.nan, 1], - # [2, 1, 2, 2, 3], - # [3, 2, 3, np.nan, np.nan], - # [6, 6, np.nan, 5, 17], - # ]) + X_nan = np.array([ + [1, np.nan, 1, np.nan, 1], + [2, 1, 2, 2, 3], + [3, 2, 3, np.nan, np.nan], + [6, 6, np.nan, 5, 17], + ]) + statistics_mean = np.nanmean(X_nan, axis=0) - statistics_mean = [3, 3, 2, 3.5, 7] X_imputed = np.array([ [1, 1.5, 1, 2, 1], [2, 1, 2, 2, 3], [3, 2, 3, 2, 2], - [6, 6, 1.5, 5, 17], + [6, 6, 2.5, 5, 17], ]) assert_array_equal(imputer.fit_transform(X), X_imputed) assert_array_equal(imputer.statistics_, statistics_mean) - # The following fails at the moment as NearestNeighbors object does not - # pass missing_values=0 to pairwise_distances() - # assert_array_equal(imputer.fit_transform(X), imputer_nan.fit_transform( - # X_nan)) + assert_array_equal(imputer.fit_transform(X), imputer_nan.fit_transform( + X_nan)) def test_knn_imputation_default(): - # Test imputation with default 
values - # imputer = KNNImputer() - - # Test with % missing in a column > col_max_missing - X = np.array([ - [np.nan, 0, 0, 0, 5], - [np.nan, 1, 0, np.nan, 3], - [np.nan, 2, 0, 0, 0], - [np.nan, 6, 0, 5, 13], - [np.nan, 7, 0, 7, 8], - [np.nan, 8, 0, 8, 9], - ]) - assert_raises(ValueError, KNNImputer().fit, X) - - # Test with insufficient number of neighbors - imputer = KNNImputer() - X = np.array([ - [1, 1, 1, 2, np.nan], - [2, 1, 2, 2, 3], - [3, 2, 3, 3, 8], - [6, 6, 2, 5, 13], - ]) - assert_raises(ValueError, KNNImputer().fit, X) - - # Test with inf present - X = np.array([ - [np.inf, 1, 1, 2, np.nan], - [2, 1, 2, 2, 3], - [3, 2, 3, 3, 8], - [np.nan, 6, 0, 5, 13], - [np.nan, 7, 0, 7, 8], - [6, 6, 2, 5, 7], - ]) - assert_raises(ValueError, KNNImputer().fit, X) + # Test imputation with default parameter values # Test with an imputable matrix X = np.array([ @@ -140,8 +102,7 @@ def test_knn_imputation_default(): [8, 8, 8, 8], [16, 15, 18, 19], ]) - - statistics_mean = [6, 5, 6, 8] + statistics_mean = np.nanmean(X, axis=0) X_imputed = np.array([ [1, 0, 0, 1], @@ -167,8 +128,8 @@ def test_knn_imputation_default(): [8, 8, 8, 8], [np.nan, np.nan, np.nan, 19], ]) + statistics_mean = np.nanmean(X, axis=0) - statistics_mean = [4, 3, 4, 8] X_imputed = np.array([ [1, 0, 0, 1], [2, 1, 2, 5.25], @@ -188,25 +149,26 @@ def test_knn_imputation_default(): [1, 0, 0, np.nan], [2, 1, 2, np.nan], [3, 2, 3, np.nan], - [np.nan, 4, 5, 5], - [6, np.nan, 6, 7], - [8, 8, 8, 8], - [np.nan, np.nan, np.nan, 20], + [4, 4, 5, np.nan], + [6, 7, 6, np.nan], + [8, 8, 8, np.nan], + [20, 20, 20, 20], + [22, 22, 22, 22] ]) - - statistics_mean = [4, 3, 4, 10] + statistics_mean = np.nanmean(X, axis=0) X_imputed = np.array([ - [1, 0, 0, 10], - [2, 1, 2, 10], - [3, 2, 3, 5], - [4.5, 4, 5, 5], - [6, 6, 6, 7], - [8, 8, 8, 8], - [4, 3, 4, 20], + [1, 0, 0, 21], + [2, 1, 2, 21], + [3, 2, 3, 21], + [4, 4, 5, 21], + [6, 7, 6, 21], + [8, 8, 8, 21], + [20, 20, 20, 20], + [22, 22, 22, 22] ]) - imputer = 
KNNImputer(n_neighbors=2) + imputer = KNNImputer() assert_array_equal(imputer.fit_transform(X), X_imputed) assert_array_equal(imputer.statistics_, statistics_mean) @@ -220,7 +182,7 @@ def test_knn_imputation_default(): [9, 8], [11, 16] ]) - statistics_mean = [6, 6] + statistics_mean = np.nanmean(X, axis=0) Y = np.array([ [1, 0], @@ -239,6 +201,47 @@ def test_knn_imputation_default(): assert_array_equal(imputer.statistics_, statistics_mean) +def test_default_with_invalid_input(): + # Test imputation with default values and invalid input + + # Test with % missing in a column > col_max_missing + X = np.array([ + [np.nan, 0, 0, 0, 5], + [np.nan, 1, 0, np.nan, 3], + [np.nan, 2, 0, 0, 0], + [np.nan, 6, 0, 5, 13], + [np.nan, 7, 0, 7, 8], + [np.nan, 8, 0, 8, 9], + ]) + imputer = KNNImputer() + msg = "Some column(s) have more than {}% missing values".format( + imputer.col_max_missing * 100) + assert_raise_message(ValueError, msg, imputer.fit, X) + + # Test with insufficient number of neighbors + X = np.array([ + [1, 1, 1, 2, np.nan], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [6, 6, 2, 5, 13], + ]) + msg = "There are only %d samples, but n_neighbors=%d." % \ + (X.shape[0], imputer.n_neighbors) + assert_raise_message(ValueError, msg, imputer.fit, X) + + # Test with inf present + X = np.array([ + [np.inf, 1, 1, 2, np.nan], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [np.nan, 6, 0, 5, 13], + [np.nan, 7, 0, 7, 8], + [6, 6, 2, 5, 7], + ]) + msg = "+/- inf values are not allowed." 
+ assert_raise_message(ValueError, msg, KNNImputer().fit, X) + + def test_knn_n_neighbors(): X = np.array([ @@ -250,7 +253,7 @@ def test_knn_n_neighbors(): [np.nan, 8], [14, 13] ]) - statistics_mean = [6, 5.5] + statistics_mean = np.nanmean(X, axis=0) # Test with 1 neighbor X_imputed_1NN = np.array([ @@ -349,7 +352,7 @@ def no_weight(dist): # Manual calculation X_imputed_distance1 = np.array([ [0, 0], - [3.850393700787402, 2], + [3.850394, 2], [4, 3], [5, 6], [7, 7], @@ -381,8 +384,7 @@ def no_weight(dist): [3, 2, 3], [4, 5, 5], ]) - - statistics_mean = [3, 2, 2.5] + statistics_mean = np.nanmean(X, axis=0) X_imputed = np.array([ [2.3828, 0, 0], From b2d56402e3f4684ea57b11306893d16b3218f000 Mon Sep 17 00:00:00 2001 From: harke Date: Fri, 15 Sep 2017 13:11:37 -0500 Subject: [PATCH 45/97] Tests moved to test_imputation --- .../preprocessing/tests/test_imputation.py | 424 ++++++++++++++++- .../tests/test_knn_imputation.py | 434 ------------------ 2 files changed, 423 insertions(+), 435 deletions(-) delete mode 100644 sklearn/preprocessing/tests/test_knn_imputation.py diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 1bfbcd3adbaee..93533792f1f1d 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -1,13 +1,17 @@ - +from __future__ import division import numpy as np from scipy import sparse from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_false from sklearn.preprocessing.imputation import Imputer +from sklearn.preprocessing.imputation import KNNImputer +from sklearn.neighbors import NearestNeighbors from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from 
sklearn import tree @@ -294,6 +298,7 @@ def test_imputation_pickle(): def test_imputation_copy(): # Test imputation with copy X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0) + X_orig_knn = sparse_random_matrix(10, 10, density=0.75, random_state=0) # copy=True, dense => copy X = X_orig.copy().toarray() @@ -309,6 +314,12 @@ def test_imputation_copy(): Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data)) + X = X_orig_knn.copy().toarray() + imputer = KNNImputer(missing_values=0, copy=True) + Xt = imputer.fit_transform(X) + Xt[0, 0] = -1 + assert_false(np.all(X == Xt)) + # copy=False, dense => no copy X = X_orig.copy().toarray() imputer = Imputer(missing_values=0, strategy="mean", copy=False) @@ -316,6 +327,12 @@ def test_imputation_copy(): Xt[0, 0] = -1 assert_array_equal(X, Xt) + X = X_orig_knn.copy().toarray() + imputer = KNNImputer(missing_values=0, copy=False) + Xt = imputer.fit_transform(X) + Xt[0, 0] = -1 + assert_array_equal(X, Xt) + # copy=False, sparse csr, axis=1 => no copy X = X_orig.copy() imputer = Imputer(missing_values=X.data[0], strategy="mean", @@ -357,3 +374,408 @@ def test_imputation_copy(): # Note: If X is sparse and if missing_values=0, then a (dense) copy of X is # made, even if copy=False. + + +"""--------------------- BEGIN KNNIMPUTER TEST ---------------------""" + + +def test_knn_imputation_shape(): + # Verify the shapes of the imputed matrix for different weights and + # number of neighbors. 
+ n_rows = 10 + n_cols = 2 + X = np.random.rand(n_rows, n_cols) + X[0, 0] = np.nan + + for weights in ['uniform', 'distance']: + for n_neighbors in range(1, 6): + imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights) + X_imputed = imputer.fit_transform(X) + assert_equal(X_imputed.shape, (n_rows, n_cols)) + + +def test_knn_imputation_zero(): + # Test imputation when missing_values == 0 + missing_values = 0 + n_neighbors = 2 + imputer = KNNImputer(missing_values=missing_values, + n_neighbors=n_neighbors, + weights="uniform") + imputer_nan = KNNImputer(missing_values="NaN", + n_neighbors=n_neighbors, + weights="uniform") + + # Test with missing_values=0 when NaN present + X = np.array([ + [np.nan, 0, 0, 0, 5], + [np.nan, 1, 0, np.nan, 3], + [np.nan, 2, 0, 0, 0], + [np.nan, 6, 0, 5, 13], + ]) + msg = "Input contains NaN, infinity or a value too large for %r." % X.dtype + assert_raise_message(ValueError, msg, imputer.fit, X) + + # Test with % zeros in column > col_max_missing + X = np.array([ + [1, 0, 0, 0, 5], + [2, 1, 0, 2, 3], + [3, 2, 0, 0, 0], + [4, 6, 0, 5, 13], + ]) + msg = "Some column(s) have more than {}% missing values".format( + imputer.col_max_missing * 100) + assert_raise_message(ValueError, msg, imputer.fit, X) + + # Test with an imputable matrix and also compare with missing_values="NaN" + X = np.array([ + [1, 0, 1, 0, 1], + [2, 1, 2, 2, 3], + [3, 2, 3, 0, 0], + [6, 6, 0, 5, 17], + ]) + + X_nan = np.array([ + [1, np.nan, 1, np.nan, 1], + [2, 1, 2, 2, 3], + [3, 2, 3, np.nan, np.nan], + [6, 6, np.nan, 5, 17], + ]) + statistics_mean = np.nanmean(X_nan, axis=0) + + X_imputed = np.array([ + [1, 1.5, 1, 2, 1], + [2, 1, 2, 2, 3], + [3, 2, 3, 2, 2], + [6, 6, 2.5, 5, 17], + ]) + + assert_array_equal(imputer.fit_transform(X), X_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) + assert_array_equal(imputer.fit_transform(X), imputer_nan.fit_transform( + X_nan)) + + +def test_knn_imputation_default(): + # Test imputation with default 
parameter values + + # Test with an imputable matrix + X = np.array([ + [1, 0, 0, 1], + [2, 1, 2, np.nan], + [3, 2, 3, np.nan], + [np.nan, 4, 5, 5], + [6, np.nan, 6, 7], + [8, 8, 8, 8], + [16, 15, 18, 19], + ]) + statistics_mean = np.nanmean(X, axis=0) + + X_imputed = np.array([ + [1, 0, 0, 1], + [2, 1, 2, 5.25], + [3, 2, 3, 5.25], + [4, 4, 5, 5], + [6, 3, 6, 7], + [8, 8, 8, 8], + [16, 15, 18, 19], + ]) + + imputer = KNNImputer() + assert_array_equal(imputer.fit_transform(X), X_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) + + # Test with % missing in row > row_max_missing + X = np.array([ + [1, 0, 0, 1], + [2, 1, 2, np.nan], + [3, 2, 3, np.nan], + [np.nan, 4, 5, 5], + [6, np.nan, 6, 7], + [8, 8, 8, 8], + [np.nan, np.nan, np.nan, 19], + ]) + statistics_mean = np.nanmean(X, axis=0) + + X_imputed = np.array([ + [1, 0, 0, 1], + [2, 1, 2, 5.25], + [3, 2, 3, 5.25], + [4, 4, 5, 5], + [6, 3, 6, 7], + [8, 8, 8, 8], + [4, 3, 4, 19], + ]) + + imputer = KNNImputer() + assert_array_equal(imputer.fit_transform(X), X_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) + + # Test with all neighboring donors also having missing feature values + X = np.array([ + [1, 0, 0, np.nan], + [2, 1, 2, np.nan], + [3, 2, 3, np.nan], + [4, 4, 5, np.nan], + [6, 7, 6, np.nan], + [8, 8, 8, np.nan], + [20, 20, 20, 20], + [22, 22, 22, 22] + ]) + statistics_mean = np.nanmean(X, axis=0) + + X_imputed = np.array([ + [1, 0, 0, 21], + [2, 1, 2, 21], + [3, 2, 3, 21], + [4, 4, 5, 21], + [6, 7, 6, 21], + [8, 8, 8, 21], + [20, 20, 20, 20], + [22, 22, 22, 22] + ]) + + imputer = KNNImputer() + assert_array_equal(imputer.fit_transform(X), X_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) + + # Test when data in fit() and transform() are different + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 16] + ]) + statistics_mean = np.nanmean(X, axis=0) + + Y = np.array([ + [1, 0], + [3, 2], + [4, np.nan] + ]) + + 
Y_imputed = np.array([ + [1, 0], + [3, 2], + [4, 4.8] + ]) + + imputer = KNNImputer() + assert_array_equal(imputer.fit(X).transform(Y), Y_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) + + +def test_default_with_invalid_input(): + # Test imputation with default values and invalid input + + # Test with % missing in a column > col_max_missing + X = np.array([ + [np.nan, 0, 0, 0, 5], + [np.nan, 1, 0, np.nan, 3], + [np.nan, 2, 0, 0, 0], + [np.nan, 6, 0, 5, 13], + [np.nan, 7, 0, 7, 8], + [np.nan, 8, 0, 8, 9], + ]) + imputer = KNNImputer() + msg = "Some column(s) have more than {}% missing values".format( + imputer.col_max_missing * 100) + assert_raise_message(ValueError, msg, imputer.fit, X) + + # Test with insufficient number of neighbors + X = np.array([ + [1, 1, 1, 2, np.nan], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [6, 6, 2, 5, 13], + ]) + msg = "There are only %d samples, but n_neighbors=%d." % \ + (X.shape[0], imputer.n_neighbors) + assert_raise_message(ValueError, msg, imputer.fit, X) + + # Test with inf present + X = np.array([ + [np.inf, 1, 1, 2, np.nan], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [np.nan, 6, 0, 5, 13], + [np.nan, 7, 0, 7, 8], + [6, 6, 2, 5, 7], + ]) + msg = "+/- inf values are not allowed." 
+ assert_raise_message(ValueError, msg, KNNImputer().fit, X) + + +def test_knn_n_neighbors(): + + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, np.nan], + [7, 7], + [np.nan, 8], + [14, 13] + ]) + statistics_mean = np.nanmean(X, axis=0) + + # Test with 1 neighbor + X_imputed_1NN = np.array([ + [0, 0], + [4, 2], + [4, 3], + [5, 3], + [7, 7], + [7, 8], + [14, 13] + ]) + + n_neighbors = 1 + imputer = KNNImputer(n_neighbors=n_neighbors) + imputer_plus1 = KNNImputer(n_neighbors=n_neighbors+1) + + assert_array_equal(imputer.fit_transform(X), X_imputed_1NN) + assert_array_equal(imputer.statistics_, statistics_mean) + assert_array_equal(imputer.fit_transform(X), imputer_plus1.fit( + X).transform(X)) + + # Test with 6 neighbors + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, np.nan], + [7, 7], + [np.nan, 8], + [14, 13] + ]) + + X_imputed_6NN = np.array([ + [0, 0], + [6, 2], + [4, 3], + [5, 5.5], + [7, 7], + [6, 8], + [14, 13] + ]) + + n_neighbors = 6 + imputer = KNNImputer(n_neighbors=6) + imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1) + + assert_array_equal(imputer.fit_transform(X), X_imputed_6NN) + assert_array_equal(imputer.statistics_, statistics_mean) + assert_array_equal(imputer.fit_transform(X), imputer_plus1.fit( + X).transform(X)) + + +def test_weight_type(): + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + # Test with "uniform" weight (or unweighted) + X_imputed_uniform = np.array([ + [0, 0], + [5, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + imputer = KNNImputer(weights="uniform") + assert_array_equal(imputer.fit_transform(X), X_imputed_uniform) + + # Test with "callable" weight + def no_weight(dist): + return None + + imputer = KNNImputer(weights=no_weight) + assert_array_equal(imputer.fit_transform(X), X_imputed_uniform) + + # Test with "distance" weight + nn = NearestNeighbors(metric="masked_euclidean") + nn.fit(X) + # Get distance of "n_neighbors" neighbors of 
row 1 + dist, index = nn.kneighbors() + dist = dist[1, :] + index = index[1, :] + weights = 1 / dist + values = X[index, 0] + imputed = np.dot(values, weights) / np.sum(weights) + + # Manual calculation + X_imputed_distance1 = np.array([ + [0, 0], + [3.850394, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + # NearestNeighbor calculation + X_imputed_distance2 = np.array([ + [0, 0], + [imputed, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + imputer = KNNImputer(weights="distance") + assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance1, + decimal=6) + assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance2, + decimal=6) + + # Test with weights = "distance" and n_neighbors=2 + X = np.array([ + [np.nan, 0, 0], + [2, 1, 2], + [3, 2, 3], + [4, 5, 5], + ]) + statistics_mean = np.nanmean(X, axis=0) + + X_imputed = np.array([ + [2.3828, 0, 0], + [2, 1, 2], + [3, 2, 3], + [4, 5, 5], + ]) + + imputer = KNNImputer(n_neighbors=2, weights="distance") + assert_array_almost_equal(imputer.fit_transform(X), X_imputed, + decimal=4) + assert_array_equal(imputer.statistics_, statistics_mean) + + +def test_metric_type(): + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + # Test with a metric type without NaN support + imputer = KNNImputer(metric="euclidean") + assert_raises(ValueError, imputer.fit, X) diff --git a/sklearn/preprocessing/tests/test_knn_imputation.py b/sklearn/preprocessing/tests/test_knn_imputation.py deleted file mode 100644 index 9a4abe9ef5a71..0000000000000 --- a/sklearn/preprocessing/tests/test_knn_imputation.py +++ /dev/null @@ -1,434 +0,0 @@ -from __future__ import division -import numpy as np - -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_array_almost_equal -from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import 
assert_raise_message -from sklearn.utils.testing import assert_false - -from sklearn.preprocessing.imputation import KNNImputer -from sklearn.neighbors import NearestNeighbors -from sklearn.random_projection import sparse_random_matrix - - -def test_knn_imputation_shape(): - # Verify the shapes of the imputed matrix for different weights and - # number of neighbors. - n_rows = 10 - n_cols = 2 - X = np.random.rand(n_rows, n_cols) - X[0, 0] = np.nan - - for weights in ['uniform', 'distance']: - for n_neighbors in range(1, 6): - imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights) - X_imputed = imputer.fit_transform(X) - assert_equal(X_imputed.shape, (n_rows, n_cols)) - - -def test_knn_imputation_zero(): - # Test imputation when missing_values == 0 - missing_values = 0 - n_neighbors = 2 - imputer = KNNImputer(missing_values=missing_values, - n_neighbors=n_neighbors, - weights="uniform") - imputer_nan = KNNImputer(missing_values="NaN", - n_neighbors=n_neighbors, - weights="uniform") - - # Test with missing_values=0 when NaN present - X = np.array([ - [np.nan, 0, 0, 0, 5], - [np.nan, 1, 0, np.nan, 3], - [np.nan, 2, 0, 0, 0], - [np.nan, 6, 0, 5, 13], - ]) - msg = "Input contains NaN, infinity or a value too large for %r." 
% X.dtype - assert_raise_message(ValueError, msg, imputer.fit, X) - - # Test with % zeros in column > col_max_missing - X = np.array([ - [1, 0, 0, 0, 5], - [2, 1, 0, 2, 3], - [3, 2, 0, 0, 0], - [4, 6, 0, 5, 13], - ]) - msg = "Some column(s) have more than {}% missing values".format( - imputer.col_max_missing * 100) - assert_raise_message(ValueError, msg, imputer.fit, X) - - # Test with an imputable matrix and also compare with missing_values="NaN" - X = np.array([ - [1, 0, 1, 0, 1], - [2, 1, 2, 2, 3], - [3, 2, 3, 0, 0], - [6, 6, 0, 5, 17], - ]) - - X_nan = np.array([ - [1, np.nan, 1, np.nan, 1], - [2, 1, 2, 2, 3], - [3, 2, 3, np.nan, np.nan], - [6, 6, np.nan, 5, 17], - ]) - statistics_mean = np.nanmean(X_nan, axis=0) - - X_imputed = np.array([ - [1, 1.5, 1, 2, 1], - [2, 1, 2, 2, 3], - [3, 2, 3, 2, 2], - [6, 6, 2.5, 5, 17], - ]) - - assert_array_equal(imputer.fit_transform(X), X_imputed) - assert_array_equal(imputer.statistics_, statistics_mean) - assert_array_equal(imputer.fit_transform(X), imputer_nan.fit_transform( - X_nan)) - - -def test_knn_imputation_default(): - # Test imputation with default parameter values - - # Test with an imputable matrix - X = np.array([ - [1, 0, 0, 1], - [2, 1, 2, np.nan], - [3, 2, 3, np.nan], - [np.nan, 4, 5, 5], - [6, np.nan, 6, 7], - [8, 8, 8, 8], - [16, 15, 18, 19], - ]) - statistics_mean = np.nanmean(X, axis=0) - - X_imputed = np.array([ - [1, 0, 0, 1], - [2, 1, 2, 5.25], - [3, 2, 3, 5.25], - [4, 4, 5, 5], - [6, 3, 6, 7], - [8, 8, 8, 8], - [16, 15, 18, 19], - ]) - - imputer = KNNImputer() - assert_array_equal(imputer.fit_transform(X), X_imputed) - assert_array_equal(imputer.statistics_, statistics_mean) - - # Test with % missing in row > row_max_missing - X = np.array([ - [1, 0, 0, 1], - [2, 1, 2, np.nan], - [3, 2, 3, np.nan], - [np.nan, 4, 5, 5], - [6, np.nan, 6, 7], - [8, 8, 8, 8], - [np.nan, np.nan, np.nan, 19], - ]) - statistics_mean = np.nanmean(X, axis=0) - - X_imputed = np.array([ - [1, 0, 0, 1], - [2, 1, 2, 5.25], - [3, 
2, 3, 5.25], - [4, 4, 5, 5], - [6, 3, 6, 7], - [8, 8, 8, 8], - [4, 3, 4, 19], - ]) - - imputer = KNNImputer() - assert_array_equal(imputer.fit_transform(X), X_imputed) - assert_array_equal(imputer.statistics_, statistics_mean) - - # Test with all neighboring donors also having missing feature values - X = np.array([ - [1, 0, 0, np.nan], - [2, 1, 2, np.nan], - [3, 2, 3, np.nan], - [4, 4, 5, np.nan], - [6, 7, 6, np.nan], - [8, 8, 8, np.nan], - [20, 20, 20, 20], - [22, 22, 22, 22] - ]) - statistics_mean = np.nanmean(X, axis=0) - - X_imputed = np.array([ - [1, 0, 0, 21], - [2, 1, 2, 21], - [3, 2, 3, 21], - [4, 4, 5, 21], - [6, 7, 6, 21], - [8, 8, 8, 21], - [20, 20, 20, 20], - [22, 22, 22, 22] - ]) - - imputer = KNNImputer() - assert_array_equal(imputer.fit_transform(X), X_imputed) - assert_array_equal(imputer.statistics_, statistics_mean) - - # Test when data in fit() and transform() are different - X = np.array([ - [0, 0], - [np.nan, 2], - [4, 3], - [5, 6], - [7, 7], - [9, 8], - [11, 16] - ]) - statistics_mean = np.nanmean(X, axis=0) - - Y = np.array([ - [1, 0], - [3, 2], - [4, np.nan] - ]) - - Y_imputed = np.array([ - [1, 0], - [3, 2], - [4, 4.8] - ]) - - imputer = KNNImputer() - assert_array_equal(imputer.fit(X).transform(Y), Y_imputed) - assert_array_equal(imputer.statistics_, statistics_mean) - - -def test_default_with_invalid_input(): - # Test imputation with default values and invalid input - - # Test with % missing in a column > col_max_missing - X = np.array([ - [np.nan, 0, 0, 0, 5], - [np.nan, 1, 0, np.nan, 3], - [np.nan, 2, 0, 0, 0], - [np.nan, 6, 0, 5, 13], - [np.nan, 7, 0, 7, 8], - [np.nan, 8, 0, 8, 9], - ]) - imputer = KNNImputer() - msg = "Some column(s) have more than {}% missing values".format( - imputer.col_max_missing * 100) - assert_raise_message(ValueError, msg, imputer.fit, X) - - # Test with insufficient number of neighbors - X = np.array([ - [1, 1, 1, 2, np.nan], - [2, 1, 2, 2, 3], - [3, 2, 3, 3, 8], - [6, 6, 2, 5, 13], - ]) - msg = "There are 
only %d samples, but n_neighbors=%d." % \ - (X.shape[0], imputer.n_neighbors) - assert_raise_message(ValueError, msg, imputer.fit, X) - - # Test with inf present - X = np.array([ - [np.inf, 1, 1, 2, np.nan], - [2, 1, 2, 2, 3], - [3, 2, 3, 3, 8], - [np.nan, 6, 0, 5, 13], - [np.nan, 7, 0, 7, 8], - [6, 6, 2, 5, 7], - ]) - msg = "+/- inf values are not allowed." - assert_raise_message(ValueError, msg, KNNImputer().fit, X) - - -def test_knn_n_neighbors(): - - X = np.array([ - [0, 0], - [np.nan, 2], - [4, 3], - [5, np.nan], - [7, 7], - [np.nan, 8], - [14, 13] - ]) - statistics_mean = np.nanmean(X, axis=0) - - # Test with 1 neighbor - X_imputed_1NN = np.array([ - [0, 0], - [4, 2], - [4, 3], - [5, 3], - [7, 7], - [7, 8], - [14, 13] - ]) - - n_neighbors = 1 - imputer = KNNImputer(n_neighbors=n_neighbors) - imputer_plus1 = KNNImputer(n_neighbors=n_neighbors+1) - - assert_array_equal(imputer.fit_transform(X), X_imputed_1NN) - assert_array_equal(imputer.statistics_, statistics_mean) - assert_array_equal(imputer.fit_transform(X), imputer_plus1.fit( - X).transform(X)) - - # Test with 6 neighbors - X = np.array([ - [0, 0], - [np.nan, 2], - [4, 3], - [5, np.nan], - [7, 7], - [np.nan, 8], - [14, 13] - ]) - - X_imputed_6NN = np.array([ - [0, 0], - [6, 2], - [4, 3], - [5, 5.5], - [7, 7], - [6, 8], - [14, 13] - ]) - - n_neighbors = 6 - imputer = KNNImputer(n_neighbors=6) - imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1) - - assert_array_equal(imputer.fit_transform(X), X_imputed_6NN) - assert_array_equal(imputer.statistics_, statistics_mean) - assert_array_equal(imputer.fit_transform(X), imputer_plus1.fit( - X).transform(X)) - - -def test_weight_type(): - X = np.array([ - [0, 0], - [np.nan, 2], - [4, 3], - [5, 6], - [7, 7], - [9, 8], - [11, 10] - ]) - - # Test with "uniform" weight (or unweighted) - X_imputed_uniform = np.array([ - [0, 0], - [5, 2], - [4, 3], - [5, 6], - [7, 7], - [9, 8], - [11, 10] - ]) - - imputer = KNNImputer(weights="uniform") - 
assert_array_equal(imputer.fit_transform(X), X_imputed_uniform) - - # Test with "callable" weight - def no_weight(dist): - return None - - imputer = KNNImputer(weights=no_weight) - assert_array_equal(imputer.fit_transform(X), X_imputed_uniform) - - # Test with "distance" weight - nn = NearestNeighbors(metric="masked_euclidean") - nn.fit(X) - # Get distance of "n_neighbors" neighbors of row 1 - dist, index = nn.kneighbors() - dist = dist[1, :] - index = index[1, :] - weights = 1 / dist - values = X[index, 0] - imputed = np.dot(values, weights) / np.sum(weights) - - # Manual calculation - X_imputed_distance1 = np.array([ - [0, 0], - [3.850394, 2], - [4, 3], - [5, 6], - [7, 7], - [9, 8], - [11, 10] - ]) - - # NearestNeighbor calculation - X_imputed_distance2 = np.array([ - [0, 0], - [imputed, 2], - [4, 3], - [5, 6], - [7, 7], - [9, 8], - [11, 10] - ]) - - imputer = KNNImputer(weights="distance") - assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance1, - decimal=6) - assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance2, - decimal=6) - - # Test with weights = "distance" and n_neighbors=2 - X = np.array([ - [np.nan, 0, 0], - [2, 1, 2], - [3, 2, 3], - [4, 5, 5], - ]) - statistics_mean = np.nanmean(X, axis=0) - - X_imputed = np.array([ - [2.3828, 0, 0], - [2, 1, 2], - [3, 2, 3], - [4, 5, 5], - ]) - - imputer = KNNImputer(n_neighbors=2, weights="distance") - assert_array_almost_equal(imputer.fit_transform(X), X_imputed, - decimal=4) - assert_array_equal(imputer.statistics_, statistics_mean) - - -def test_metric_type(): - X = np.array([ - [0, 0], - [np.nan, 2], - [4, 3], - [5, 6], - [7, 7], - [9, 8], - [11, 10] - ]) - - # Test with a metric type without NaN support - imputer = KNNImputer(metric="euclidean") - assert_raises(ValueError, imputer.fit, X) - - -def test_imputation_copy(): - # Test imputation with copy - X_orig = sparse_random_matrix(10, 10, density=0.75, random_state=0) - - # copy=True, dense => copy - X = 
X_orig.copy().toarray() - imputer = KNNImputer(missing_values=0, copy=True) - Xt = imputer.fit_transform(X) - Xt[0, 0] = -1 - assert_false(np.all(X == Xt)) - - # copy=False, dense => no copy - X = X_orig.copy().toarray() - imputer = KNNImputer(missing_values=0, copy=False) - Xt = imputer.fit_transform(X) - Xt[0, 0] = -1 - assert_array_equal(X, Xt) From cd906143a6afed12b2b09c158754a091f405ebab Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 19 Sep 2017 00:36:59 -0500 Subject: [PATCH 46/97] Addressed review comments --- sklearn/preprocessing/imputation.py | 12 ++-- .../preprocessing/tests/test_imputation.py | 67 +++++++++++-------- 2 files changed, 44 insertions(+), 35 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 61cf043ca2508..9baac3860f280 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -506,7 +506,7 @@ def fit(self, X, y=None): .format(self.col_max_missing*100)) X_col_means = np.ma.array(X, mask=mask).mean(axis=0).data - # Check if % missing in any row > col_max_missing + # Check if % missing in any row > row_max_missing bad_rows = mask.sum(axis=1) > (mask.shape[1] * self.row_max_missing) if np.any(bad_rows): warnings.warn( @@ -556,7 +556,8 @@ def _transform(self, X, adjusted_n_neighbors): force_all_finite=force_all_finite, copy=self.copy) # Check for +/- inf if np.any(np.isinf(X)): - raise ValueError("+/- inf values are not allowed.") + raise ValueError("+/- inf values are not allowed in data to be " + "transformed.") # Get fitted data and ensure correct dimension fitted_X = self._fitted_neighbors._fit_X @@ -566,7 +567,7 @@ def _transform(self, X, adjusted_n_neighbors): mask = _get_mask(X, self.missing_values) n_rows_X, n_cols_X = X.shape row_total_missing = mask.sum(axis=1) - if not np.any(row_total_missing > 0): + if not np.any(row_total_missing): return X # Check for excessive missingness in rows @@ -578,7 +579,7 @@ def _transform(self, X, 
adjusted_n_neighbors): .format(self.row_max_missing*100)) X_bad = X[bad_rows, :] X = X[~bad_rows, :] - mask = _get_mask(X, self.missing_values) + mask = mask[~bad_rows] row_total_missing = mask.sum(axis=1) row_has_missing = row_total_missing.astype(np.bool) @@ -614,8 +615,7 @@ def _transform(self, X, adjusted_n_neighbors): donors = fitted_X[ (knn_row_index, knn_col_index)].reshape((-1, self.n_neighbors)) donors_mask = _get_mask(donors, self.missing_values) - donors = np.ma.array( - donors, mask=donors_mask) + donors = np.ma.array(donors, mask=donors_mask) imputed = np.ma.average(donors, axis=1, weights=weights) X[mask] = imputed.data unimputed_index = np.where( diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 93533792f1f1d..bf6221988159d 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -297,15 +297,22 @@ def test_imputation_pickle(): def test_imputation_copy(): # Test imputation with copy - X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0) - X_orig_knn = sparse_random_matrix(10, 10, density=0.75, random_state=0) - - # copy=True, dense => copy - X = X_orig.copy().toarray() - imputer = Imputer(missing_values=0, strategy="mean", copy=True) - Xt = imputer.fit(X).transform(X) - Xt[0, 0] = -1 - assert_false(np.all(X == Xt)) + X_orig = sparse_random_matrix(10, 10, density=0.75, random_state=0) + imputers = {Imputer: {"missing_values": 0, "strategy": "mean"}, + KNNImputer: {"missing_values": 0}} + + # 1) copy=True, dense => copy + # 2) copy=False, dense => no copy + for imputer_est, params in imputers.items(): + for copy in [True, False]: + X = X_orig.copy().toarray() + imputer = imputer_est(copy=copy, **params) + Xt = imputer.fit(X).transform(X) + Xt[0, 0] = -1 + if copy: + assert_false(np.all(X == Xt)) + else: + assert_array_equal(X, Xt) # copy=True, sparse csr => copy X = X_orig.copy() @@ -314,25 +321,6 @@ def 
test_imputation_copy(): Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data)) - X = X_orig_knn.copy().toarray() - imputer = KNNImputer(missing_values=0, copy=True) - Xt = imputer.fit_transform(X) - Xt[0, 0] = -1 - assert_false(np.all(X == Xt)) - - # copy=False, dense => no copy - X = X_orig.copy().toarray() - imputer = Imputer(missing_values=0, strategy="mean", copy=False) - Xt = imputer.fit(X).transform(X) - Xt[0, 0] = -1 - assert_array_equal(X, Xt) - - X = X_orig_knn.copy().toarray() - imputer = KNNImputer(missing_values=0, copy=False) - Xt = imputer.fit_transform(X) - Xt[0, 0] = -1 - assert_array_equal(X, Xt) - # copy=False, sparse csr, axis=1 => no copy X = X_orig.copy() imputer = Imputer(missing_values=X.data[0], strategy="mean", @@ -376,7 +364,8 @@ def test_imputation_copy(): # made, even if copy=False. -"""--------------------- BEGIN KNNIMPUTER TEST ---------------------""" +############################################################################# +# BEGIN KNNIMPUTER TEST def test_knn_imputation_shape(): @@ -607,6 +596,26 @@ def test_default_with_invalid_input(): msg = "+/- inf values are not allowed." assert_raise_message(ValueError, msg, KNNImputer().fit, X) + # Test with inf present in matrix passed in tranform + X = np.array([ + [np.inf, 1, 1, 2, np.nan], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [np.nan, 6, 0, 5, 13], + [np.nan, 7, 0, 7, 8], + [6, 6, 2, 5, 7], + ]) + + X_fit = np.array([ + [0, 1, 1, 2, np.nan], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [np.nan, 6, 0, 5, 13], + [np.nan, 7, 0, 7, 8], + [6, 6, 2, 5, 7], + ]) + msg = "+/- inf values are not allowed in data to be transformed." 
+ assert_raise_message(ValueError, msg, KNNImputer().fit(X_fit).transform, X) def test_knn_n_neighbors(): From 2c9993a61986bcde77b688827140c651b39c1553 Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 19 Sep 2017 01:55:53 -0500 Subject: [PATCH 47/97] test changes --- sklearn/preprocessing/tests/test_imputation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index bf6221988159d..f6add1661fc84 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -301,8 +301,8 @@ def test_imputation_copy(): imputers = {Imputer: {"missing_values": 0, "strategy": "mean"}, KNNImputer: {"missing_values": 0}} - # 1) copy=True, dense => copy - # 2) copy=False, dense => no copy + # copy=True, dense => copy + # copy=False, dense => no copy for imputer_est, params in imputers.items(): for copy in [True, False]: X = X_orig.copy().toarray() @@ -312,7 +312,7 @@ def test_imputation_copy(): if copy: assert_false(np.all(X == Xt)) else: - assert_array_equal(X, Xt) + assert_array_almost_equal(X, Xt) # copy=True, sparse csr => copy X = X_orig.copy() @@ -617,6 +617,7 @@ def test_default_with_invalid_input(): msg = "+/- inf values are not allowed in data to be transformed." 
assert_raise_message(ValueError, msg, KNNImputer().fit(X_fit).transform, X) + def test_knn_n_neighbors(): X = np.array([ From 473b19111688d3e480110cfd4c8031252996df1f Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 19 Sep 2017 02:08:47 -0500 Subject: [PATCH 48/97] Test changes part 2 --- sklearn/preprocessing/tests/test_imputation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index f6add1661fc84..8faf5c5038088 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -306,7 +306,8 @@ def test_imputation_copy(): for imputer_est, params in imputers.items(): for copy in [True, False]: X = X_orig.copy().toarray() - imputer = imputer_est(copy=copy, **params) + params["copy"] = copy + imputer = imputer_est(**params) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 if copy: From de587b337b1e6e897db62d4ccc2ac2ca156e20ac Mon Sep 17 00:00:00 2001 From: harke Date: Wed, 20 Sep 2017 22:24:49 -0500 Subject: [PATCH 49/97] Fixed weight matrix shape issue --- sklearn/preprocessing/imputation.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 9baac3860f280..55707988cffd5 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -576,7 +576,7 @@ def _transform(self, X, adjusted_n_neighbors): warnings.warn( "There are rows with more than {0}% missing values. The " "missing features in these rows are imputed with column means." 
- .format(self.row_max_missing*100)) + .format(self.row_max_missing * 100)) X_bad = X[bad_rows, :] X = X[~bad_rows, :] mask = mask[~bad_rows] @@ -598,7 +598,6 @@ def _transform(self, X, adjusted_n_neighbors): (-1, self.n_neighbors)) knn_distances = knn_distances[not_duplicate_index].reshape( (-1, self.n_neighbors)) - weights = _get_weights(knn_distances, self.weights) # Vertically split sets of k-donor indices and repeat each set by # missing count in the corresponding recipient row @@ -607,6 +606,16 @@ def _transform(self, X, adjusted_n_neighbors): knn_row_index = np.repeat( knn_row_index, row_repeats, axis=0).ravel() + weight_matrix = _get_weights(knn_distances, self.weights) + # If weight applied, repeat and resize weight matrix + if self.weights is not None and self.weights != "uniform" and \ + weight_matrix is not None: + weight_matrix = np.vsplit(weight_matrix, + weight_matrix.shape[0]) + weight_matrix = np.repeat(weight_matrix, + row_repeats, axis=0).ravel() + weight_matrix = weight_matrix.reshape((-1, self.n_neighbors)) + # Get column index of donors row_missing_index, col_missing_index = np.where(mask) knn_col_index = np.repeat(col_missing_index, self.n_neighbors) @@ -616,7 +625,7 @@ def _transform(self, X, adjusted_n_neighbors): (knn_row_index, knn_col_index)].reshape((-1, self.n_neighbors)) donors_mask = _get_mask(donors, self.missing_values) donors = np.ma.array(donors, mask=donors_mask) - imputed = np.ma.average(donors, axis=1, weights=weights) + imputed = np.ma.average(donors, axis=1, weights=weight_matrix) X[mask] = imputed.data unimputed_index = np.where( donors_mask.sum(axis=1) == self.n_neighbors) From 3d58616490c6c96001e71bfb0441bc0c1c48efb0 Mon Sep 17 00:00:00 2001 From: harke Date: Thu, 21 Sep 2017 03:00:39 -0500 Subject: [PATCH 50/97] Minor changes --- sklearn/preprocessing/imputation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 
55707988cffd5..67311f5ca329c 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -608,8 +608,7 @@ def _transform(self, X, adjusted_n_neighbors): weight_matrix = _get_weights(knn_distances, self.weights) # If weight applied, repeat and resize weight matrix - if self.weights is not None and self.weights != "uniform" and \ - weight_matrix is not None: + if weight_matrix is not None: weight_matrix = np.vsplit(weight_matrix, weight_matrix.shape[0]) weight_matrix = np.repeat(weight_matrix, From 5873d17336e9017a2e243bc9fe3b995f091425e4 Mon Sep 17 00:00:00 2001 From: harke Date: Fri, 22 Sep 2017 16:50:04 -0500 Subject: [PATCH 51/97] Fixed degenerate donor issue. Added tests --- sklearn/preprocessing/imputation.py | 52 +++++++++------- .../preprocessing/tests/test_imputation.py | 60 +++++++++++++++++-- 2 files changed, 87 insertions(+), 25 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 67311f5ca329c..d8b0335bd6b13 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -584,50 +584,62 @@ def _transform(self, X, adjusted_n_neighbors): row_has_missing = row_total_missing.astype(np.bool) if np.any(row_has_missing): + receiver_row_index = np.arange( + X.shape[0]).reshape((X.shape[0], 1))[row_has_missing, :] neighbors = self._fitted_neighbors.kneighbors( X[row_has_missing, :], n_neighbors=adjusted_n_neighbors) # Get row index, distance, and weights of donors knn_distances, knn_row_index = neighbors - # Remove self from list of donors - if adjusted_n_neighbors > self.n_neighbors: - row_index = np.arange(X.shape[0]).reshape((X.shape[0], 1)) - row_index = row_index[row_has_missing, :] - not_duplicate_index = np.where(~(row_index == knn_row_index)) - knn_row_index = knn_row_index[not_duplicate_index].reshape( - (-1, self.n_neighbors)) - knn_distances = knn_distances[not_duplicate_index].reshape( - (-1, self.n_neighbors)) - - # Vertically split 
sets of k-donor indices and repeat each set by - # missing count in the corresponding recipient row + # Vertically split sets of k-donor indices knn_row_index = np.vsplit(knn_row_index, knn_row_index.shape[0]) row_repeats = row_total_missing[row_total_missing != 0] + + # Weighting: Set self and degenerate donor(s) distance to inf + if self.weights not in [None, "uniform"]: + receiver_row_index = np.split( + receiver_row_index, receiver_row_index.shape[0]) + nbors_anti_mask = ~mask[knn_row_index, np.newaxis] + receiver_anti_mask = ~mask[receiver_row_index, np.newaxis] + # Sum anti-masks to see if both donor & receiver are missing + # A zero value indicates that a feature is missing in both + anti_masks_combined = receiver_anti_mask + nbors_anti_mask + anti_masks_combined = anti_masks_combined.squeeze().sum( + axis=-1) # Sum over all cols to locate degenerate donors + degenerate_nbors = anti_masks_combined < X.shape[1] + receiver_rows, _ = knn_distances.shape + degenerate_nbors_mask = degenerate_nbors.reshape( + (receiver_rows, -1)) + knn_distances[degenerate_nbors_mask] = np.inf + + # Repeat each set of v-splitted donor indices by + # missing count in the corresponding recipient row knn_row_index = np.repeat( knn_row_index, row_repeats, axis=0).ravel() + # Retreive and, if applicable, transform weight matrix weight_matrix = _get_weights(knn_distances, self.weights) - # If weight applied, repeat and resize weight matrix if weight_matrix is not None: weight_matrix = np.vsplit(weight_matrix, weight_matrix.shape[0]) weight_matrix = np.repeat(weight_matrix, row_repeats, axis=0).ravel() - weight_matrix = weight_matrix.reshape((-1, self.n_neighbors)) + weight_matrix = weight_matrix.reshape( + (-1, adjusted_n_neighbors)) # Get column index of donors row_missing_index, col_missing_index = np.where(mask) - knn_col_index = np.repeat(col_missing_index, self.n_neighbors) + knn_col_index = np.repeat(col_missing_index, adjusted_n_neighbors) # Calculate kNN score and impute - 
donors = fitted_X[ - (knn_row_index, knn_col_index)].reshape((-1, self.n_neighbors)) + donors = fitted_X[(knn_row_index, knn_col_index)].reshape( + (-1, adjusted_n_neighbors)) donors_mask = _get_mask(donors, self.missing_values) donors = np.ma.array(donors, mask=donors_mask) imputed = np.ma.average(donors, axis=1, weights=weight_matrix) X[mask] = imputed.data unimputed_index = np.where( - donors_mask.sum(axis=1) == self.n_neighbors) + donors_mask.sum(axis=1) == adjusted_n_neighbors) if len(unimputed_index[0]) > 0: unimputed_rows = row_missing_index[unimputed_index] unimputed_cols = col_missing_index[unimputed_index] @@ -672,8 +684,8 @@ def transform(self, X): from the data to be transformed. WARNING: If the same dataset is passed in fit() and transform(), - one of the returned "neighbors" maybe the sample itself. Use - fit_transform() to avoid this behavior. + one of the returned "neighbors" maybe the sample itself. If you will be + passing the same dataset, use fit_transform() to avoid this behavior. 
Parameters ---------- diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 8faf5c5038088..465214c31c5e6 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -11,6 +11,7 @@ from sklearn.preprocessing.imputation import Imputer from sklearn.preprocessing.imputation import KNNImputer +from sklearn.metrics.pairwise import masked_euclidean_distances from sklearn.neighbors import NearestNeighbors from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV @@ -425,10 +426,10 @@ def test_knn_imputation_zero(): ]) X_nan = np.array([ - [1, np.nan, 1, np.nan, 1], - [2, 1, 2, 2, 3], - [3, 2, 3, np.nan, np.nan], - [6, 6, np.nan, 5, 17], + [1, np.nan, 1, np.nan, 1], + [2, 1, 2, 2, 3], + [3, 2, 3, np.nan, np.nan], + [6, 6, np.nan, 5, 17], ]) statistics_mean = np.nanmean(X_nan, axis=0) @@ -709,7 +710,7 @@ def test_weight_type(): assert_array_equal(imputer.fit_transform(X), X_imputed_uniform) # Test with "callable" weight - def no_weight(dist): + def no_weight(dist=None): return None imputer = KNNImputer(weights=no_weight) @@ -775,6 +776,55 @@ def no_weight(dist): decimal=4) assert_array_equal(imputer.statistics_, statistics_mean) + # Test with varying missingness patterns + X = np.array([ + [1, 0, 0, 1], + [0, np.nan, 1, np.nan], + [1, 1, 1, np.nan], + [0, 1, 0, 0], + [0, 1, 0, 0], + [1, 1, 1, 1], + [10, 10, 10, 10], + ]) + statistics_mean = np.nanmean(X, axis=0) + + # Get weights of donor neighbors + dist = masked_euclidean_distances(X) + row1_nbor_dists = dist[1, :6] + row1_nbor_dists[np.array([1, 2])] = np.inf # Degenerate neighbors + row1_nbor_wt = 1 / row1_nbor_dists + + row2_nbor_dists = dist[2, :6] + row2_nbor_dists[np.array([1, 2])] = np.inf # Degenerate neighbors + row2_nbor_wt = 1 / row2_nbor_dists + # One of the non-denerate neighbors has zero distance so its weight=1 + # and for others, weight=0 + 
row2_nbor_wt[~np.isinf(row2_nbor_wt)] = 0 + row2_nbor_wt[np.isinf(row2_nbor_wt)] = 1 + + # Collect donor values + col1_donors = np.ma.masked_invalid(X[:6, 1].copy()) + col3_donors = np.ma.masked_invalid(X[:6, 3].copy()) + + # Final imputed values + r1c1_imp = np.ma.average(col1_donors, weights=row1_nbor_wt) + r1c3_imp = np.ma.average(col3_donors, weights=row1_nbor_wt) + r2c3_imp = np.ma.average(col3_donors, weights=row2_nbor_wt) + + X_imputed = np.array([ + [1, 0, 0, 1], + [0, r1c1_imp, 1, r1c3_imp], + [1, 1, 1, r2c3_imp], + [0, 1, 0, 0], + [0, 1, 0, 0], + [1, 1, 1, 1], + [10, 10, 10, 10], + ]) + + imputer = KNNImputer(weights="distance") + assert_array_almost_equal(imputer.fit_transform(X), X_imputed, decimal=6) + assert_array_equal(imputer.statistics_, statistics_mean) + def test_metric_type(): X = np.array([ From fd1100299da371140f75cefbf586de422354030d Mon Sep 17 00:00:00 2001 From: harke Date: Fri, 22 Sep 2017 17:40:25 -0500 Subject: [PATCH 52/97] Further test updates --- .../preprocessing/tests/test_imputation.py | 60 +++++++++++-------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 465214c31c5e6..1367a888621f1 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -323,6 +323,13 @@ def test_imputation_copy(): Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data)) + # copy=False, dense => no copy + X = X_orig.copy().toarray() + imputer = Imputer(missing_values=0, strategy="mean", copy=False) + Xt = imputer.fit(X).transform(X) + Xt[0, 0] = -1 + assert_array_almost_equal(X, Xt) + # copy=False, sparse csr, axis=1 => no copy X = X_orig.copy() imputer = Imputer(missing_values=X.data[0], strategy="mean", @@ -778,47 +785,52 @@ def no_weight(dist=None): # Test with varying missingness patterns X = np.array([ - [1, 0, 0, 1], - [0, np.nan, 1, np.nan], - [1, 1, 1, np.nan], - [0, 1, 0, 0], - 
[0, 1, 0, 0], - [1, 1, 1, 1], - [10, 10, 10, 10], + [1, 0, 0, 1], + [0, np.nan, 1, np.nan], + [1, 1, 1, np.nan], + [0, 1, 0, 0], + [0, np.nan, 1, 0], + [1, 1, 1, 1], + [10, 10, 10, 10], ]) statistics_mean = np.nanmean(X, axis=0) # Get weights of donor neighbors dist = masked_euclidean_distances(X) row1_nbor_dists = dist[1, :6] - row1_nbor_dists[np.array([1, 2])] = np.inf # Degenerate neighbors - row1_nbor_wt = 1 / row1_nbor_dists + row1_nbor_dists[np.array([1, 2, 4])] = np.inf # Degenerate neighbors + row1_nbor_wt = 1/row1_nbor_dists row2_nbor_dists = dist[2, :6] row2_nbor_dists[np.array([1, 2])] = np.inf # Degenerate neighbors - row2_nbor_wt = 1 / row2_nbor_dists - # One of the non-denerate neighbors has zero distance so its weight=1 - # and for others, weight=0 + row2_nbor_wt = 1/row2_nbor_dists + # A non-degenerate donor has zero distance so it's weight is 1 and + # others have weight 0 row2_nbor_wt[~np.isinf(row2_nbor_wt)] = 0 row2_nbor_wt[np.isinf(row2_nbor_wt)] = 1 + row4_nbor_dists = dist[4, :6] + row4_nbor_dists[np.array([1, 4])] = np.inf # Degenerate neighbors + row4_nbor_wt = 1/row4_nbor_dists + # Collect donor values - col1_donors = np.ma.masked_invalid(X[:6, 1].copy()) - col3_donors = np.ma.masked_invalid(X[:6, 3].copy()) + col1_donor_values = np.ma.masked_invalid(X[:6, 1].copy()) + col3_donor_values = np.ma.masked_invalid(X[:6, 3].copy()) # Final imputed values - r1c1_imp = np.ma.average(col1_donors, weights=row1_nbor_wt) - r1c3_imp = np.ma.average(col3_donors, weights=row1_nbor_wt) - r2c3_imp = np.ma.average(col3_donors, weights=row2_nbor_wt) + r1c1_imp = np.ma.average(col1_donor_values, weights=row1_nbor_wt) + r1c3_imp = np.ma.average(col3_donor_values, weights=row1_nbor_wt) + r2c3_imp = np.ma.average(col3_donor_values, weights=row2_nbor_wt) + r4c1_imp = np.ma.average(col1_donor_values, weights=row4_nbor_wt) X_imputed = np.array([ - [1, 0, 0, 1], - [0, r1c1_imp, 1, r1c3_imp], - [1, 1, 1, r2c3_imp], - [0, 1, 0, 0], - [0, 1, 0, 0], - [1, 1, 1, 1], - 
[10, 10, 10, 10], + [1, 0, 0, 1], + [0, r1c1_imp, 1, r1c3_imp], + [1, 1, 1, r2c3_imp], + [0, 1, 0, 0], + [0, r4c1_imp, 1, 0], + [1, 1, 1, 1], + [10, 10, 10, 10], ]) imputer = KNNImputer(weights="distance") From 2f41aa298305c5a628ab7cf48be1411daa91d5e2 Mon Sep 17 00:00:00 2001 From: harke Date: Sat, 23 Sep 2017 14:20:24 -0500 Subject: [PATCH 53/97] minor test fix --- sklearn/preprocessing/tests/test_imputation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 1367a888621f1..0a7f58cc8a325 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -328,7 +328,7 @@ def test_imputation_copy(): imputer = Imputer(missing_values=0, strategy="mean", copy=False) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 - assert_array_almost_equal(X, Xt) + assert_array_equal(X, Xt) # copy=False, sparse csr, axis=1 => no copy X = X_orig.copy() From 135056cf957dbe5381da2b14150f83335d3127c7 Mon Sep 17 00:00:00 2001 From: harke Date: Sun, 24 Sep 2017 13:15:29 -0500 Subject: [PATCH 54/97] more minor changes --- sklearn/preprocessing/imputation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index d8b0335bd6b13..e802665b11380 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -596,7 +596,7 @@ def _transform(self, X, adjusted_n_neighbors): row_repeats = row_total_missing[row_total_missing != 0] # Weighting: Set self and degenerate donor(s) distance to inf - if self.weights not in [None, "uniform"]: + if self.weights in ["distance"]: receiver_row_index = np.split( receiver_row_index, receiver_row_index.shape[0]) nbors_anti_mask = ~mask[knn_row_index, np.newaxis] From 8c7190ef9f1fcdaddc805c3a0ebdc58434128971 Mon Sep 17 00:00:00 2001 From: harke Date: Sun, 24 Sep 2017 13:23:19 -0500 
Subject: [PATCH 55/97] Moved weight_matrix inside if-weighted block --- sklearn/preprocessing/imputation.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index e802665b11380..6e985c1c2090f 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -596,6 +596,7 @@ def _transform(self, X, adjusted_n_neighbors): row_repeats = row_total_missing[row_total_missing != 0] # Weighting: Set self and degenerate donor(s) distance to inf + weight_matrix = None if self.weights in ["distance"]: receiver_row_index = np.split( receiver_row_index, receiver_row_index.shape[0]) @@ -612,21 +613,21 @@ def _transform(self, X, adjusted_n_neighbors): (receiver_rows, -1)) knn_distances[degenerate_nbors_mask] = np.inf + # Retreive and, if applicable, transform weight matrix + weight_matrix = _get_weights(knn_distances, self.weights) + if weight_matrix is not None: + weight_matrix = np.vsplit(weight_matrix, + weight_matrix.shape[0]) + weight_matrix = np.repeat(weight_matrix, + row_repeats, axis=0).ravel() + weight_matrix = weight_matrix.reshape( + (-1, adjusted_n_neighbors)) + # Repeat each set of v-splitted donor indices by # missing count in the corresponding recipient row knn_row_index = np.repeat( knn_row_index, row_repeats, axis=0).ravel() - # Retreive and, if applicable, transform weight matrix - weight_matrix = _get_weights(knn_distances, self.weights) - if weight_matrix is not None: - weight_matrix = np.vsplit(weight_matrix, - weight_matrix.shape[0]) - weight_matrix = np.repeat(weight_matrix, - row_repeats, axis=0).ravel() - weight_matrix = weight_matrix.reshape( - (-1, adjusted_n_neighbors)) - # Get column index of donors row_missing_index, col_missing_index = np.where(mask) knn_col_index = np.repeat(col_missing_index, adjusted_n_neighbors) From 9616c2b0eab0eb84cc6abcbae38398cf2d54ab2d Mon Sep 17 00:00:00 2001 From: harke 
<22569641+ashimb9@users.noreply.github.com> Date: Tue, 12 Dec 2017 05:03:34 -0600 Subject: [PATCH 56/97] Addressed Review Comments --- examples/plot_missing_values.py | 14 +- sklearn/metrics/pairwise.py | 18 +- sklearn/neighbors/base.py | 9 +- sklearn/neighbors/tests/test_neighbors.py | 10 + sklearn/preprocessing/__init__.py | 2 + sklearn/preprocessing/imputation.py | 183 +++++++++++------- .../preprocessing/tests/test_imputation.py | 87 ++++++--- 7 files changed, 206 insertions(+), 117 deletions(-) diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index 2dcd068df8ebe..ea36eb733bb5f 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -11,8 +11,7 @@ Imputer: Using Imputer, missing values can be replaced by the mean, the median or the -most frequent -value using the ``strategy`` hyper-parameter. +most frequent value using the ``strategy`` hyper-parameter. The median is a more robust estimator for data with high magnitude variables which could dominate results (otherwise known as a 'long tail'). 
@@ -35,7 +34,7 @@ from sklearn.datasets import load_boston from sklearn.ensemble import RandomForestRegressor from sklearn.pipeline import Pipeline -from sklearn.preprocessing.imputation import Imputer, KNNImputer +from sklearn.preprocessing import Imputer, KNNImputer from sklearn.model_selection import cross_val_score rng = np.random.RandomState(0) @@ -80,11 +79,8 @@ print("Score after imputation of the missing values = %.2f" % score) # Estimate the score after kNN-imputation of the missing values -X_missing = X_full.copy() -X_missing[np.where(missing_samples)[0], missing_features] = np.nan -y_missing = y_full.copy() -knn_estimator = Pipeline([("knnimputer", KNNImputer(n_neighbors=10)), - ("forest", RandomForestRegressor(random_state=0, - n_estimators=100))]) +knn_estimator = Pipeline( + [("knnimputer", KNNImputer(missing_values=0, n_neighbors=10)), + ("forest", RandomForestRegressor(random_state=0, n_estimators=100))]) knn_score = cross_val_score(knn_estimator, X_missing, y_missing).mean() print("Score after knn-imputation of the missing values = %.2f" % knn_score) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 2afd6b60026f7..fa508be4d009e 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -322,7 +322,7 @@ def masked_euclidean_distances(X, Y=None, squared=False, Returns ------- - distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2) + distances : {array}, shape (n_samples_1, n_samples_2) Examples -------- @@ -371,9 +371,8 @@ def masked_euclidean_distances(X, Y=None, squared=False, raise ValueError("One or more rows only contain missing values.") # else: - if missing_values != "NaN" and \ - (np.any(np.isnan(X)) or - (Y is not X and np.any(np.isnan(Y)))): + if missing_values not in ["NaN", np.nan] and ( + np.any(np.isnan(X)) or (Y is not X and np.any(np.isnan(Y)))): raise ValueError( "NaN values present but missing_value = {0}".format( missing_values)) @@ -387,7 +386,7 @@ def 
masked_euclidean_distances(X, Y=None, squared=False, X[mask_X] = 0 # Calculate distances - # The following formula was derived in matrix form by: + # The following formula derived by: # Shreya Bhattarai distances = (X.shape[1] / (np.dot(NX, NYT))) * \ @@ -1242,7 +1241,8 @@ def _parallel_pairwise(X, Y, func, n_jobs, **kwds): def _pairwise_callable(X, Y, metric, **kwds): """Handle the callable case for pairwise_{distances,kernels} """ - X, Y = check_pairwise_arrays(X, Y) + force_all_finite = False if callable(metric) else True + X, Y = check_pairwise_arrays(X, Y, force_all_finite=force_all_finite) if X is Y: # Only calculate metric for upper triangle @@ -1279,7 +1279,7 @@ def _pairwise_callable(X, Y, metric, **kwds): 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski", 'masked_euclidean'] -_MASKED_SUPPORTED_METRICS = ['masked_euclidean'] +_MASKED_METRICS = ['masked_euclidean'] def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): @@ -1369,11 +1369,11 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): "Valid metrics are %s, or 'precomputed', or a " "callable" % (metric, _VALID_METRICS)) - if metric in _MASKED_SUPPORTED_METRICS: + if metric in _MASKED_METRICS or callable(metric): missing_values = kwds.get("missing_values") if kwds.get( "missing_values") is not None else np.nan - if(np.any(_get_mask(X, missing_values).sum(axis=1) == X.shape[1])): + if np.all(_get_mask(X, missing_values)): raise ValueError( "One or more samples(s) only have missing values.") diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index b3633c9717284..5fd76fc5f5107 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -17,7 +17,7 @@ from ..base import BaseEstimator from ..metrics import pairwise_distances from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS -from ..metrics.pairwise import _MASKED_SUPPORTED_METRICS +from ..metrics.pairwise import _MASKED_METRICS from ..utils import check_X_y, check_array, 
_get_n_jobs, gen_even_slices from ..utils.multiclass import check_classification_targets from ..externals import six @@ -159,7 +159,7 @@ def _init_params(self, n_neighbors=None, radius=None, self._fit_method = None def _fit(self, X): - allow_nans = self.metric in _MASKED_SUPPORTED_METRICS + allow_nans = self.metric in _MASKED_METRICS or callable(self.metric) if self.metric_params is None: self.effective_metric_params_ = {} @@ -214,7 +214,7 @@ def _fit(self, X): if issparse(X): if allow_nans: raise ValueError( - "Nearest neighbor algorithm does not currently support" + "Nearest neighbor algorithm does not currently support " "the use of sparse matrices for missing values." ) if self.algorithm not in ('auto', 'brute'): @@ -340,7 +340,8 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - if self.effective_metric_ in _MASKED_SUPPORTED_METRICS: + if self.effective_metric_ in _MASKED_METRICS or callable( + self.effective_metric_): X = check_array(X, accept_sparse='csr', force_all_finite=False) else: diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index b9390883f9903..20ba7bed91d92 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -18,6 +18,7 @@ from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_in from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_warns from sklearn.utils.testing import ignore_warnings @@ -145,6 +146,15 @@ def test_masked_unsupervised_kneighbors(): assert_array_equal(X2_neigh, N3) assert_array_equal(XY2_neigh, N4) + # Test 3 + nan = float("nan") + samples = csc_matrix([[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]]) + neigh = neighbors.NearestNeighbors(n_neighbors=2, + metric="masked_euclidean") + msg = "Nearest 
neighbor algorithm does not currently support the use of " \ + "sparse matrices for missing values." + assert_raise_message(ValueError, msg, neigh.fit, samples) + def test_unsupervised_inputs(): # test the types of valid input into NearestNeighbors diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 2b105709ffe08..41f6ee08cdc8b 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -31,6 +31,7 @@ from .label import MultiLabelBinarizer from .imputation import Imputer +from .imputation import KNNImputer __all__ = [ @@ -38,6 +39,7 @@ 'FunctionTransformer', 'Imputer', 'KernelCenterer', + 'KNNImputer', 'LabelBinarizer', 'LabelEncoder', 'MultiLabelBinarizer', diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 6e985c1c2090f..119de4964ba94 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -1,4 +1,5 @@ # Authors: Nicolas Tresegnie +# Ashim Bhattarai <"ashimb9" + "\100gmail\56com"> # License: BSD 3 clause from __future__ import division import warnings @@ -380,8 +381,8 @@ def transform(self, X): class KNNImputer(BaseEstimator, TransformerMixin): """Imputation for completing missing values using k-Nearest Neighbors. - Each sample's missing values are imputed from up to "n_neighbors" nearest - neighbors found in the training set. Each missing feature is then + Each sample's missing values are imputed from up to ``max_neighbors`` + nearest neighbors found in the training set. Each missing feature is then imputed as the average, either weighted or unweighted, of these neighbors who have a value for it. Where all neighbors have that feature value missing, the training set average for that feature is used for imputation. @@ -393,11 +394,13 @@ class KNNImputer(BaseEstimator, TransformerMixin): `missing_values` will be imputed. For missing values encoded as np.nan, use the string value "NaN". 
- n_neighbors : int, optional (default = 5) + max_neighbors : int, optional (default = 5) Maximum number of neighboring samples to use for imputation. When any of the neighbors themselves have the feature value missing then the remaining neighbors, if any, that have the feature value available are - used. + used. But if none of the neighbors have the value available, the global + feature mean (i.e., by default, the column mean) is used for + imputation. weights : str or callable, optional (default = "uniform") Weight function used in prediction. Possible values: @@ -411,14 +414,19 @@ class KNNImputer(BaseEstimator, TransformerMixin): array of distances, and returns an array of the same shape containing the weights. - metric : str, optional (default = "masked_euclidean") + metric : str or callable, optional (default = "masked_euclidean") Distance metric for searching neighbors. Possible values: - 'masked_euclidean' + - [callable] : a user-defined function which conforms to the + definition of _pairwise_callable(X, Y, metric, **kwds). In other + words, the function accepts two arrays, X and Y, and a + ``missing_values`` keyword in **kwds and returns a scalar distance + value. row_max_missing : float, optional (default = 0.5) The maximum percentage of columns (i.e. features) that can be missing before the sample is excluded from nearest neighbor imputation. It - means that such rows will not be considered a potential donor in fit() + means that such rows will not be considered a potential donor in fit(), and in transform() their missing feature values will be imputed to be the column mean for the entire dataset. @@ -436,7 +444,12 @@ class KNNImputer(BaseEstimator, TransformerMixin): ---------- statistics_ : 1-D array of length {n_features} The 1-D array contains the mean of each feature calculated using - observed (i.e. non-missing) values. + observed (i.e. non-missing) values. 
This is used for imputing + missing values in samples that are either excluded from nearest + neighbors search because they have too many ( > row_max_missing) + missing features or because all of the sample's k-nearest neighbors + (i.e., the potential donors) also have the relevant feature value + missing. References ---------- @@ -450,7 +463,7 @@ class KNNImputer(BaseEstimator, TransformerMixin): >>> from sklearn.preprocessing.imputation import KNNImputer >>> nan = float("NaN") >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]] - >>> imputer = KNNImputer(n_neighbors=2, weights="uniform") + >>> imputer = KNNImputer(max_neighbors=2, weights="uniform") >>> imputer.fit_transform(X) array([[ 1. , 2. , 4. ], [ 3. , 4. , 3. ], @@ -458,12 +471,12 @@ class KNNImputer(BaseEstimator, TransformerMixin): [ 8. , 8. , 7. ]]) """ - def __init__(self, missing_values="NaN", n_neighbors=5, + def __init__(self, missing_values="NaN", max_neighbors=5, weights="uniform", metric="masked_euclidean", row_max_missing=0.5, col_max_missing=0.8, copy=True): self.missing_values = missing_values - self.n_neighbors = n_neighbors + self.max_neighbors = max_neighbors self.weights = weights self.metric = metric self.row_max_missing = row_max_missing @@ -503,7 +516,7 @@ def fit(self, X, y=None): mask = _get_mask(X, self.missing_values) if np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)): raise ValueError("Some column(s) have more than {}% missing values" - .format(self.col_max_missing*100)) + .format(self.col_max_missing * 100)) X_col_means = np.ma.array(X, mask=mask).mean(axis=0).data # Check if % missing in any row > row_max_missing @@ -518,12 +531,12 @@ def fit(self, X, y=None): X = X[~bad_rows, :] # Check if sufficient neighboring samples available - if X.shape[0] < self.n_neighbors: - raise ValueError("There are only %d samples, but n_neighbors=%d." 
- % (X.shape[0], self.n_neighbors)) + if X.shape[0] < self.max_neighbors: + raise ValueError("There are only %d samples, but max_neighbors=%d." + % (X.shape[0], self.max_neighbors)) # Instantiate NN object, get column means, and store in statistics_ - neigh = NearestNeighbors(n_neighbors=self.n_neighbors, + neigh = NearestNeighbors(n_neighbors=self.max_neighbors, metric=self.metric, metric_params={"missing_values": self.missing_values}) @@ -532,7 +545,44 @@ def fit(self, X, y=None): return self - def _transform(self, X, adjusted_n_neighbors): + def _get_weight_matrix(self, fitted_X, mask, adjusted_max_neighbors, + receiver_row_index, row_repeats, + knn_row_index, knn_distances): + """Get the weight matrix for the donors""" + + # Import(s) here to avoid circular import + from ..neighbors.base import _get_weights + + # If different X in transform, get a new mask + if self.max_neighbors == adjusted_max_neighbors: + nbors_mask = _get_mask(fitted_X[knn_row_index], + value_to_mask=self.missing_values) + else: + nbors_mask = mask[knn_row_index] + + # Anti-mask tells us what is NOT missing + nbors_anti_mask = ~nbors_mask + receiver_anti_mask = ~mask[receiver_row_index] + + # Sum anti-masks to see if both donor & receiver are missing + # A zero value indicates that a feature is missing in both + # Sum over all cols to locate degenerate donors + anti_masks_combined = receiver_anti_mask + nbors_anti_mask + anti_masks_combined = anti_masks_combined.sum(axis=-1) + degenerate_nbors = anti_masks_combined < mask.shape[1] + knn_distances[degenerate_nbors] = np.inf + + # Retreive and, if applicable, transform weight matrix + weight_matrix = _get_weights(knn_distances, self.weights) + if weight_matrix is not None: + weight_matrix = weight_matrix[:, np.newaxis, :] + weight_matrix = np.repeat(weight_matrix, + row_repeats, axis=0).ravel() + weight_matrix = weight_matrix.reshape( + (-1, adjusted_max_neighbors)) + return weight_matrix + + def _transform(self, X, adjusted_max_neighbors): 
"""Impute all missing values in X. Parameters @@ -540,14 +590,13 @@ def _transform(self, X, adjusted_n_neighbors): X : {array-like}, shape = [n_samples, n_features] The input data to complete. - adjusted_n_neighbors : int - Indicates whether to pass n_neighbors or n_neighbors+1 to - _tranform(). - Calling transform() automatically sets this to self.n_neighbors - while fit_transform() sets it to self.n_neighbors + 1. + adjusted_max_neighbors : int + Depending on the calling method, the default value must + either be equal to max_neighbors or max_neighbors + 1. + If the calling method is transform(), then its value needs to be + equal to max_neighbors and if calling method is fit_transform() + then its value must be equal to max_neighbors + 1. """ - # Import(s) here to avoid circular import - from ..neighbors.base import _get_weights check_is_fitted(self, 'statistics_') force_all_finite = False if self.missing_values in ["NaN", @@ -584,70 +633,62 @@ def _transform(self, X, adjusted_n_neighbors): row_has_missing = row_total_missing.astype(np.bool) if np.any(row_has_missing): - receiver_row_index = np.arange( - X.shape[0]).reshape((X.shape[0], 1))[row_has_missing, :] + # Row index of receivers & identify neighbors (potential donors) + receiver_row_index = np.where(row_has_missing)[0].reshape((-1, 1)) neighbors = self._fitted_neighbors.kneighbors( - X[row_has_missing, :], n_neighbors=adjusted_n_neighbors) + X[row_has_missing, :], n_neighbors=adjusted_max_neighbors) # Get row index, distance, and weights of donors knn_distances, knn_row_index = neighbors - # Vertically split sets of k-donor indices - knn_row_index = np.vsplit(knn_row_index, knn_row_index.shape[0]) row_repeats = row_total_missing[row_total_missing != 0] # Weighting: Set self and degenerate donor(s) distance to inf weight_matrix = None - if self.weights in ["distance"]: - receiver_row_index = np.split( - receiver_row_index, receiver_row_index.shape[0]) - nbors_anti_mask = ~mask[knn_row_index, np.newaxis] - 
receiver_anti_mask = ~mask[receiver_row_index, np.newaxis] - # Sum anti-masks to see if both donor & receiver are missing - # A zero value indicates that a feature is missing in both - anti_masks_combined = receiver_anti_mask + nbors_anti_mask - anti_masks_combined = anti_masks_combined.squeeze().sum( - axis=-1) # Sum over all cols to locate degenerate donors - degenerate_nbors = anti_masks_combined < X.shape[1] - receiver_rows, _ = knn_distances.shape - degenerate_nbors_mask = degenerate_nbors.reshape( - (receiver_rows, -1)) - knn_distances[degenerate_nbors_mask] = np.inf - - # Retreive and, if applicable, transform weight matrix - weight_matrix = _get_weights(knn_distances, self.weights) - if weight_matrix is not None: - weight_matrix = np.vsplit(weight_matrix, - weight_matrix.shape[0]) - weight_matrix = np.repeat(weight_matrix, - row_repeats, axis=0).ravel() - weight_matrix = weight_matrix.reshape( - (-1, adjusted_n_neighbors)) - - # Repeat each set of v-splitted donor indices by + if self.weights in ["distance"] or callable(self.weights): + weight_matrix = self._get_weight_matrix( + fitted_X, + mask, + adjusted_max_neighbors, + receiver_row_index, + row_repeats, + knn_row_index, + knn_distances + ) + + # Repeat each set donor indices by # missing count in the corresponding recipient row - knn_row_index = np.repeat( + knn_row_index_repeat = np.repeat( knn_row_index, row_repeats, axis=0).ravel() - # Get column index of donors - row_missing_index, col_missing_index = np.where(mask) - knn_col_index = np.repeat(col_missing_index, adjusted_n_neighbors) + # Get repeated column index of donors + receiver_row_missing_index, receiver_col_missing_index = \ + np.where(mask) + knn_col_index_repeat = np.repeat(receiver_col_missing_index, + adjusted_max_neighbors) - # Calculate kNN score and impute - donors = fitted_X[(knn_row_index, knn_col_index)].reshape( - (-1, adjusted_n_neighbors)) + # Retrieve donor cells and calculate kNN score + donors = fitted_X[ + 
knn_row_index_repeat, knn_col_index_repeat].reshape( + (-1, adjusted_max_neighbors)) donors_mask = _get_mask(donors, self.missing_values) donors = np.ma.array(donors, mask=donors_mask) + + # Warning if donor count < max_neighbors + if np.any(donors_mask.sum(axis=1) < self.max_neighbors): + warnings.warn("One or more donor(s) have the relevant " + "feature value missing.") + + # Final imputation imputed = np.ma.average(donors, axis=1, weights=weight_matrix) X[mask] = imputed.data - unimputed_index = np.where( - donors_mask.sum(axis=1) == adjusted_n_neighbors) + unimputed_index = np.where(donors_mask.all(axis=1)) if len(unimputed_index[0]) > 0: - unimputed_rows = row_missing_index[unimputed_index] - unimputed_cols = col_missing_index[unimputed_index] - X[(unimputed_rows, unimputed_cols)] = np.take(self.statistics_, - unimputed_cols) + unimputed_rows = receiver_row_missing_index[unimputed_index] + unimputed_cols = receiver_col_missing_index[unimputed_index] + X[unimputed_rows, unimputed_cols] = np.take(self.statistics_, + unimputed_cols) - # Merge bad rows to X and mean impute their missing + # Merge bad rows to X and mean impute their missing values if np.any(bad_rows): bad_missing_index = np.where(_get_mask(X_bad, self.missing_values)) X_bad[bad_missing_index] = np.take(self.statistics_, @@ -676,7 +717,7 @@ def fit_transform(self, X, y=None, **fit_params): Returns imputed dataset. """ return self.fit(X)._transform( - X, adjusted_n_neighbors=self.n_neighbors + 1) + X, adjusted_max_neighbors=self.max_neighbors + 1) def transform(self, X): """Impute all missing values in X. @@ -699,4 +740,4 @@ def transform(self, X): Returns imputed dataset. 
""" check_is_fitted(self, 'statistics_') - return self._transform(X, adjusted_n_neighbors=self.n_neighbors) + return self._transform(X, adjusted_max_neighbors=self.max_neighbors) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 0a7f58cc8a325..599f8667a3824 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -304,11 +304,11 @@ def test_imputation_copy(): # copy=True, dense => copy # copy=False, dense => no copy - for imputer_est, params in imputers.items(): + for imputer_cls, params in imputers.items(): for copy in [True, False]: X = X_orig.copy().toarray() params["copy"] = copy - imputer = imputer_est(**params) + imputer = imputer_cls(**params) Xt = imputer.fit(X).transform(X) Xt[0, 0] = -1 if copy: @@ -386,8 +386,8 @@ def test_knn_imputation_shape(): X[0, 0] = np.nan for weights in ['uniform', 'distance']: - for n_neighbors in range(1, 6): - imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights) + for max_neighbors in range(1, 6): + imputer = KNNImputer(max_neighbors=max_neighbors, weights=weights) X_imputed = imputer.fit_transform(X) assert_equal(X_imputed.shape, (n_rows, n_cols)) @@ -395,12 +395,12 @@ def test_knn_imputation_shape(): def test_knn_imputation_zero(): # Test imputation when missing_values == 0 missing_values = 0 - n_neighbors = 2 + max_neighbors = 2 imputer = KNNImputer(missing_values=missing_values, - n_neighbors=n_neighbors, + max_neighbors=max_neighbors, weights="uniform") imputer_nan = KNNImputer(missing_values="NaN", - n_neighbors=n_neighbors, + max_neighbors=max_neighbors, weights="uniform") # Test with missing_values=0 when NaN present @@ -426,14 +426,14 @@ def test_knn_imputation_zero(): # Test with an imputable matrix and also compare with missing_values="NaN" X = np.array([ - [1, 0, 1, 0, 1], + [1, 0, 1, 0, 1.], [2, 1, 2, 2, 3], [3, 2, 3, 0, 0], [6, 6, 0, 5, 17], ]) X_nan = np.array([ - [1, np.nan, 1, 
np.nan, 1], + [1, np.nan, 1, np.nan, 1.], [2, 1, 2, 2, 3], [3, 2, 3, np.nan, np.nan], [6, 6, np.nan, 5, 17], @@ -441,7 +441,7 @@ def test_knn_imputation_zero(): statistics_mean = np.nanmean(X_nan, axis=0) X_imputed = np.array([ - [1, 1.5, 1, 2, 1], + [1, 1.5, 1, 2, 1.], [2, 1, 2, 2, 3], [3, 2, 3, 2, 2], [6, 6, 2.5, 5, 17], @@ -589,8 +589,8 @@ def test_default_with_invalid_input(): [3, 2, 3, 3, 8], [6, 6, 2, 5, 13], ]) - msg = "There are only %d samples, but n_neighbors=%d." % \ - (X.shape[0], imputer.n_neighbors) + msg = "There are only %d samples, but max_neighbors=%d." % \ + (X.shape[0], imputer.max_neighbors) assert_raise_message(ValueError, msg, imputer.fit, X) # Test with inf present @@ -605,7 +605,7 @@ def test_default_with_invalid_input(): msg = "+/- inf values are not allowed." assert_raise_message(ValueError, msg, KNNImputer().fit, X) - # Test with inf present in matrix passed in tranform + # Test with inf present in matrix passed in transform() X = np.array([ [np.inf, 1, 1, 2, np.nan], [2, 1, 2, 2, 3], @@ -627,7 +627,7 @@ def test_default_with_invalid_input(): assert_raise_message(ValueError, msg, KNNImputer().fit(X_fit).transform, X) -def test_knn_n_neighbors(): +def test_knn_max_neighbors(): X = np.array([ [0, 0], @@ -651,9 +651,9 @@ def test_knn_n_neighbors(): [14, 13] ]) - n_neighbors = 1 - imputer = KNNImputer(n_neighbors=n_neighbors) - imputer_plus1 = KNNImputer(n_neighbors=n_neighbors+1) + max_neighbors = 1 + imputer = KNNImputer(max_neighbors=max_neighbors) + imputer_plus1 = KNNImputer(max_neighbors=max_neighbors + 1) assert_array_equal(imputer.fit_transform(X), X_imputed_1NN) assert_array_equal(imputer.statistics_, statistics_mean) @@ -681,9 +681,9 @@ def test_knn_n_neighbors(): [14, 13] ]) - n_neighbors = 6 - imputer = KNNImputer(n_neighbors=6) - imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1) + max_neighbors = 6 + imputer = KNNImputer(max_neighbors=6) + imputer_plus1 = KNNImputer(max_neighbors=max_neighbors + 1) 
assert_array_equal(imputer.fit_transform(X), X_imputed_6NN) assert_array_equal(imputer.statistics_, statistics_mean) @@ -691,7 +691,7 @@ def test_knn_n_neighbors(): X).transform(X)) -def test_weight_type(): +def test_weight_uniform(): X = np.array([ [0, 0], [np.nan, 2], @@ -723,10 +723,22 @@ def no_weight(dist=None): imputer = KNNImputer(weights=no_weight) assert_array_equal(imputer.fit_transform(X), X_imputed_uniform) + +def test_weight_distance(): + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + # Test with "distance" weight nn = NearestNeighbors(metric="masked_euclidean") nn.fit(X) - # Get distance of "n_neighbors" neighbors of row 1 + # Get distance of "max_neighbors" neighbors of row 1 dist, index = nn.kneighbors() dist = dist[1, :] index = index[1, :] @@ -762,7 +774,7 @@ def no_weight(dist=None): assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance2, decimal=6) - # Test with weights = "distance" and n_neighbors=2 + # Test with weights = "distance" and max_neighbors=2 X = np.array([ [np.nan, 0, 0], [2, 1, 2], @@ -778,7 +790,7 @@ def no_weight(dist=None): [4, 5, 5], ]) - imputer = KNNImputer(n_neighbors=2, weights="distance") + imputer = KNNImputer(max_neighbors=2, weights="distance") assert_array_almost_equal(imputer.fit_transform(X), X_imputed, decimal=4) assert_array_equal(imputer.statistics_, statistics_mean) @@ -852,3 +864,30 @@ def test_metric_type(): # Test with a metric type without NaN support imputer = KNNImputer(metric="euclidean") assert_raises(ValueError, imputer.fit, X) + + +def test_callable_metric(): + + # Define callable metric that prefers a 6,9,... alternating pattern: + def always_six_nine(x, y, missing_values="NaN"): + x = np.ma.array(x, mask=np.isnan(x)) + y = np.ma.array(y, mask=np.isnan(y)) + dist = abs(np.nansum(x-y)) + return dist + + X = np.array([ + [4, 3, 3, np.nan], + [6, 9, 6, 9], + [4, 8, 6, 9], + [np.nan, 9, 11, 10.] 
+ ]) + + X_imputed = np.array([ + [4, 3, 3, 9], + [6, 9, 6, 9], + [4, 8, 6, 9], + [5, 9, 11, 10.] + ]) + + imputer = KNNImputer(max_neighbors=2, metric=always_six_nine) + assert_array_equal(imputer.fit_transform(X), X_imputed) From 7e8f900c970f25c42d4a3f35bc3cb3a25d327b6e Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Tue, 12 Dec 2017 05:13:58 -0600 Subject: [PATCH 57/97] Fixed plot_missing example --- examples/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index ea36eb733bb5f..48f5af1cad70f 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -80,7 +80,7 @@ # Estimate the score after kNN-imputation of the missing values knn_estimator = Pipeline( - [("knnimputer", KNNImputer(missing_values=0, n_neighbors=10)), + [("knnimputer", KNNImputer(missing_values=0, max_neighbors=10)), ("forest", RandomForestRegressor(random_state=0, n_estimators=100))]) knn_score = cross_val_score(knn_estimator, X_missing, y_missing).mean() print("Score after knn-imputation of the missing values = %.2f" % knn_score) From df9dba760c18efa1aef68d5671983720740364bb Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Tue, 12 Dec 2017 11:34:42 -0600 Subject: [PATCH 58/97] Fixed Error Msg --- sklearn/neighbors/base.py | 4 +--- sklearn/neighbors/tests/test_neighbors.py | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 5fd76fc5f5107..dfb8efa8d20aa 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -214,9 +214,7 @@ def _fit(self, X): if issparse(X): if allow_nans: raise ValueError( - "Nearest neighbor algorithm does not currently support " - "the use of sparse matrices for missing values." 
- ) + "kNN does not support sparse matrix with missing data") if self.algorithm not in ('auto', 'brute'): warnings.warn("cannot use tree with sparse input: " "using brute force") diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 20ba7bed91d92..54ecec2bff8b4 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -151,8 +151,7 @@ def test_masked_unsupervised_kneighbors(): samples = csc_matrix([[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]]) neigh = neighbors.NearestNeighbors(n_neighbors=2, metric="masked_euclidean") - msg = "Nearest neighbor algorithm does not currently support the use of " \ - "sparse matrices for missing values." + msg = "kNN does not support sparse matrix with missing data" assert_raise_message(ValueError, msg, neigh.fit, samples) From d26724a45dc8a4cdaa1230ce1ec3e774d0adc194 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Tue, 12 Dec 2017 12:15:20 -0600 Subject: [PATCH 59/97] Modified missing check for sparse matrix --- sklearn/neighbors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index dfb8efa8d20aa..eeda0ba873d6d 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -212,7 +212,7 @@ def _fit(self, X): raise ValueError("n_samples must be greater than 0") if issparse(X): - if allow_nans: + if np.any(np.isnan(X.data)): raise ValueError( "kNN does not support sparse matrix with missing data") if self.algorithm not in ('auto', 'brute'): From 2b327da8c86b84ee5910f0083dbe18e023bafda7 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Tue, 12 Dec 2017 13:10:19 -0600 Subject: [PATCH 60/97] Test update --- sklearn/preprocessing/tests/test_imputation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/tests/test_imputation.py 
b/sklearn/preprocessing/tests/test_imputation.py index 599f8667a3824..02788b6a643d3 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -868,11 +868,11 @@ def test_metric_type(): def test_callable_metric(): - # Define callable metric that prefers a 6,9,... alternating pattern: - def always_six_nine(x, y, missing_values="NaN"): + # Define callable metric that returns the l1 norm: + def custom_callable(x, y, missing_values="NaN"): x = np.ma.array(x, mask=np.isnan(x)) y = np.ma.array(y, mask=np.isnan(y)) - dist = abs(np.nansum(x-y)) + dist = np.nansum(np.abs(x-y)) return dist X = np.array([ @@ -889,5 +889,5 @@ def always_six_nine(x, y, missing_values="NaN"): [5, 9, 11, 10.] ]) - imputer = KNNImputer(max_neighbors=2, metric=always_six_nine) + imputer = KNNImputer(max_neighbors=2, metric=custom_callable) assert_array_equal(imputer.fit_transform(X), X_imputed) From 17046725cfc79c79c9780ee7249fcf1d10c4f1b5 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sun, 17 Dec 2017 02:01:17 -0600 Subject: [PATCH 61/97] Fixed nan check on sparse --- sklearn/metrics/pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index fa508be4d009e..80acf0a912fc2 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1373,7 +1373,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): missing_values = kwds.get("missing_values") if kwds.get( "missing_values") is not None else np.nan - if np.all(_get_mask(X, missing_values)): + if np.all(_get_mask(X.data if issparse(X) else X, missing_values)): raise ValueError( "One or more samples(s) only have missing values.") From a1cc41dd979c54f54ad0b25fa090fef561609afc Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sun, 17 Dec 2017 04:01:27 -0600 Subject: [PATCH 62/97] Review Comments 
Addressed (partial) --- doc/modules/preprocessing.rst | 2 +- sklearn/metrics/pairwise.py | 24 ++++++++++++------------ sklearn/preprocessing/imputation.py | 10 +++++----- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index d765423aa3244..ad56cacdeb624 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -553,7 +553,7 @@ estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missin The :class:`KNNImputer` class provides imputation for completing missing values using the k-Nearest Neighbors approach. Each sample's missing values -are imputed from up to n_neighbors nearest neighbors found in the training set. +are imputed from up to ``n_neighbors`` nearest neighbors found in the training set. Each missing feature is then imputed as the average, either weighted or unweighted, of the neighbors who have a value for it. When any of the neighbors themselves have the feature value missing then diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 80acf0a912fc2..9861ebe9782fa 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -288,10 +288,8 @@ def masked_euclidean_distances(X, Y=None, squared=False, missing_values="NaN", copy=True): """Calculates euclidean distances in the presence of missing values - Considering the rows of X (and Y=X) as samples, compute the distance matrix - between each pair of samples. Similarly, if Y is not X, then compute the - distance matrix between each sample pair (i.e., each row pair) in X and Y. - + Computes the euclidean distance between each pair of samples (rows) in X + and Y, where Y=X is assumed if Y=None. 
When calculating the distance between a pair of samples, this formulation essentially zero-weights feature coordinates with a missing value in either sample and scales up the weight of the remaining coordinates: @@ -363,7 +361,7 @@ def masked_euclidean_distances(X, Y=None, squared=False, mask_X = _get_mask(X, missing_values) YT = Y.T - mask_YT = _get_mask(YT, missing_values) + mask_YT = mask_X.T if Y is X else _get_mask(YT, missing_values) # Check if any rows have only missing value if np.any(mask_X.sum(axis=1) == X.shape[1])\ @@ -377,21 +375,23 @@ def masked_euclidean_distances(X, Y=None, squared=False, "NaN values present but missing_value = {0}".format( missing_values)) - # Get anti-mask and set Y.T's missing to zero - NYT = (~mask_YT).astype(np.int32) + # Get mask of non-missing values set Y.T's missing to zero. + # Further, casting the mask to int to be used in formula later. + not_YT = (~mask_YT).astype(np.int32) YT[mask_YT] = 0 - # Get X anti-mask and set X's missing to zero - NX = (~mask_X).astype(np.int32) + # Get X's mask of non-missing values and set X's missing to zero + not_X = (~mask_X).astype(np.int32) X[mask_X] = 0 # Calculate distances # The following formula derived by: # Shreya Bhattarai - distances = (X.shape[1] / (np.dot(NX, NYT))) * \ - (np.dot(X * X, NYT) - 2 * (np.dot(X, YT)) + - np.dot(NX, YT * YT)) + distances = ( + (X.shape[1] / (np.dot(not_X, not_YT))) * + (np.dot(X * X, not_YT) - 2 * (np.dot(X, YT)) + + np.dot(not_X, YT * YT))) if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 119de4964ba94..1d50df9e8ce71 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -391,8 +391,8 @@ class KNNImputer(BaseEstimator, TransformerMixin): ---------- missing_values : integer or "NaN", optional (default = "NaN") The placeholder for the missing values. 
All occurrences of - `missing_values` will be imputed. For missing values encoded as np.nan, - use the string value "NaN". + `missing_values` will be imputed. For missing values encoded as + ``np.nan``, use the string value "NaN". max_neighbors : int, optional (default = 5) Maximum number of neighboring samples to use for imputation. When any @@ -426,9 +426,9 @@ class KNNImputer(BaseEstimator, TransformerMixin): row_max_missing : float, optional (default = 0.5) The maximum percentage of columns (i.e. features) that can be missing before the sample is excluded from nearest neighbor imputation. It - means that such rows will not be considered a potential donor in fit(), - and in transform() their missing feature values will be imputed to be - the column mean for the entire dataset. + means that such rows will not be considered a potential donor in + ``fit()``, and in ``transform()`` their missing feature values will be + imputed to be the column mean for the entire dataset. col_max_missing : float, optional (default = 0.8) The maximum percentage of rows (or samples) that can be missing From 34f68a5448bf64257ae4cd8e5c93116803e2f516 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Tue, 19 Dec 2017 05:07:01 -0600 Subject: [PATCH 63/97] Updated doc module --- doc/modules/preprocessing.rst | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index a242c1659aaa0..016b4a1595ed3 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -645,13 +645,12 @@ estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missin The :class:`KNNImputer` class provides imputation for completing missing values using the k-Nearest Neighbors approach. Each sample's missing values -are imputed from up to ``n_neighbors`` nearest neighbors found in the training set. 
-Each missing feature is then imputed as the average, either weighted or -unweighted, of the neighbors who have a value for it. +are imputed from up to ``max_neighbors`` nearest neighbors found in the +training set. Each missing feature is then imputed as the average, either +weighted or unweighted, of the neighbors who have a value for it. When any of the neighbors themselves have the feature value missing then -the remaining n_neighbors-1 neighbors are used and, if need be, -the process repeats until a single neighbor remains. Where all neighbors have -that feature value missing, the training set average for that feature is used. +the remaining neighbors are used. Where all neighbors have that feature value +missing, the training set average for that feature is used. For more information on the methodology, see ref. [#]_. The following snippet demonstrates how to replace missing values, @@ -662,7 +661,7 @@ neighbors of the rows that contain the missing values:: >>> from sklearn.preprocessing.imputation import KNNImputer >>> nan = np.nan >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]] - >>> imputer = KNNImputer(n_neighbors=2, weights="uniform") + >>> imputer = KNNImputer(max_neighbors=2, weights="uniform") >>> imputer.fit_transform(X) array([[ 1. , 2. , 4. ], [ 3. , 4. , 3. 
], From 508270c0bca3cd40d7eb47998e2df0a00e4ada5c Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Thu, 25 Jan 2018 23:52:23 -0600 Subject: [PATCH 64/97] Added support for using only neighbors with non-missing features --- sklearn/preprocessing/imputation.py | 196 +++++++++++++----- .../preprocessing/tests/test_imputation.py | 34 +++ 2 files changed, 175 insertions(+), 55 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 8efc50d24f161..0d047e620ff11 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -473,7 +473,8 @@ class KNNImputer(BaseEstimator, TransformerMixin): def __init__(self, missing_values="NaN", max_neighbors=5, weights="uniform", metric="masked_euclidean", - row_max_missing=0.5, col_max_missing=0.8, copy=True): + row_max_missing=0.5, col_max_missing=0.8, + use_complete=False, copy=True): self.missing_values = missing_values self.max_neighbors = max_neighbors @@ -481,6 +482,7 @@ def __init__(self, missing_values="NaN", max_neighbors=5, self.metric = metric self.row_max_missing = row_max_missing self.col_max_missing = col_max_missing + self.use_complete = use_complete self.copy = copy def fit(self, X, y=None): @@ -633,60 +635,144 @@ def _transform(self, X, adjusted_max_neighbors): row_has_missing = row_total_missing.astype(np.bool) if np.any(row_has_missing): - # Row index of receivers & identify neighbors (potential donors) - receiver_row_index = np.where(row_has_missing)[0].reshape((-1, 1)) - neighbors = self._fitted_neighbors.kneighbors( - X[row_has_missing, :], n_neighbors=adjusted_max_neighbors) - - # Get row index, distance, and weights of donors - knn_distances, knn_row_index = neighbors - row_repeats = row_total_missing[row_total_missing != 0] - - # Weighting: Set self and degenerate donor(s) distance to inf - weight_matrix = None - if self.weights in ["distance"] or callable(self.weights): - weight_matrix = 
self._get_weight_matrix( - fitted_X, - mask, - adjusted_max_neighbors, - receiver_row_index, - row_repeats, - knn_row_index, - knn_distances - ) - - # Repeat each set donor indices by - # missing count in the corresponding recipient row - knn_row_index_repeat = np.repeat( - knn_row_index, row_repeats, axis=0).ravel() - - # Get repeated column index of donors - receiver_row_missing_index, receiver_col_missing_index = \ - np.where(mask) - knn_col_index_repeat = np.repeat(receiver_col_missing_index, - adjusted_max_neighbors) - - # Retrieve donor cells and calculate kNN score - donors = fitted_X[ - knn_row_index_repeat, knn_col_index_repeat].reshape( - (-1, adjusted_max_neighbors)) - donors_mask = _get_mask(donors, self.missing_values) - donors = np.ma.array(donors, mask=donors_mask) - - # Warning if donor count < max_neighbors - if np.any(donors_mask.sum(axis=1) < self.max_neighbors): - warnings.warn("One or more donor(s) have the relevant " - "feature value missing.") - - # Final imputation - imputed = np.ma.average(donors, axis=1, weights=weight_matrix) - X[mask] = imputed.data - unimputed_index = np.where(donors_mask.all(axis=1)) - if len(unimputed_index[0]) > 0: - unimputed_rows = receiver_row_missing_index[unimputed_index] - unimputed_cols = receiver_col_missing_index[unimputed_index] - X[unimputed_rows, unimputed_cols] = np.take(self.statistics_, - unimputed_cols) + if self.use_complete: + # Initializations + # Mask for fitted_X + mask_fx = _get_mask(fitted_X, np.nan) + + # Locate unique patterns + patterns, row_pat_idx = np.unique( + mask, return_inverse=True, axis=0) + + # Get row idx for receivers (missing) + receiver_row_missing_index, _ = np.where(mask) + + # For every pattern, index receivers and potential donors + for p in range(len(patterns)): + if not np.any(patterns[p]): + continue + # receivers are those with pattern 'p' + row_has_missing_pat = (row_pat_idx == p) + + # Donors have features missing in receivers available + # The bitwise-AND captures if 
something is missing in both + donor_row_idx = ~np.any(patterns[p] & mask_fx, axis=1) + + # Change donor set to ones not missing relevant features + self._fitted_neighbors._fit_X = fitted_X[donor_row_idx] + if len(self._fitted_neighbors._fit_X) < self.max_neighbors: + err_msg = "Insufficient neighbors with feature values." + raise ValueError(err_msg) + + # Row index of receivers & identify potential donors + receiver_row_index = np.where( + row_has_missing_pat)[0].reshape((-1, 1)) + neighbors = self._fitted_neighbors.kneighbors( + X[row_has_missing_pat, :], + n_neighbors=self.max_neighbors) + + # Get row index, distance, and weights of donors + knn_distances, knn_row_index = neighbors + row_repeats = row_total_missing[row_has_missing_pat] + + # Weighting: Set self/degenerate donor(s) distance to inf + weight_matrix = None + if self.weights in ["distance"] or callable(self.weights): + weight_matrix = self._get_weight_matrix( + self._fitted_neighbors._fit_X, + mask, + self.max_neighbors, + receiver_row_index, + row_repeats, + knn_row_index, + knn_distances + ) + + # Repeat each set donor indices by + # missing count in the corresponding recipient row + knn_row_index_repeat = np.repeat( + knn_row_index, row_repeats, axis=0).ravel() + + # Get repeated column index of donors + _, receiver_col_missing_index = \ + np.where(mask[row_has_missing_pat]) + knn_col_index_repeat = np.repeat( + receiver_col_missing_index, + self.max_neighbors) + + # Retrieve donor cells and calculate kNN score + donors = self._fitted_neighbors._fit_X[ + knn_row_index_repeat, knn_col_index_repeat].reshape( + (-1, self.max_neighbors)) + donors_mask = _get_mask(donors, self.missing_values) + donors = np.ma.array(donors, mask=donors_mask) + + # Final imputation + imputed = np.ma.average(donors, axis=1, + weights=weight_matrix) + X[np.where(row_has_missing_pat)[0], + receiver_col_missing_index] = imputed.data + + # Recover original dataset + self._fitted_neighbors._fit_X = fitted_X + else: + # Row 
index of receivers & identify potential donors + receiver_row_index = np.where( + row_has_missing)[0].reshape((-1, 1)) + neighbors = self._fitted_neighbors.kneighbors( + X[row_has_missing, :], n_neighbors=adjusted_max_neighbors) + + # Get row index, distance, and weights of donors + knn_distances, knn_row_index = neighbors + row_repeats = row_total_missing[row_total_missing != 0] + + # Weighting: Set self and degenerate donor(s) distance to inf + weight_matrix = None + if self.weights in ["distance"] or callable(self.weights): + weight_matrix = self._get_weight_matrix( + fitted_X, + mask, + adjusted_max_neighbors, + receiver_row_index, + row_repeats, + knn_row_index, + knn_distances + ) + + # Repeat each set donor indices by + # missing count in the corresponding recipient row + knn_row_index_repeat = np.repeat( + knn_row_index, row_repeats, axis=0).ravel() + + # Get repeated column index of donors + receiver_row_missing_index, receiver_col_missing_index = \ + np.where(mask) + knn_col_index_repeat = np.repeat(receiver_col_missing_index, + adjusted_max_neighbors) + + # Retrieve donor cells and calculate kNN score + donors = fitted_X[ + knn_row_index_repeat, knn_col_index_repeat].reshape( + (-1, adjusted_max_neighbors)) + donors_mask = _get_mask(donors, self.missing_values) + donors = np.ma.array(donors, mask=donors_mask) + + # Warning if donor count < max_neighbors + if np.any(donors_mask.sum(axis=1) < self.max_neighbors): + warnings.warn("One or more donor(s) have the relevant " + "feature value missing.") + + # Final imputation + imputed = np.ma.average(donors, axis=1, weights=weight_matrix) + X[mask] = imputed.data + unimputed_index = np.where(donors_mask.all(axis=1)) + if len(unimputed_index[0]) > 0: + unimputed_rows = receiver_row_missing_index[ + unimputed_index] + unimputed_cols = receiver_col_missing_index[ + unimputed_index] + X[unimputed_rows, unimputed_cols] = np.take( + self.statistics_, unimputed_cols) # Merge bad rows to X and mean impute their missing 
values if np.any(bad_rows): diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 32ff9efd06375..30971e83804c3 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -897,3 +897,37 @@ def custom_callable(x, y, missing_values="NaN"): imputer = KNNImputer(max_neighbors=2, metric=custom_callable) assert_array_equal(imputer.fit_transform(X), X_imputed) + + +def test_complete_features(): + + # Test with use_complete=True + X = np.array([ + [0, 0, 0, np.nan], + [1, 1, 1, np.nan], + [2, 2, np.nan, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [np.nan, 7, 7, 7] + ]) + + r0c3 = np.mean(X[2:-1, -1]) + r1c3 = np.mean(X[2:-1, -1]) + r2c2 = np.nanmean(X[:6, 2]) + r7c0 = np.mean(X[2:-1, 0]) + + X_imputed = np.array([ + [0, 0, 0, r0c3], + [1, 1, 1, r1c3], + [2, 2, r2c2, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [r7c0, 7, 7, 7] + ]) + + imputer_comp = KNNImputer(use_complete=True) + assert_array_equal(imputer_comp.fit_transform(X), X_imputed) From 0562054ca8674b94580e3e81a042b3e1469eed46 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Fri, 26 Jan 2018 00:10:16 -0600 Subject: [PATCH 65/97] Test update --- sklearn/preprocessing/tests/test_imputation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 30971e83804c3..37650c9a143fd 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -930,4 +930,4 @@ def test_complete_features(): ]) imputer_comp = KNNImputer(use_complete=True) - assert_array_equal(imputer_comp.fit_transform(X), X_imputed) + assert_array_almost_equal(imputer_comp.fit_transform(X), X_imputed) From 24943ec7fa020337d34bd1ad325bf1cff2936fca Mon Sep 17 00:00:00 2001 From: harke 
<22569641+ashimb9@users.noreply.github.com> Date: Fri, 26 Jan 2018 15:17:06 -0600 Subject: [PATCH 66/97] Import Numpy code for np.unique for older versions --- sklearn/preprocessing/imputation.py | 203 +++++++++++++++++++++++++++- 1 file changed, 200 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 0d047e620ff11..e9f1a2bc3ab68 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -62,6 +62,196 @@ def _most_frequent(array, extra_value, n_repeat): return extra_value +# Code for function _unique1d taken directly from Numpy +def _unique1d(ar, return_index=False, return_inverse=False, + return_counts=False): + """ + Find the unique elements of an array, ignoring shape. + """ + ar = np.asanyarray(ar).flatten() + + optional_indices = return_index or return_inverse + optional_returns = optional_indices or return_counts + + if ar.size == 0: + if not optional_returns: + ret = ar + else: + ret = (ar,) + if return_index: + ret += (np.empty(0, np.bool),) + if return_inverse: + ret += (np.empty(0, np.bool),) + if return_counts: + ret += (np.empty(0, np.intp),) + return ret + + if optional_indices: + perm = ar.argsort(kind='mergesort' if return_index else 'quicksort') + aux = ar[perm] + else: + ar.sort() + aux = ar + flag = np.concatenate(([True], aux[1:] != aux[:-1])) + + if not optional_returns: + ret = aux[flag] + else: + ret = (aux[flag],) + if return_index: + ret += (perm[flag],) + if return_inverse: + iflag = np.cumsum(flag) - 1 + inv_idx = np.empty(ar.shape, dtype=np.intp) + inv_idx[perm] = iflag + ret += (inv_idx,) + if return_counts: + idx = np.concatenate(np.nonzero(flag) + ([ar.size],)) + ret += (np.diff(idx),) + return ret + + +# Code for function _unique taken directly from Numpy +def _unique(ar, return_index=False, return_inverse=False, + return_counts=False, axis=None): + """ + Find the unique elements of an array. 
+ + Returns the sorted unique elements of an array. There are three optional + outputs in addition to the unique elements: the indices of the input array + that give the unique values, the indices of the unique array that + reconstruct the input array, and the number of times each unique value + comes up in the input array. + + Parameters + ---------- + ar : array_like + Input array. Unless `axis` is specified, this will be flattened if it + is not already 1-D. + return_index : bool, optional + If True, also return the indices of `ar` (along the specified axis, + if provided, or in the flattened array) that result in the unique + array. + return_inverse : bool, optional + If True, also return the indices of the unique array (for the specified + axis, if provided) that can be used to reconstruct `ar`. + return_counts : bool, optional + If True, also return the number of times each unique item appears + in `ar`. + .. versionadded:: 1.9.0 + axis : int or None, optional + The axis to operate on. If None, `ar` will be flattened beforehand. + Otherwise, duplicate items will be removed along the provided axis, + with all the other axes belonging to the each of the unique elements. + Object arrays or structured arrays that contain objects are not + supported if the `axis` kwarg is used. + .. versionadded:: 1.13.0 + + + + Returns + ------- + unique : ndarray + The sorted unique values. + unique_indices : ndarray, optional + The indices of the first occurrences of the unique values in the + original array. Only provided if `return_index` is True. + unique_inverse : ndarray, optional + The indices to reconstruct the original array from the + unique array. Only provided if `return_inverse` is True. + unique_counts : ndarray, optional + The number of times each of the unique values comes up in the + original array. Only provided if `return_counts` is True. + .. 
versionadded:: 1.9.0 + + See Also + -------- + numpy.lib.arraysetops : Module with a number of other functions for + performing set operations on arrays. + + Examples + -------- + >>> np.unique([1, 1, 2, 2, 3, 3]) + array([1, 2, 3]) + >>> a = np.array([[1, 1], [2, 3]]) + >>> np.unique(a) + array([1, 2, 3]) + + Return the unique rows of a 2D array + + >>> a = np.array([[1, 0, 0], [1, 0, 0], [2, 3, 4]]) + >>> np.unique(a, axis=0) + array([[1, 0, 0], [2, 3, 4]]) + + Return the indices of the original array that give the unique values: + + >>> a = np.array(['a', 'b', 'b', 'c', 'a']) + >>> u, indices = np.unique(a, return_index=True) + >>> u + array(['a', 'b', 'c'], + dtype='|S1') + >>> indices + array([0, 1, 3]) + >>> a[indices] + array(['a', 'b', 'c'], + dtype='|S1') + + Reconstruct the input array from the unique values: + + >>> a = np.array([1, 2, 6, 4, 2, 3, 2]) + >>> u, indices = np.unique(a, return_inverse=True) + >>> u + array([1, 2, 3, 4, 6]) + >>> indices + array([0, 1, 4, 3, 1, 2, 1]) + >>> u[indices] + array([1, 2, 6, 4, 2, 3, 2]) + + """ + ar = np.asanyarray(ar) + if axis is None: + return _unique1d(ar, return_index, return_inverse, return_counts) + if not (-ar.ndim <= axis < ar.ndim): + raise ValueError('Invalid axis kwarg specified for unique') + + ar = np.swapaxes(ar, axis, 0) + orig_shape, orig_dtype = ar.shape, ar.dtype + # Must reshape to a contiguous 2D array for this to work... + ar = ar.reshape(orig_shape[0], -1) + ar = np.ascontiguousarray(ar) + + if ar.dtype.char in (np.typecodes['AllInteger'] + + np.typecodes['Datetime'] + 'S'): + # Optimization: Creating a view of your data with a np.void data type + # of size the number of bytes in a full row. Handles any type where + # items have a unique binary representation, i.e. 0 is only 0, + # not +0 and -0. 
+ dtype = np.dtype((np.void, ar.dtype.itemsize * ar.shape[1])) + else: + dtype = [('f{i}'.format(i=i), ar.dtype) for i in range(ar.shape[1])] + + try: + consolidated = ar.view(dtype) + except TypeError: + # There's no good way to do this for object arrays, etc... + msg = 'The axis argument to unique is not supported for dtype {dt}' + raise TypeError(msg.format(dt=ar.dtype)) + + def reshape_uniq(uniq): + uniq = uniq.view(orig_dtype) + uniq = uniq.reshape(-1, *orig_shape[1:]) + uniq = np.swapaxes(uniq, 0, axis) + return uniq + + output = _unique1d(consolidated, return_index, + return_inverse, return_counts) + if not (return_index or return_inverse or return_counts): + return reshape_uniq(output) + else: + uniq = reshape_uniq(output[0]) + return (uniq,) + output[1:] + + class Imputer(BaseEstimator, TransformerMixin): """Imputation transformer for completing missing values. @@ -640,9 +830,16 @@ def _transform(self, X, adjusted_max_neighbors): # Mask for fitted_X mask_fx = _get_mask(fitted_X, np.nan) - # Locate unique patterns - patterns, row_pat_idx = np.unique( - mask, return_inverse=True, axis=0) + # Locate unique patterns, but first a numpy version check + np_version = np.__version__.split(".") + + # Different behavior for np.unique in Numpy < 1.13.x + if float('.'.join(np_version[:2])) < 1.13: + patterns, row_pat_idx = _unique( + mask, return_inverse=True, axis=0) + else: + patterns, row_pat_idx = np.unique( + mask, return_inverse=True, axis=0) # Get row idx for receivers (missing) receiver_row_missing_index, _ = np.where(mask) From a449c5bed6456d867bee1b6147467857534e9f74 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Fri, 26 Jan 2018 15:38:24 -0600 Subject: [PATCH 67/97] Remove version check --- sklearn/preprocessing/imputation.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index e9f1a2bc3ab68..e31cc346e5eae 100644 
--- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -830,16 +830,9 @@ def _transform(self, X, adjusted_max_neighbors): # Mask for fitted_X mask_fx = _get_mask(fitted_X, np.nan) - # Locate unique patterns, but first a numpy version check - np_version = np.__version__.split(".") - - # Different behavior for np.unique in Numpy < 1.13.x - if float('.'.join(np_version[:2])) < 1.13: - patterns, row_pat_idx = _unique( - mask, return_inverse=True, axis=0) - else: - patterns, row_pat_idx = np.unique( - mask, return_inverse=True, axis=0) + # Locate unique patterns + patterns, row_pat_idx = _unique( + mask, return_inverse=True, axis=0) # Get row idx for receivers (missing) receiver_row_missing_index, _ = np.where(mask) From a485db94b740d4d2f11040d3d2f0cdedb12620ac Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Fri, 26 Jan 2018 16:01:48 -0600 Subject: [PATCH 68/97] Minor fix --- sklearn/preprocessing/imputation.py | 38 ----------------------------- 1 file changed, 38 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index e31cc346e5eae..603e557bcd029 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -169,44 +169,6 @@ def _unique(ar, return_index=False, return_inverse=False, numpy.lib.arraysetops : Module with a number of other functions for performing set operations on arrays. 
- Examples - -------- - >>> np.unique([1, 1, 2, 2, 3, 3]) - array([1, 2, 3]) - >>> a = np.array([[1, 1], [2, 3]]) - >>> np.unique(a) - array([1, 2, 3]) - - Return the unique rows of a 2D array - - >>> a = np.array([[1, 0, 0], [1, 0, 0], [2, 3, 4]]) - >>> np.unique(a, axis=0) - array([[1, 0, 0], [2, 3, 4]]) - - Return the indices of the original array that give the unique values: - - >>> a = np.array(['a', 'b', 'b', 'c', 'a']) - >>> u, indices = np.unique(a, return_index=True) - >>> u - array(['a', 'b', 'c'], - dtype='|S1') - >>> indices - array([0, 1, 3]) - >>> a[indices] - array(['a', 'b', 'c'], - dtype='|S1') - - Reconstruct the input array from the unique values: - - >>> a = np.array([1, 2, 6, 4, 2, 3, 2]) - >>> u, indices = np.unique(a, return_inverse=True) - >>> u - array([1, 2, 3, 4, 6]) - >>> indices - array([0, 1, 4, 3, 1, 2, 1]) - >>> u[indices] - array([1, 2, 6, 4, 2, 3, 2]) - """ ar = np.asanyarray(ar) if axis is None: From 60585484d21471aa40a3fb0f0c363f03a6792d87 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Tue, 27 Mar 2018 21:46:34 -0500 Subject: [PATCH 69/97] Added strategy to only use neighbors with non-nan value --- sklearn/preprocessing/imputation.py | 207 ++++++++++-------- .../preprocessing/tests/test_imputation.py | 115 +++++++--- 2 files changed, 195 insertions(+), 127 deletions(-) diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py index 603e557bcd029..157053d19da5a 100644 --- a/sklearn/preprocessing/imputation.py +++ b/sklearn/preprocessing/imputation.py @@ -10,11 +10,13 @@ from scipy import stats from ..base import BaseEstimator, TransformerMixin +# from ..neighbors.base import _get_weights from ..utils import check_array from ..utils.sparsefuncs import _get_median from ..utils.validation import check_is_fitted from ..utils.validation import FLOAT_DTYPES - +from ..utils import _get_n_jobs +from ..metrics import pairwise_distances from ..externals import six zip = 
six.moves.zip @@ -62,6 +64,52 @@ def _most_frequent(array, extra_value, n_repeat): return extra_value +# Skeletal version of KNeighborsMixin.kneighbors() +def _neighbors(X, donor_X=None, metric="masked_euclidean", n_jobs=1, + **metric_params): + """Finds the unsorted K-neighbors of a point. + + Returns unsorted indices of and distances to the neighbors of each point. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The samples whose neighbors are to be evaluated + + donor_X : array-like, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + n_jobs : int, optional (default = 1) + The number of parallel jobs to run for neighbors search. + If ``-1``, then the number of jobs is set to the number of CPU cores. + + Returns + ------- + dist : array + Array representing the unsorted lengths to points, only present if + return_distance=True + + ind : array + Indices of the unsorted nearest points in the population matrix. + + """ + n_samples, _ = X.shape + sample_range = np.arange(n_samples)[:, None] + + n_jobs = _get_n_jobs(n_jobs) + dist = pairwise_distances(X, donor_X, + metric=metric, + n_jobs=n_jobs, + squared=True, + **metric_params) + neigh_ind = np.argsort(dist[sample_range, :]) + + return neigh_ind + + # Code for function _unique1d taken directly from Numpy def _unique1d(ar, return_index=False, return_inverse=False, return_counts=False): @@ -533,7 +581,7 @@ def transform(self, X): class KNNImputer(BaseEstimator, TransformerMixin): """Imputation for completing missing values using k-Nearest Neighbors. - Each sample's missing values are imputed from up to ``max_neighbors`` + Each sample's missing values are imputed from up to ``n_neighbors`` nearest neighbors found in the training set. 
Each missing feature is then imputed as the average, either weighted or unweighted, of these neighbors who have a value for it. Where all neighbors have that feature value @@ -546,7 +594,7 @@ class KNNImputer(BaseEstimator, TransformerMixin): `missing_values` will be imputed. For missing values encoded as ``np.nan``, use the string value "NaN". - max_neighbors : int, optional (default = 5) + n_neighbors : int, optional (default = 5) Maximum number of neighboring samples to use for imputation. When any of the neighbors themselves have the feature value missing then the remaining neighbors, if any, that have the feature value available are @@ -615,7 +663,7 @@ class KNNImputer(BaseEstimator, TransformerMixin): >>> from sklearn.preprocessing.imputation import KNNImputer >>> nan = float("NaN") >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]] - >>> imputer = KNNImputer(max_neighbors=2, weights="uniform") + >>> imputer = KNNImputer(n_neighbors=2, weights="uniform") >>> imputer.fit_transform(X) array([[ 1. , 2. , 4. ], [ 3. , 4. , 3. ], @@ -623,13 +671,13 @@ class KNNImputer(BaseEstimator, TransformerMixin): [ 8. , 8. , 7. ]]) """ - def __init__(self, missing_values="NaN", max_neighbors=5, + def __init__(self, missing_values="NaN", n_neighbors=5, weights="uniform", metric="masked_euclidean", row_max_missing=0.5, col_max_missing=0.8, use_complete=False, copy=True): self.missing_values = missing_values - self.max_neighbors = max_neighbors + self.n_neighbors = n_neighbors self.weights = weights self.metric = metric self.row_max_missing = row_max_missing @@ -685,21 +733,22 @@ def fit(self, X, y=None): X = X[~bad_rows, :] # Check if sufficient neighboring samples available - if X.shape[0] < self.max_neighbors: - raise ValueError("There are only %d samples, but max_neighbors=%d." - % (X.shape[0], self.max_neighbors)) + if X.shape[0] < self.n_neighbors: + raise ValueError("There are only %d samples, but n_neighbors=%d." 
+ % (X.shape[0], self.n_neighbors)) # Instantiate NN object, get column means, and store in statistics_ - neigh = NearestNeighbors(n_neighbors=self.max_neighbors, + neigh = NearestNeighbors(n_neighbors=self.n_neighbors, metric=self.metric, metric_params={"missing_values": self.missing_values}) self._fitted_neighbors = neigh.fit(X) + # self.fitted_X_ = X self.statistics_ = X_col_means return self - def _get_weight_matrix(self, fitted_X, mask, adjusted_max_neighbors, + def _get_weight_matrix(self, fitted_X, mask, adjusted_n_neighbors, receiver_row_index, row_repeats, knn_row_index, knn_distances): """Get the weight matrix for the donors""" @@ -708,7 +757,7 @@ def _get_weight_matrix(self, fitted_X, mask, adjusted_max_neighbors, from ..neighbors.base import _get_weights # If different X in transform, get a new mask - if self.max_neighbors == adjusted_max_neighbors: + if self.n_neighbors == adjusted_n_neighbors: nbors_mask = _get_mask(fitted_X[knn_row_index], value_to_mask=self.missing_values) else: @@ -733,10 +782,10 @@ def _get_weight_matrix(self, fitted_X, mask, adjusted_max_neighbors, weight_matrix = np.repeat(weight_matrix, row_repeats, axis=0).ravel() weight_matrix = weight_matrix.reshape( - (-1, adjusted_max_neighbors)) + (-1, adjusted_n_neighbors)) return weight_matrix - def _transform(self, X, adjusted_max_neighbors): + def _transform(self, X, adjusted_n_neighbors): """Impute all missing values in X. Parameters @@ -744,14 +793,15 @@ def _transform(self, X, adjusted_max_neighbors): X : {array-like}, shape = [n_samples, n_features] The input data to complete. - adjusted_max_neighbors : int + adjusted_n_neighbors : int Depending on the calling method, the default value must - either be equal to max_neighbors or max_neighbors + 1. + either be equal to n_neighbors or n_neighbors + 1. If the calling method is transform(), then its value needs to be - equal to max_neighbors and if calling method is fit_transform() - then its value must be equal to max_neighbors + 1. 
+ equal to n_neighbors and if calling method is fit_transform() + then its value must be equal to n_neighbors + 1. """ - + # Import here to avoud circular import + from ..neighbors.base import _get_weights check_is_fitted(self, 'statistics_') force_all_finite = False if self.missing_values in ["NaN", np.nan] else True @@ -764,11 +814,14 @@ def _transform(self, X, adjusted_max_neighbors): # Get fitted data and ensure correct dimension fitted_X = self._fitted_neighbors._fit_X - if X.shape[1] != fitted_X.shape[1]: + n_rows_fit_X, n_cols_fit_X = fitted_X.shape + n_rows_X, n_cols_X = X.shape + + if n_cols_X != n_cols_fit_X: raise ValueError("Incompatible dimension between the fitted " "dataset and the one to be transformed.") mask = _get_mask(X, self.missing_values) - n_rows_X, n_cols_X = X.shape + row_total_missing = mask.sum(axis=1) if not np.any(row_total_missing): return X @@ -789,90 +842,58 @@ def _transform(self, X, adjusted_max_neighbors): if np.any(row_has_missing): if self.use_complete: # Initializations + # Mask for fitted_X mask_fx = _get_mask(fitted_X, np.nan) - # Locate unique patterns - patterns, row_pat_idx = _unique( - mask, return_inverse=True, axis=0) - - # Get row idx for receivers (missing) - receiver_row_missing_index, _ = np.where(mask) + # Get row index of missing and distance from donors + n_adj_samples, _ = X[row_has_missing].shape + dist = pairwise_distances(X, + fitted_X, + metric=self.metric, + squared=False) # For every pattern, index receivers and potential donors - for p in range(len(patterns)): - if not np.any(patterns[p]): + for c in range(n_cols_X): + if not np.any(mask[:, c], axis=0): continue - # receivers are those with pattern 'p' - row_has_missing_pat = (row_pat_idx == p) - - # Donors have features missing in receivers available - # The bitwise-AND captures if something is missing in both - donor_row_idx = ~np.any(patterns[p] & mask_fx, axis=1) - - # Change donor set to ones not missing relevant features - 
self._fitted_neighbors._fit_X = fitted_X[donor_row_idx] - if len(self._fitted_neighbors._fit_X) < self.max_neighbors: - err_msg = "Insufficient neighbors with feature values." - raise ValueError(err_msg) - - # Row index of receivers & identify potential donors - receiver_row_index = np.where( - row_has_missing_pat)[0].reshape((-1, 1)) - neighbors = self._fitted_neighbors.kneighbors( - X[row_has_missing_pat, :], - n_neighbors=self.max_neighbors) - - # Get row index, distance, and weights of donors - knn_distances, knn_row_index = neighbors - row_repeats = row_total_missing[row_has_missing_pat] - - # Weighting: Set self/degenerate donor(s) distance to inf - weight_matrix = None - if self.weights in ["distance"] or callable(self.weights): - weight_matrix = self._get_weight_matrix( - self._fitted_neighbors._fit_X, - mask, - self.max_neighbors, - receiver_row_index, - row_repeats, - knn_row_index, - knn_distances - ) - - # Repeat each set donor indices by - # missing count in the corresponding recipient row - knn_row_index_repeat = np.repeat( - knn_row_index, row_repeats, axis=0).ravel() - - # Get repeated column index of donors - _, receiver_col_missing_index = \ - np.where(mask[row_has_missing_pat]) - knn_col_index_repeat = np.repeat( - receiver_col_missing_index, - self.max_neighbors) + # Row index for receivers and potential donors (pdonors) + receivers_row_idx = np.where(mask[:, c])[0] + pdonors_row_idx = np.where(~mask_fx[:, c])[0] + + # Get distance from potential donors + dist_pdonors = dist[receivers_row_idx][:, pdonors_row_idx] + dist_pdonors = dist_pdonors.reshape(-1, + len(pdonors_row_idx)) + pdonors_idx = np.argpartition( + dist_pdonors, self.n_neighbors - 1, axis=1) + + # Get final donors row index from pdonors + donors_idx = pdonors_idx[:, :self.n_neighbors] + # Get weights or None + dist_pdonors_rows = np.arange(len(donors_idx))[:, None] + weight_matrix = _get_weights( + dist_pdonors[ + dist_pdonors_rows, donors_idx], self.weights) + donor_row_idx_ravel 
= donors_idx.ravel() # Retrieve donor cells and calculate kNN score - donors = self._fitted_neighbors._fit_X[ - knn_row_index_repeat, knn_col_index_repeat].reshape( - (-1, self.max_neighbors)) + fitted_X_temp = fitted_X[pdonors_row_idx] + donors = fitted_X_temp[donor_row_idx_ravel, c].reshape( + (-1, self.n_neighbors)) donors_mask = _get_mask(donors, self.missing_values) donors = np.ma.array(donors, mask=donors_mask) # Final imputation imputed = np.ma.average(donors, axis=1, weights=weight_matrix) - X[np.where(row_has_missing_pat)[0], - receiver_col_missing_index] = imputed.data - - # Recover original dataset - self._fitted_neighbors._fit_X = fitted_X + X[receivers_row_idx, c] = imputed.data else: # Row index of receivers & identify potential donors receiver_row_index = np.where( row_has_missing)[0].reshape((-1, 1)) neighbors = self._fitted_neighbors.kneighbors( - X[row_has_missing, :], n_neighbors=adjusted_max_neighbors) + X[row_has_missing, :], n_neighbors=adjusted_n_neighbors) # Get row index, distance, and weights of donors knn_distances, knn_row_index = neighbors @@ -884,7 +905,7 @@ def _transform(self, X, adjusted_max_neighbors): weight_matrix = self._get_weight_matrix( fitted_X, mask, - adjusted_max_neighbors, + adjusted_n_neighbors, receiver_row_index, row_repeats, knn_row_index, @@ -900,17 +921,17 @@ def _transform(self, X, adjusted_max_neighbors): receiver_row_missing_index, receiver_col_missing_index = \ np.where(mask) knn_col_index_repeat = np.repeat(receiver_col_missing_index, - adjusted_max_neighbors) + adjusted_n_neighbors) # Retrieve donor cells and calculate kNN score donors = fitted_X[ knn_row_index_repeat, knn_col_index_repeat].reshape( - (-1, adjusted_max_neighbors)) + (-1, adjusted_n_neighbors)) donors_mask = _get_mask(donors, self.missing_values) donors = np.ma.array(donors, mask=donors_mask) - # Warning if donor count < max_neighbors - if np.any(donors_mask.sum(axis=1) < self.max_neighbors): + # Warning if donor count < n_neighbors + if 
np.any(donors_mask.sum(axis=1) < self.n_neighbors): warnings.warn("One or more donor(s) have the relevant " "feature value missing.") @@ -955,7 +976,7 @@ def fit_transform(self, X, y=None, **fit_params): Returns imputed dataset. """ return self.fit(X)._transform( - X, adjusted_max_neighbors=self.max_neighbors + 1) + X, adjusted_n_neighbors=self.n_neighbors + 1) def transform(self, X): """Impute all missing values in X. @@ -978,4 +999,4 @@ def transform(self, X): Returns imputed dataset. """ check_is_fitted(self, 'statistics_') - return self._transform(X, adjusted_max_neighbors=self.max_neighbors) + return self._transform(X, adjusted_n_neighbors=self.n_neighbors) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 37650c9a143fd..ceef745087da6 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -12,6 +12,7 @@ from sklearn.preprocessing.imputation import Imputer from sklearn.preprocessing.imputation import KNNImputer from sklearn.metrics.pairwise import masked_euclidean_distances +from sklearn.metrics.pairwise import pairwise_distances from sklearn.neighbors import NearestNeighbors from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV @@ -392,8 +393,8 @@ def test_knn_imputation_shape(): X[0, 0] = np.nan for weights in ['uniform', 'distance']: - for max_neighbors in range(1, 6): - imputer = KNNImputer(max_neighbors=max_neighbors, weights=weights) + for n_neighbors in range(1, 6): + imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights) X_imputed = imputer.fit_transform(X) assert_equal(X_imputed.shape, (n_rows, n_cols)) @@ -401,12 +402,12 @@ def test_knn_imputation_shape(): def test_knn_imputation_zero(): # Test imputation when missing_values == 0 missing_values = 0 - max_neighbors = 2 + n_neighbors = 2 imputer = KNNImputer(missing_values=missing_values, - max_neighbors=max_neighbors, + 
n_neighbors=n_neighbors, weights="uniform") imputer_nan = KNNImputer(missing_values="NaN", - max_neighbors=max_neighbors, + n_neighbors=n_neighbors, weights="uniform") # Test with missing_values=0 when NaN present @@ -595,8 +596,8 @@ def test_default_with_invalid_input(): [3, 2, 3, 3, 8], [6, 6, 2, 5, 13], ]) - msg = "There are only %d samples, but max_neighbors=%d." % \ - (X.shape[0], imputer.max_neighbors) + msg = "There are only %d samples, but n_neighbors=%d." % \ + (X.shape[0], imputer.n_neighbors) assert_raise_message(ValueError, msg, imputer.fit, X) # Test with inf present @@ -633,7 +634,7 @@ def test_default_with_invalid_input(): assert_raise_message(ValueError, msg, KNNImputer().fit(X_fit).transform, X) -def test_knn_max_neighbors(): +def test_knn_n_neighbors(): X = np.array([ [0, 0], @@ -657,9 +658,9 @@ def test_knn_max_neighbors(): [14, 13] ]) - max_neighbors = 1 - imputer = KNNImputer(max_neighbors=max_neighbors) - imputer_plus1 = KNNImputer(max_neighbors=max_neighbors + 1) + n_neighbors = 1 + imputer = KNNImputer(n_neighbors=n_neighbors) + imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1) assert_array_equal(imputer.fit_transform(X), X_imputed_1NN) assert_array_equal(imputer.statistics_, statistics_mean) @@ -687,9 +688,9 @@ def test_knn_max_neighbors(): [14, 13] ]) - max_neighbors = 6 - imputer = KNNImputer(max_neighbors=6) - imputer_plus1 = KNNImputer(max_neighbors=max_neighbors + 1) + n_neighbors = 6 + imputer = KNNImputer(n_neighbors=6) + imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1) assert_array_equal(imputer.fit_transform(X), X_imputed_6NN) assert_array_equal(imputer.statistics_, statistics_mean) @@ -744,7 +745,7 @@ def test_weight_distance(): # Test with "distance" weight nn = NearestNeighbors(metric="masked_euclidean") nn.fit(X) - # Get distance of "max_neighbors" neighbors of row 1 + # Get distance of "n_neighbors" neighbors of row 1 dist, index = nn.kneighbors() dist = dist[1, :] index = index[1, :] @@ -780,7 +781,7 @@ def 
test_weight_distance(): assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance2, decimal=6) - # Test with weights = "distance" and max_neighbors=2 + # Test with weights = "distance" and n_neighbors=2 X = np.array([ [np.nan, 0, 0], [2, 1, 2], @@ -796,7 +797,7 @@ def test_weight_distance(): [4, 5, 5], ]) - imputer = KNNImputer(max_neighbors=2, weights="distance") + imputer = KNNImputer(n_neighbors=2, weights="distance") assert_array_almost_equal(imputer.fit_transform(X), X_imputed, decimal=4) assert_array_equal(imputer.statistics_, statistics_mean) @@ -895,7 +896,7 @@ def custom_callable(x, y, missing_values="NaN"): [5, 9, 11, 10.] ]) - imputer = KNNImputer(max_neighbors=2, metric=custom_callable) + imputer = KNNImputer(n_neighbors=2, metric=custom_callable) assert_array_equal(imputer.fit_transform(X), X_imputed) @@ -903,31 +904,77 @@ def test_complete_features(): # Test with use_complete=True X = np.array([ - [0, 0, 0, np.nan], - [1, 1, 1, np.nan], - [2, 2, np.nan, 2], - [3, 3, 3, 3], - [4, 4, 4, 4], - [5, 5, 5, 5], - [6, 6, 6, 6], - [np.nan, 7, 7, 7] + [0, np.nan, 0, np.nan], + [1, 1, 1, np.nan], + [2, 2, np.nan, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [np.nan, 7, 7, 7] ]) + r0c1 = np.mean(X[1:6, 1]) r0c3 = np.mean(X[2:-1, -1]) r1c3 = np.mean(X[2:-1, -1]) r2c2 = np.nanmean(X[:6, 2]) r7c0 = np.mean(X[2:-1, 0]) X_imputed = np.array([ - [0, 0, 0, r0c3], - [1, 1, 1, r1c3], - [2, 2, r2c2, 2], - [3, 3, 3, 3], - [4, 4, 4, 4], - [5, 5, 5, 5], - [6, 6, 6, 6], - [r7c0, 7, 7, 7] + [0, r0c1, 0, r0c3], + [1, 1, 1, r1c3], + [2, 2, r2c2, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [r7c0, 7, 7, 7] ]) imputer_comp = KNNImputer(use_complete=True) assert_array_almost_equal(imputer_comp.fit_transform(X), X_imputed) + + +def test_complete_features_weighted(): + + # Test with use_complete=True + X = np.array([ + [0, 0, 0, np.nan], + [1, 1, 1, np.nan], + [2, 2, np.nan, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 
5], + [6, 6, 6, 6], + [np.nan, 7, 7, 7] + ]) + + dist = pairwise_distances(X, + metric="masked_euclidean", + squared=False) + + # Calculate weights + r0c3_w = 1.0 / dist[0, 2:-1] + r1c3_w = 1.0 / dist[1, 2:-1] + r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)] + r7c0_w = 1.0 / dist[7, 2:7] + + # Calculate weighted averages + r0c3 = np.average(X[2:-1, -1], weights=r0c3_w) + r1c3 = np.average(X[2:-1, -1], weights=r1c3_w) + r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w) + r7c0 = np.average(X[2:7, 0], weights=r7c0_w) + + X_imputed = np.array([ + [0, 0, 0, r0c3], + [1, 1, 1, r1c3], + [2, 2, r2c2, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [r7c0, 7, 7, 7] + ]) + + imputer_comp_wt = KNNImputer(weights="distance", use_complete=True) + assert_array_almost_equal(imputer_comp_wt.fit_transform(X), X_imputed) From 0b67233d14ee6fe6440794cccea196f33bc39e16 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sat, 31 Mar 2018 01:29:52 -0500 Subject: [PATCH 70/97] Edit import path in test file --- .../preprocessing/tests/test_imputation.py | 602 +---------------- sklearn/tests/test_impute.py | 606 +++++++++++++++++- 2 files changed, 606 insertions(+), 602 deletions(-) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 4e2164b7730f1..94e2531bd0c97 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -11,7 +11,7 @@ from sklearn.utils.testing import ignore_warnings from sklearn.preprocessing.imputation import Imputer -from sklearn.preprocessing.imputation import KNNImputer +from sklearn.impute import KNNImputer from sklearn.metrics.pairwise import masked_euclidean_distances from sklearn.metrics.pairwise import pairwise_distances from sklearn.neighbors import NearestNeighbors @@ -388,603 +388,3 @@ def test_imputation_copy(): # Note: If X is sparse and if missing_values=0, then a (dense) copy of X is 
# made, even if copy=False. - - -############################################################################# -# BEGIN KNNIMPUTER TEST - - -def test_knn_imputation_shape(): - # Verify the shapes of the imputed matrix for different weights and - # number of neighbors. - n_rows = 10 - n_cols = 2 - X = np.random.rand(n_rows, n_cols) - X[0, 0] = np.nan - - for weights in ['uniform', 'distance']: - for n_neighbors in range(1, 6): - imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights) - X_imputed = imputer.fit_transform(X) - assert_equal(X_imputed.shape, (n_rows, n_cols)) - - -def test_knn_imputation_zero(): - # Test imputation when missing_values == 0 - missing_values = 0 - n_neighbors = 2 - imputer = KNNImputer(missing_values=missing_values, - n_neighbors=n_neighbors, - weights="uniform") - imputer_nan = KNNImputer(missing_values="NaN", - n_neighbors=n_neighbors, - weights="uniform") - - # Test with missing_values=0 when NaN present - X = np.array([ - [np.nan, 0, 0, 0, 5], - [np.nan, 1, 0, np.nan, 3], - [np.nan, 2, 0, 0, 0], - [np.nan, 6, 0, 5, 13], - ]) - msg = "Input contains NaN, infinity or a value too large for %r." 
% X.dtype - assert_raise_message(ValueError, msg, imputer.fit, X) - - # Test with % zeros in column > col_max_missing - X = np.array([ - [1, 0, 0, 0, 5], - [2, 1, 0, 2, 3], - [3, 2, 0, 0, 0], - [4, 6, 0, 5, 13], - ]) - msg = "Some column(s) have more than {}% missing values".format( - imputer.col_max_missing * 100) - assert_raise_message(ValueError, msg, imputer.fit, X) - - # Test with an imputable matrix and also compare with missing_values="NaN" - X = np.array([ - [1, 0, 1, 0, 1.], - [2, 1, 2, 2, 3], - [3, 2, 3, 0, 0], - [6, 6, 0, 5, 17], - ]) - - X_nan = np.array([ - [1, np.nan, 1, np.nan, 1.], - [2, 1, 2, 2, 3], - [3, 2, 3, np.nan, np.nan], - [6, 6, np.nan, 5, 17], - ]) - statistics_mean = np.nanmean(X_nan, axis=0) - - X_imputed = np.array([ - [1, 1.5, 1, 2, 1.], - [2, 1, 2, 2, 3], - [3, 2, 3, 2, 2], - [6, 6, 2.5, 5, 17], - ]) - - assert_array_equal(imputer.fit_transform(X), X_imputed) - assert_array_equal(imputer.statistics_, statistics_mean) - assert_array_equal(imputer.fit_transform(X), imputer_nan.fit_transform( - X_nan)) - - -def test_knn_imputation_default(): - # Test imputation with default parameter values - - # Test with an imputable matrix - X = np.array([ - [1, 0, 0, 1], - [2, 1, 2, np.nan], - [3, 2, 3, np.nan], - [np.nan, 4, 5, 5], - [6, np.nan, 6, 7], - [8, 8, 8, 8], - [16, 15, 18, 19], - ]) - statistics_mean = np.nanmean(X, axis=0) - - X_imputed = np.array([ - [1, 0, 0, 1], - [2, 1, 2, 5.25], - [3, 2, 3, 5.25], - [4, 4, 5, 5], - [6, 3, 6, 7], - [8, 8, 8, 8], - [16, 15, 18, 19], - ]) - - imputer = KNNImputer() - assert_array_equal(imputer.fit_transform(X), X_imputed) - assert_array_equal(imputer.statistics_, statistics_mean) - - # Test with % missing in row > row_max_missing - X = np.array([ - [1, 0, 0, 1], - [2, 1, 2, np.nan], - [3, 2, 3, np.nan], - [np.nan, 4, 5, 5], - [6, np.nan, 6, 7], - [8, 8, 8, 8], - [np.nan, np.nan, np.nan, 19], - ]) - statistics_mean = np.nanmean(X, axis=0) - - X_imputed = np.array([ - [1, 0, 0, 1], - [2, 1, 2, 5.25], - 
[3, 2, 3, 5.25], - [4, 4, 5, 5], - [6, 3, 6, 7], - [8, 8, 8, 8], - [4, 3, 4, 19], - ]) - - imputer = KNNImputer() - assert_array_equal(imputer.fit_transform(X), X_imputed) - assert_array_equal(imputer.statistics_, statistics_mean) - - # Test with all neighboring donors also having missing feature values - X = np.array([ - [1, 0, 0, np.nan], - [2, 1, 2, np.nan], - [3, 2, 3, np.nan], - [4, 4, 5, np.nan], - [6, 7, 6, np.nan], - [8, 8, 8, np.nan], - [20, 20, 20, 20], - [22, 22, 22, 22] - ]) - statistics_mean = np.nanmean(X, axis=0) - - X_imputed = np.array([ - [1, 0, 0, 21], - [2, 1, 2, 21], - [3, 2, 3, 21], - [4, 4, 5, 21], - [6, 7, 6, 21], - [8, 8, 8, 21], - [20, 20, 20, 20], - [22, 22, 22, 22] - ]) - - imputer = KNNImputer() - assert_array_equal(imputer.fit_transform(X), X_imputed) - assert_array_equal(imputer.statistics_, statistics_mean) - - # Test when data in fit() and transform() are different - X = np.array([ - [0, 0], - [np.nan, 2], - [4, 3], - [5, 6], - [7, 7], - [9, 8], - [11, 16] - ]) - statistics_mean = np.nanmean(X, axis=0) - - Y = np.array([ - [1, 0], - [3, 2], - [4, np.nan] - ]) - - Y_imputed = np.array([ - [1, 0], - [3, 2], - [4, 4.8] - ]) - - imputer = KNNImputer() - assert_array_equal(imputer.fit(X).transform(Y), Y_imputed) - assert_array_equal(imputer.statistics_, statistics_mean) - - -def test_default_with_invalid_input(): - # Test imputation with default values and invalid input - - # Test with % missing in a column > col_max_missing - X = np.array([ - [np.nan, 0, 0, 0, 5], - [np.nan, 1, 0, np.nan, 3], - [np.nan, 2, 0, 0, 0], - [np.nan, 6, 0, 5, 13], - [np.nan, 7, 0, 7, 8], - [np.nan, 8, 0, 8, 9], - ]) - imputer = KNNImputer() - msg = "Some column(s) have more than {}% missing values".format( - imputer.col_max_missing * 100) - assert_raise_message(ValueError, msg, imputer.fit, X) - - # Test with insufficient number of neighbors - X = np.array([ - [1, 1, 1, 2, np.nan], - [2, 1, 2, 2, 3], - [3, 2, 3, 3, 8], - [6, 6, 2, 5, 13], - ]) - msg = "There 
are only %d samples, but n_neighbors=%d." % \ - (X.shape[0], imputer.n_neighbors) - assert_raise_message(ValueError, msg, imputer.fit, X) - - # Test with inf present - X = np.array([ - [np.inf, 1, 1, 2, np.nan], - [2, 1, 2, 2, 3], - [3, 2, 3, 3, 8], - [np.nan, 6, 0, 5, 13], - [np.nan, 7, 0, 7, 8], - [6, 6, 2, 5, 7], - ]) - msg = "+/- inf values are not allowed." - assert_raise_message(ValueError, msg, KNNImputer().fit, X) - - # Test with inf present in matrix passed in transform() - X = np.array([ - [np.inf, 1, 1, 2, np.nan], - [2, 1, 2, 2, 3], - [3, 2, 3, 3, 8], - [np.nan, 6, 0, 5, 13], - [np.nan, 7, 0, 7, 8], - [6, 6, 2, 5, 7], - ]) - - X_fit = np.array([ - [0, 1, 1, 2, np.nan], - [2, 1, 2, 2, 3], - [3, 2, 3, 3, 8], - [np.nan, 6, 0, 5, 13], - [np.nan, 7, 0, 7, 8], - [6, 6, 2, 5, 7], - ]) - msg = "+/- inf values are not allowed in data to be transformed." - assert_raise_message(ValueError, msg, KNNImputer().fit(X_fit).transform, X) - - -def test_knn_n_neighbors(): - - X = np.array([ - [0, 0], - [np.nan, 2], - [4, 3], - [5, np.nan], - [7, 7], - [np.nan, 8], - [14, 13] - ]) - statistics_mean = np.nanmean(X, axis=0) - - # Test with 1 neighbor - X_imputed_1NN = np.array([ - [0, 0], - [4, 2], - [4, 3], - [5, 3], - [7, 7], - [7, 8], - [14, 13] - ]) - - n_neighbors = 1 - imputer = KNNImputer(n_neighbors=n_neighbors) - imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1) - - assert_array_equal(imputer.fit_transform(X), X_imputed_1NN) - assert_array_equal(imputer.statistics_, statistics_mean) - assert_array_equal(imputer.fit_transform(X), imputer_plus1.fit( - X).transform(X)) - - # Test with 6 neighbors - X = np.array([ - [0, 0], - [np.nan, 2], - [4, 3], - [5, np.nan], - [7, 7], - [np.nan, 8], - [14, 13] - ]) - - X_imputed_6NN = np.array([ - [0, 0], - [6, 2], - [4, 3], - [5, 5.5], - [7, 7], - [6, 8], - [14, 13] - ]) - - n_neighbors = 6 - imputer = KNNImputer(n_neighbors=6) - imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1) - - 
assert_array_equal(imputer.fit_transform(X), X_imputed_6NN) - assert_array_equal(imputer.statistics_, statistics_mean) - assert_array_equal(imputer.fit_transform(X), imputer_plus1.fit( - X).transform(X)) - - -def test_weight_uniform(): - X = np.array([ - [0, 0], - [np.nan, 2], - [4, 3], - [5, 6], - [7, 7], - [9, 8], - [11, 10] - ]) - - # Test with "uniform" weight (or unweighted) - X_imputed_uniform = np.array([ - [0, 0], - [5, 2], - [4, 3], - [5, 6], - [7, 7], - [9, 8], - [11, 10] - ]) - - imputer = KNNImputer(weights="uniform") - assert_array_equal(imputer.fit_transform(X), X_imputed_uniform) - - # Test with "callable" weight - def no_weight(dist=None): - return None - - imputer = KNNImputer(weights=no_weight) - assert_array_equal(imputer.fit_transform(X), X_imputed_uniform) - - -def test_weight_distance(): - X = np.array([ - [0, 0], - [np.nan, 2], - [4, 3], - [5, 6], - [7, 7], - [9, 8], - [11, 10] - ]) - - # Test with "distance" weight - nn = NearestNeighbors(metric="masked_euclidean") - nn.fit(X) - # Get distance of "n_neighbors" neighbors of row 1 - dist, index = nn.kneighbors() - dist = dist[1, :] - index = index[1, :] - weights = 1 / dist - values = X[index, 0] - imputed = np.dot(values, weights) / np.sum(weights) - - # Manual calculation - X_imputed_distance1 = np.array([ - [0, 0], - [3.850394, 2], - [4, 3], - [5, 6], - [7, 7], - [9, 8], - [11, 10] - ]) - - # NearestNeighbor calculation - X_imputed_distance2 = np.array([ - [0, 0], - [imputed, 2], - [4, 3], - [5, 6], - [7, 7], - [9, 8], - [11, 10] - ]) - - imputer = KNNImputer(weights="distance") - assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance1, - decimal=6) - assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance2, - decimal=6) - - # Test with weights = "distance" and n_neighbors=2 - X = np.array([ - [np.nan, 0, 0], - [2, 1, 2], - [3, 2, 3], - [4, 5, 5], - ]) - statistics_mean = np.nanmean(X, axis=0) - - X_imputed = np.array([ - [2.3828, 0, 0], - [2, 1, 2], - [3, 2, 
3], - [4, 5, 5], - ]) - - imputer = KNNImputer(n_neighbors=2, weights="distance") - assert_array_almost_equal(imputer.fit_transform(X), X_imputed, - decimal=4) - assert_array_equal(imputer.statistics_, statistics_mean) - - # Test with varying missingness patterns - X = np.array([ - [1, 0, 0, 1], - [0, np.nan, 1, np.nan], - [1, 1, 1, np.nan], - [0, 1, 0, 0], - [0, np.nan, 1, 0], - [1, 1, 1, 1], - [10, 10, 10, 10], - ]) - statistics_mean = np.nanmean(X, axis=0) - - # Get weights of donor neighbors - dist = masked_euclidean_distances(X) - row1_nbor_dists = dist[1, :6] - row1_nbor_dists[np.array([1, 2, 4])] = np.inf # Degenerate neighbors - row1_nbor_wt = 1/row1_nbor_dists - - row2_nbor_dists = dist[2, :6] - row2_nbor_dists[np.array([1, 2])] = np.inf # Degenerate neighbors - row2_nbor_wt = 1/row2_nbor_dists - # A non-degenerate donor has zero distance so it's weight is 1 and - # others have weight 0 - row2_nbor_wt[~np.isinf(row2_nbor_wt)] = 0 - row2_nbor_wt[np.isinf(row2_nbor_wt)] = 1 - - row4_nbor_dists = dist[4, :6] - row4_nbor_dists[np.array([1, 4])] = np.inf # Degenerate neighbors - row4_nbor_wt = 1/row4_nbor_dists - - # Collect donor values - col1_donor_values = np.ma.masked_invalid(X[:6, 1].copy()) - col3_donor_values = np.ma.masked_invalid(X[:6, 3].copy()) - - # Final imputed values - r1c1_imp = np.ma.average(col1_donor_values, weights=row1_nbor_wt) - r1c3_imp = np.ma.average(col3_donor_values, weights=row1_nbor_wt) - r2c3_imp = np.ma.average(col3_donor_values, weights=row2_nbor_wt) - r4c1_imp = np.ma.average(col1_donor_values, weights=row4_nbor_wt) - - X_imputed = np.array([ - [1, 0, 0, 1], - [0, r1c1_imp, 1, r1c3_imp], - [1, 1, 1, r2c3_imp], - [0, 1, 0, 0], - [0, r4c1_imp, 1, 0], - [1, 1, 1, 1], - [10, 10, 10, 10], - ]) - - imputer = KNNImputer(weights="distance") - assert_array_almost_equal(imputer.fit_transform(X), X_imputed, decimal=6) - assert_array_equal(imputer.statistics_, statistics_mean) - - -def test_metric_type(): - X = np.array([ - [0, 0], - 
[np.nan, 2], - [4, 3], - [5, 6], - [7, 7], - [9, 8], - [11, 10] - ]) - - # Test with a metric type without NaN support - imputer = KNNImputer(metric="euclidean") - assert_raises(ValueError, imputer.fit, X) - - -def test_callable_metric(): - - # Define callable metric that returns the l1 norm: - def custom_callable(x, y, missing_values="NaN"): - x = np.ma.array(x, mask=np.isnan(x)) - y = np.ma.array(y, mask=np.isnan(y)) - dist = np.nansum(np.abs(x-y)) - return dist - - X = np.array([ - [4, 3, 3, np.nan], - [6, 9, 6, 9], - [4, 8, 6, 9], - [np.nan, 9, 11, 10.] - ]) - - X_imputed = np.array([ - [4, 3, 3, 9], - [6, 9, 6, 9], - [4, 8, 6, 9], - [5, 9, 11, 10.] - ]) - - imputer = KNNImputer(n_neighbors=2, metric=custom_callable) - assert_array_equal(imputer.fit_transform(X), X_imputed) - - -def test_complete_features(): - - # Test with use_complete=True - X = np.array([ - [0, np.nan, 0, np.nan], - [1, 1, 1, np.nan], - [2, 2, np.nan, 2], - [3, 3, 3, 3], - [4, 4, 4, 4], - [5, 5, 5, 5], - [6, 6, 6, 6], - [np.nan, 7, 7, 7] - ]) - - r0c1 = np.mean(X[1:6, 1]) - r0c3 = np.mean(X[2:-1, -1]) - r1c3 = np.mean(X[2:-1, -1]) - r2c2 = np.nanmean(X[:6, 2]) - r7c0 = np.mean(X[2:-1, 0]) - - X_imputed = np.array([ - [0, r0c1, 0, r0c3], - [1, 1, 1, r1c3], - [2, 2, r2c2, 2], - [3, 3, 3, 3], - [4, 4, 4, 4], - [5, 5, 5, 5], - [6, 6, 6, 6], - [r7c0, 7, 7, 7] - ]) - - imputer_comp = KNNImputer(use_complete=True) - assert_array_almost_equal(imputer_comp.fit_transform(X), X_imputed) - - -def test_complete_features_weighted(): - - # Test with use_complete=True - X = np.array([ - [0, 0, 0, np.nan], - [1, 1, 1, np.nan], - [2, 2, np.nan, 2], - [3, 3, 3, 3], - [4, 4, 4, 4], - [5, 5, 5, 5], - [6, 6, 6, 6], - [np.nan, 7, 7, 7] - ]) - - dist = pairwise_distances(X, - metric="masked_euclidean", - squared=False) - - # Calculate weights - r0c3_w = 1.0 / dist[0, 2:-1] - r1c3_w = 1.0 / dist[1, 2:-1] - r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)] - r7c0_w = 1.0 / dist[7, 2:7] - - # Calculate weighted averages - r0c3 
= np.average(X[2:-1, -1], weights=r0c3_w) - r1c3 = np.average(X[2:-1, -1], weights=r1c3_w) - r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w) - r7c0 = np.average(X[2:7, 0], weights=r7c0_w) - - X_imputed = np.array([ - [0, 0, 0, r0c3], - [1, 1, 1, r1c3], - [2, 2, r2c2, 2], - [3, 3, 3, 3], - [4, 4, 4, 4], - [5, 5, 5, 5], - [6, 6, 6, 6], - [r7c0, 7, 7, 7] - ]) - - imputer_comp_wt = KNNImputer(weights="distance", use_complete=True) - assert_array_almost_equal(imputer_comp_wt.fit_transform(X), X_imputed) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index f2bf5912e2213..722277a82c50d 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -6,13 +6,17 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_false -from sklearn.impute import SimpleImputer +from sklearn.impute import SimpleImputer, KNNImputer from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn import tree from sklearn.random_projection import sparse_random_matrix +from sklearn.metrics.pairwise import masked_euclidean_distances +from sklearn.metrics.pairwise import pairwise_distances +from sklearn.neighbors import NearestNeighbors def _check_statistics(X, X_true, @@ -257,3 +261,603 @@ def test_imputation_copy(): Xt = imputer.fit(X).transform(X) Xt.data[0] = -1 assert_false(np.all(X.data == Xt.data)) + + +############################################################################# +# BEGIN KNNIMPUTER TEST + + +def test_knn_imputation_shape(): + # Verify the shapes of the imputed matrix for different weights and + # number of neighbors. 
+ n_rows = 10 + n_cols = 2 + X = np.random.rand(n_rows, n_cols) + X[0, 0] = np.nan + + for weights in ['uniform', 'distance']: + for n_neighbors in range(1, 6): + imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights) + X_imputed = imputer.fit_transform(X) + assert_equal(X_imputed.shape, (n_rows, n_cols)) + + +def test_knn_imputation_zero(): + # Test imputation when missing_values == 0 + missing_values = 0 + n_neighbors = 2 + imputer = KNNImputer(missing_values=missing_values, + n_neighbors=n_neighbors, + weights="uniform") + imputer_nan = KNNImputer(missing_values="NaN", + n_neighbors=n_neighbors, + weights="uniform") + + # Test with missing_values=0 when NaN present + X = np.array([ + [np.nan, 0, 0, 0, 5], + [np.nan, 1, 0, np.nan, 3], + [np.nan, 2, 0, 0, 0], + [np.nan, 6, 0, 5, 13], + ]) + msg = "Input contains NaN, infinity or a value too large for %r." % X.dtype + assert_raise_message(ValueError, msg, imputer.fit, X) + + # Test with % zeros in column > col_max_missing + X = np.array([ + [1, 0, 0, 0, 5], + [2, 1, 0, 2, 3], + [3, 2, 0, 0, 0], + [4, 6, 0, 5, 13], + ]) + msg = "Some column(s) have more than {}% missing values".format( + imputer.col_max_missing * 100) + assert_raise_message(ValueError, msg, imputer.fit, X) + + # Test with an imputable matrix and also compare with missing_values="NaN" + X = np.array([ + [1, 0, 1, 0, 1.], + [2, 1, 2, 2, 3], + [3, 2, 3, 0, 0], + [6, 6, 0, 5, 17], + ]) + + X_nan = np.array([ + [1, np.nan, 1, np.nan, 1.], + [2, 1, 2, 2, 3], + [3, 2, 3, np.nan, np.nan], + [6, 6, np.nan, 5, 17], + ]) + statistics_mean = np.nanmean(X_nan, axis=0) + + X_imputed = np.array([ + [1, 1.5, 1, 2, 1.], + [2, 1, 2, 2, 3], + [3, 2, 3, 2, 2], + [6, 6, 2.5, 5, 17], + ]) + + assert_array_equal(imputer.fit_transform(X), X_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) + assert_array_equal(imputer.fit_transform(X), imputer_nan.fit_transform( + X_nan)) + + +def test_knn_imputation_default(): + # Test imputation with default 
parameter values + + # Test with an imputable matrix + X = np.array([ + [1, 0, 0, 1], + [2, 1, 2, np.nan], + [3, 2, 3, np.nan], + [np.nan, 4, 5, 5], + [6, np.nan, 6, 7], + [8, 8, 8, 8], + [16, 15, 18, 19], + ]) + statistics_mean = np.nanmean(X, axis=0) + + X_imputed = np.array([ + [1, 0, 0, 1], + [2, 1, 2, 5.25], + [3, 2, 3, 5.25], + [4, 4, 5, 5], + [6, 3, 6, 7], + [8, 8, 8, 8], + [16, 15, 18, 19], + ]) + + imputer = KNNImputer() + assert_array_equal(imputer.fit_transform(X), X_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) + + # Test with % missing in row > row_max_missing + X = np.array([ + [1, 0, 0, 1], + [2, 1, 2, np.nan], + [3, 2, 3, np.nan], + [np.nan, 4, 5, 5], + [6, np.nan, 6, 7], + [8, 8, 8, 8], + [np.nan, np.nan, np.nan, 19], + ]) + statistics_mean = np.nanmean(X, axis=0) + + X_imputed = np.array([ + [1, 0, 0, 1], + [2, 1, 2, 5.25], + [3, 2, 3, 5.25], + [4, 4, 5, 5], + [6, 3, 6, 7], + [8, 8, 8, 8], + [4, 3, 4, 19], + ]) + + imputer = KNNImputer() + assert_array_equal(imputer.fit_transform(X), X_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) + + # Test with all neighboring donors also having missing feature values + X = np.array([ + [1, 0, 0, np.nan], + [2, 1, 2, np.nan], + [3, 2, 3, np.nan], + [4, 4, 5, np.nan], + [6, 7, 6, np.nan], + [8, 8, 8, np.nan], + [20, 20, 20, 20], + [22, 22, 22, 22] + ]) + statistics_mean = np.nanmean(X, axis=0) + + X_imputed = np.array([ + [1, 0, 0, 21], + [2, 1, 2, 21], + [3, 2, 3, 21], + [4, 4, 5, 21], + [6, 7, 6, 21], + [8, 8, 8, 21], + [20, 20, 20, 20], + [22, 22, 22, 22] + ]) + + imputer = KNNImputer() + assert_array_equal(imputer.fit_transform(X), X_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) + + # Test when data in fit() and transform() are different + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 16] + ]) + statistics_mean = np.nanmean(X, axis=0) + + Y = np.array([ + [1, 0], + [3, 2], + [4, np.nan] + ]) + + 
Y_imputed = np.array([ + [1, 0], + [3, 2], + [4, 4.8] + ]) + + imputer = KNNImputer() + assert_array_equal(imputer.fit(X).transform(Y), Y_imputed) + assert_array_equal(imputer.statistics_, statistics_mean) + + +def test_default_with_invalid_input(): + # Test imputation with default values and invalid input + + # Test with % missing in a column > col_max_missing + X = np.array([ + [np.nan, 0, 0, 0, 5], + [np.nan, 1, 0, np.nan, 3], + [np.nan, 2, 0, 0, 0], + [np.nan, 6, 0, 5, 13], + [np.nan, 7, 0, 7, 8], + [np.nan, 8, 0, 8, 9], + ]) + imputer = KNNImputer() + msg = "Some column(s) have more than {}% missing values".format( + imputer.col_max_missing * 100) + assert_raise_message(ValueError, msg, imputer.fit, X) + + # Test with insufficient number of neighbors + X = np.array([ + [1, 1, 1, 2, np.nan], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [6, 6, 2, 5, 13], + ]) + msg = "There are only %d samples, but n_neighbors=%d." % \ + (X.shape[0], imputer.n_neighbors) + assert_raise_message(ValueError, msg, imputer.fit, X) + + # Test with inf present + X = np.array([ + [np.inf, 1, 1, 2, np.nan], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [np.nan, 6, 0, 5, 13], + [np.nan, 7, 0, 7, 8], + [6, 6, 2, 5, 7], + ]) + msg = "+/- inf values are not allowed." + assert_raise_message(ValueError, msg, KNNImputer().fit, X) + + # Test with inf present in matrix passed in transform() + X = np.array([ + [np.inf, 1, 1, 2, np.nan], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [np.nan, 6, 0, 5, 13], + [np.nan, 7, 0, 7, 8], + [6, 6, 2, 5, 7], + ]) + + X_fit = np.array([ + [0, 1, 1, 2, np.nan], + [2, 1, 2, 2, 3], + [3, 2, 3, 3, 8], + [np.nan, 6, 0, 5, 13], + [np.nan, 7, 0, 7, 8], + [6, 6, 2, 5, 7], + ]) + msg = "+/- inf values are not allowed in data to be transformed." 
+ assert_raise_message(ValueError, msg, KNNImputer().fit(X_fit).transform, X) + + +def test_knn_n_neighbors(): + + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, np.nan], + [7, 7], + [np.nan, 8], + [14, 13] + ]) + statistics_mean = np.nanmean(X, axis=0) + + # Test with 1 neighbor + X_imputed_1NN = np.array([ + [0, 0], + [4, 2], + [4, 3], + [5, 3], + [7, 7], + [7, 8], + [14, 13] + ]) + + n_neighbors = 1 + imputer = KNNImputer(n_neighbors=n_neighbors) + imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1) + + assert_array_equal(imputer.fit_transform(X), X_imputed_1NN) + assert_array_equal(imputer.statistics_, statistics_mean) + assert_array_equal(imputer.fit_transform(X), imputer_plus1.fit( + X).transform(X)) + + # Test with 6 neighbors + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, np.nan], + [7, 7], + [np.nan, 8], + [14, 13] + ]) + + X_imputed_6NN = np.array([ + [0, 0], + [6, 2], + [4, 3], + [5, 5.5], + [7, 7], + [6, 8], + [14, 13] + ]) + + n_neighbors = 6 + imputer = KNNImputer(n_neighbors=6) + imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1) + + assert_array_equal(imputer.fit_transform(X), X_imputed_6NN) + assert_array_equal(imputer.statistics_, statistics_mean) + assert_array_equal(imputer.fit_transform(X), imputer_plus1.fit( + X).transform(X)) + + +def test_weight_uniform(): + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + # Test with "uniform" weight (or unweighted) + X_imputed_uniform = np.array([ + [0, 0], + [5, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + imputer = KNNImputer(weights="uniform") + assert_array_equal(imputer.fit_transform(X), X_imputed_uniform) + + # Test with "callable" weight + def no_weight(dist=None): + return None + + imputer = KNNImputer(weights=no_weight) + assert_array_equal(imputer.fit_transform(X), X_imputed_uniform) + + +def test_weight_distance(): + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + 
[11, 10] + ]) + + # Test with "distance" weight + nn = NearestNeighbors(metric="masked_euclidean") + nn.fit(X) + # Get distance of "n_neighbors" neighbors of row 1 + dist, index = nn.kneighbors() + dist = dist[1, :] + index = index[1, :] + weights = 1 / dist + values = X[index, 0] + imputed = np.dot(values, weights) / np.sum(weights) + + # Manual calculation + X_imputed_distance1 = np.array([ + [0, 0], + [3.850394, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + # NearestNeighbor calculation + X_imputed_distance2 = np.array([ + [0, 0], + [imputed, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + imputer = KNNImputer(weights="distance") + assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance1, + decimal=6) + assert_array_almost_equal(imputer.fit_transform(X), X_imputed_distance2, + decimal=6) + + # Test with weights = "distance" and n_neighbors=2 + X = np.array([ + [np.nan, 0, 0], + [2, 1, 2], + [3, 2, 3], + [4, 5, 5], + ]) + statistics_mean = np.nanmean(X, axis=0) + + X_imputed = np.array([ + [2.3828, 0, 0], + [2, 1, 2], + [3, 2, 3], + [4, 5, 5], + ]) + + imputer = KNNImputer(n_neighbors=2, weights="distance") + assert_array_almost_equal(imputer.fit_transform(X), X_imputed, + decimal=4) + assert_array_equal(imputer.statistics_, statistics_mean) + + # Test with varying missingness patterns + X = np.array([ + [1, 0, 0, 1], + [0, np.nan, 1, np.nan], + [1, 1, 1, np.nan], + [0, 1, 0, 0], + [0, np.nan, 1, 0], + [1, 1, 1, 1], + [10, 10, 10, 10], + ]) + statistics_mean = np.nanmean(X, axis=0) + + # Get weights of donor neighbors + dist = masked_euclidean_distances(X) + row1_nbor_dists = dist[1, :6] + row1_nbor_dists[np.array([1, 2, 4])] = np.inf # Degenerate neighbors + row1_nbor_wt = 1/row1_nbor_dists + + row2_nbor_dists = dist[2, :6] + row2_nbor_dists[np.array([1, 2])] = np.inf # Degenerate neighbors + row2_nbor_wt = 1/row2_nbor_dists + # A non-degenerate donor has zero distance so it's weight is 1 and + # others have 
weight 0 + row2_nbor_wt[~np.isinf(row2_nbor_wt)] = 0 + row2_nbor_wt[np.isinf(row2_nbor_wt)] = 1 + + row4_nbor_dists = dist[4, :6] + row4_nbor_dists[np.array([1, 4])] = np.inf # Degenerate neighbors + row4_nbor_wt = 1/row4_nbor_dists + + # Collect donor values + col1_donor_values = np.ma.masked_invalid(X[:6, 1].copy()) + col3_donor_values = np.ma.masked_invalid(X[:6, 3].copy()) + + # Final imputed values + r1c1_imp = np.ma.average(col1_donor_values, weights=row1_nbor_wt) + r1c3_imp = np.ma.average(col3_donor_values, weights=row1_nbor_wt) + r2c3_imp = np.ma.average(col3_donor_values, weights=row2_nbor_wt) + r4c1_imp = np.ma.average(col1_donor_values, weights=row4_nbor_wt) + + X_imputed = np.array([ + [1, 0, 0, 1], + [0, r1c1_imp, 1, r1c3_imp], + [1, 1, 1, r2c3_imp], + [0, 1, 0, 0], + [0, r4c1_imp, 1, 0], + [1, 1, 1, 1], + [10, 10, 10, 10], + ]) + + imputer = KNNImputer(weights="distance") + assert_array_almost_equal(imputer.fit_transform(X), X_imputed, decimal=6) + assert_array_equal(imputer.statistics_, statistics_mean) + + +def test_metric_type(): + X = np.array([ + [0, 0], + [np.nan, 2], + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] + ]) + + # Test with a metric type without NaN support + imputer = KNNImputer(metric="euclidean") + assert_raises(ValueError, imputer.fit, X) + + +def test_callable_metric(): + + # Define callable metric that returns the l1 norm: + def custom_callable(x, y, missing_values="NaN"): + x = np.ma.array(x, mask=np.isnan(x)) + y = np.ma.array(y, mask=np.isnan(y)) + dist = np.nansum(np.abs(x-y)) + return dist + + X = np.array([ + [4, 3, 3, np.nan], + [6, 9, 6, 9], + [4, 8, 6, 9], + [np.nan, 9, 11, 10.] + ]) + + X_imputed = np.array([ + [4, 3, 3, 9], + [6, 9, 6, 9], + [4, 8, 6, 9], + [5, 9, 11, 10.] 
+ ]) + + imputer = KNNImputer(n_neighbors=2, metric=custom_callable) + assert_array_equal(imputer.fit_transform(X), X_imputed) + + +def test_complete_features(): + + # Test with use_complete=True + X = np.array([ + [0, np.nan, 0, np.nan], + [1, 1, 1, np.nan], + [2, 2, np.nan, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [np.nan, 7, 7, 7] + ]) + + r0c1 = np.mean(X[1:6, 1]) + r0c3 = np.mean(X[2:-1, -1]) + r1c3 = np.mean(X[2:-1, -1]) + r2c2 = np.nanmean(X[:6, 2]) + r7c0 = np.mean(X[2:-1, 0]) + + X_imputed = np.array([ + [0, r0c1, 0, r0c3], + [1, 1, 1, r1c3], + [2, 2, r2c2, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [r7c0, 7, 7, 7] + ]) + + imputer_comp = KNNImputer(use_complete=True) + assert_array_almost_equal(imputer_comp.fit_transform(X), X_imputed) + + +def test_complete_features_weighted(): + + # Test with use_complete=True + X = np.array([ + [0, 0, 0, np.nan], + [1, 1, 1, np.nan], + [2, 2, np.nan, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [np.nan, 7, 7, 7] + ]) + + dist = pairwise_distances(X, + metric="masked_euclidean", + squared=False) + + # Calculate weights + r0c3_w = 1.0 / dist[0, 2:-1] + r1c3_w = 1.0 / dist[1, 2:-1] + r2c2_w = 1.0 / dist[2, (0, 1, 3, 4, 5)] + r7c0_w = 1.0 / dist[7, 2:7] + + # Calculate weighted averages + r0c3 = np.average(X[2:-1, -1], weights=r0c3_w) + r1c3 = np.average(X[2:-1, -1], weights=r1c3_w) + r2c2 = np.average(X[(0, 1, 3, 4, 5), 2], weights=r2c2_w) + r7c0 = np.average(X[2:7, 0], weights=r7c0_w) + + X_imputed = np.array([ + [0, 0, 0, r0c3], + [1, 1, 1, r1c3], + [2, 2, r2c2, 2], + [3, 3, 3, 3], + [4, 4, 4, 4], + [5, 5, 5, 5], + [6, 6, 6, 6], + [r7c0, 7, 7, 7] + ]) + + imputer_comp_wt = KNNImputer(weights="distance", use_complete=True) + assert_array_almost_equal(imputer_comp_wt.fit_transform(X), X_imputed) From 3e0820961638a72bc75c0cc1c831bc1f6c6df73f Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sat, 31 Mar 2018 
02:06:43 -0500 Subject: [PATCH 71/97] Error fixes with imports and examples --- sklearn/impute.py | 8 ++++---- sklearn/metrics/pairwise.py | 4 ++-- sklearn/preprocessing/__init__.py | 1 - sklearn/preprocessing/tests/test_imputation.py | 4 ---- 4 files changed, 6 insertions(+), 11 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 6860c940e3054..098cbff3e3678 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -620,10 +620,10 @@ class KNNImputer(BaseEstimator, TransformerMixin): >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]] >>> imputer = KNNImputer(n_neighbors=2, weights="uniform") >>> imputer.fit_transform(X) - array([[ 1. , 2. , 4. ], - [ 3. , 4. , 3. ], - [ 5.5, 6. , 5. ], - [ 8. , 8. , 7. ]]) + array([[1. , 2. , 4. ], + [3. , 4. , 3. ], + [5.5, 6. , 5. ], + [8. , 8. , 7. ]]) """ def __init__(self, missing_values="NaN", n_neighbors=5, diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 0c1601c441546..ded2bebc531dc 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -330,8 +330,8 @@ def masked_euclidean_distances(X, Y=None, squared=False, >>> X = [[0, 1], [1, nan]] >>> # distance between rows of X >>> masked_euclidean_distances(X, X) - array([[ 0. , 1.41421356], - [ 1.41421356, 0. ]]) + array([[0. , 1.41421356], + [1.41421356, 0. 
]]) >>> # get distance to origin >>> masked_euclidean_distances(X, [[0, 0]]) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index eec52e39ef8c4..ba0884613c124 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -40,7 +40,6 @@ 'FunctionTransformer', 'Imputer', 'KernelCenterer', - 'KNNImputer', 'LabelBinarizer', 'LabelEncoder', 'MultiLabelBinarizer', diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 94e2531bd0c97..0fd79de827b9a 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -6,15 +6,11 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_false from sklearn.utils.testing import ignore_warnings from sklearn.preprocessing.imputation import Imputer from sklearn.impute import KNNImputer -from sklearn.metrics.pairwise import masked_euclidean_distances -from sklearn.metrics.pairwise import pairwise_distances -from sklearn.neighbors import NearestNeighbors from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn import tree From 851ab3c792da5e5aa7c3b3a716d9473ca98aa139 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sat, 31 Mar 2018 03:15:30 -0500 Subject: [PATCH 72/97] Added use_complete docstring --- sklearn/impute.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/impute.py b/sklearn/impute.py index 098cbff3e3678..2a0a2e08a211e 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -589,6 +589,10 @@ class KNNImputer(BaseEstimator, TransformerMixin): The maximum percentage of rows (or samples) that can be missing for a given feature beyond which an error is raised. 
+ use_complete : boolean, optional (default = False) + When determining neighbors, only consider those samples that have + the feature of interest available (i.e., it is not NaN). + copy : boolean, optional (default = True) If True, a copy of X will be created. If False, imputation will be done in-place whenever possible. Note that, if metric is From 7a0647fdb8152a7be4b8f4b5b8005c22a8359454 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sat, 31 Mar 2018 03:53:13 -0500 Subject: [PATCH 73/97] Changed comments and fixed docstring --- sklearn/impute.py | 3 +-- sklearn/metrics/pairwise.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 2a0a2e08a211e..220d53555626c 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -702,7 +702,6 @@ def fit(self, X, y=None): metric_params={"missing_values": self.missing_values}) self._fitted_neighbors = neigh.fit(X) - # self.fitted_X_ = X self.statistics_ = X_col_means return self @@ -812,7 +811,7 @@ def _transform(self, X, adjusted_n_neighbors): metric=self.metric, squared=False) - # For every pattern, index receivers and potential donors + # For each column, find and impute missing for c in range(n_cols_X): if not np.any(mask[:, c], axis=0): continue diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index ded2bebc531dc..6f09ba9dd4d5d 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -335,8 +335,8 @@ def masked_euclidean_distances(X, Y=None, squared=False, >>> # get distance to origin >>> masked_euclidean_distances(X, [[0, 0]]) - array([[ 1. ], - [ 1.41421356]]) + array([[1. 
], + [1.41421356]]) References ---------- From b17906f0d85509948191f9b264ff10fced4a0509 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sat, 31 Mar 2018 04:28:28 -0500 Subject: [PATCH 74/97] Added more doctest fix and min neighbor check --- doc/modules/preprocessing.rst | 2 +- sklearn/impute.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 8cfbd0dd46090..6839763a9743f 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -611,7 +611,7 @@ encoded as ``np.nan``, using the mean feature value of the two nearest neighbors of the rows that contain the missing values:: >>> import numpy as np - >>> from sklearn.preprocessing.imputation import KNNImputer + >>> from sklearn.impute import KNNImputer >>> nan = np.nan >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]] >>> imputer = KNNImputer(max_neighbors=2, weights="uniform") diff --git a/sklearn/impute.py b/sklearn/impute.py index 220d53555626c..438c58cde3b07 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -819,6 +819,10 @@ def _transform(self, X, adjusted_n_neighbors): receivers_row_idx = np.where(mask[:, c])[0] pdonors_row_idx = np.where(~mask_fx[:, c])[0] + # Check if sufficient number of donors are available + if len(pdonors_row_idx) < self.n_neighbors: + raise ValueError("Insufficient number of neighbors.") + # Get distance from potential donors dist_pdonors = dist[receivers_row_idx][:, pdonors_row_idx] dist_pdonors = dist_pdonors.reshape(-1, From bd6eb6947c00667f06fcc408dadcbf32d5fc0c11 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sat, 31 Mar 2018 05:26:14 -0500 Subject: [PATCH 75/97] fix docs --- doc/modules/preprocessing.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 6839763a9743f..53f073db76d3d 100644 
--- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -614,12 +614,12 @@ neighbors of the rows that contain the missing values:: >>> from sklearn.impute import KNNImputer >>> nan = np.nan >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]] - >>> imputer = KNNImputer(max_neighbors=2, weights="uniform") + >>> imputer = KNNImputer(n_neighbors=2, weights="uniform") >>> imputer.fit_transform(X) - array([[ 1. , 2. , 4. ], - [ 3. , 4. , 3. ], - [ 5.5, 6. , 5. ], - [ 8. , 8. , 7. ]]) + array([[1. , 2. , 4. ], + [3. , 4. , 3. ], + [5.5, 6. , 5. ], + [8. , 8. , 7. ]]) :class:`KNNImputer` can also be used in a Pipeline as a way to build a From 2ea131b23a55cba900a9bdecd51cf15c9ee3ecad Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sat, 31 Mar 2018 08:57:21 -0500 Subject: [PATCH 76/97] Increase col_max_missing threshold for example plot --- examples/plot_missing_values.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index 77d209155d6ce..eb1683f5079cc 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -79,7 +79,8 @@ # Estimate the score after kNN-imputation of the missing values knn_estimator = Pipeline( - [("knnimputer", KNNImputer(missing_values=0, n_neighbors=10)), + [("knnimputer", KNNImputer(missing_values=0, n_neighbors=10, + col_max_missing=0.95)), ("forest", RandomForestRegressor(random_state=0, n_estimators=100))]) knn_score = cross_val_score(knn_estimator, X_missing, y_missing).mean() print("Score after knn-imputation of the missing values = %.2f" % knn_score) From b1d9397c2544778e60b1bbdac1f578b973fc6451 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sat, 31 Mar 2018 09:27:20 -0500 Subject: [PATCH 77/97] Lower missing rate in demo since tests are failing --- examples/plot_missing_values.py | 6 +++--- 1 file changed, 3 insertions(+), 3 
deletions(-) diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index eb1683f5079cc..d596453c02c56 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -49,8 +49,8 @@ score = cross_val_score(estimator, X_full, y_full).mean() print("Score with the entire dataset = %.2f" % score) -# Add missing values in 75% of the lines -missing_rate = 0.75 +# Add missing values in 50% of the lines +missing_rate = 0.50 n_missing_samples = int(np.floor(n_samples * missing_rate)) missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples, dtype=np.bool), @@ -80,7 +80,7 @@ # Estimate the score after kNN-imputation of the missing values knn_estimator = Pipeline( [("knnimputer", KNNImputer(missing_values=0, n_neighbors=10, - col_max_missing=0.95)), + col_max_missing=0.90)), ("forest", RandomForestRegressor(random_state=0, n_estimators=100))]) knn_score = cross_val_score(knn_estimator, X_missing, y_missing).mean() print("Score after knn-imputation of the missing values = %.2f" % knn_score) From d7cbdfb96c89d5a87afce1da0abf6b9f87a2586a Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sat, 31 Mar 2018 11:35:13 -0500 Subject: [PATCH 78/97] Remove redundant check and changes in plot --- examples/plot_missing_values.py | 2 +- sklearn/impute.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index d596453c02c56..14984e352ea9d 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -80,7 +80,7 @@ # Estimate the score after kNN-imputation of the missing values knn_estimator = Pipeline( [("knnimputer", KNNImputer(missing_values=0, n_neighbors=10, - col_max_missing=0.90)), + col_max_missing=0.99)), ("forest", RandomForestRegressor(random_state=0, n_estimators=100))]) knn_score = cross_val_score(knn_estimator, X_missing, y_missing).mean() print("Score after knn-imputation of 
the missing values = %.2f" % knn_score) diff --git a/sklearn/impute.py b/sklearn/impute.py index 438c58cde3b07..220d53555626c 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -819,10 +819,6 @@ def _transform(self, X, adjusted_n_neighbors): receivers_row_idx = np.where(mask[:, c])[0] pdonors_row_idx = np.where(~mask_fx[:, c])[0] - # Check if sufficient number of donors are available - if len(pdonors_row_idx) < self.n_neighbors: - raise ValueError("Insufficient number of neighbors.") - # Get distance from potential donors dist_pdonors = dist[receivers_row_idx][:, pdonors_row_idx] dist_pdonors = dist_pdonors.reshape(-1, From 1c9d858f47b596e00905a9f922606a44d1978fe9 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sat, 31 Mar 2018 12:41:13 -0500 Subject: [PATCH 79/97] Handling insufficient neighbors scenario --- sklearn/impute.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/impute.py b/sklearn/impute.py index 220d53555626c..af2937c91af8e 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -819,6 +819,13 @@ def _transform(self, X, adjusted_n_neighbors): receivers_row_idx = np.where(mask[:, c])[0] pdonors_row_idx = np.where(~mask_fx[:, c])[0] + # Impute column mean if n_neighbors are not available + if len(pdonors_row_idx) < self.n_neighbors: + warnings.warn("Insufficient number of neighbors! 
" + "Filling in column mean.") + X[receivers_row_idx, c] = self.statistics_[c] + continue + # Get distance from potential donors dist_pdonors = dist[receivers_row_idx][:, pdonors_row_idx] dist_pdonors = dist_pdonors.reshape(-1, From 01722f128b83bccfc3a8508d210dd5b59f205069 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sat, 7 Apr 2018 15:24:00 -0500 Subject: [PATCH 80/97] Removed k actual neighbors algo --- sklearn/impute.py | 413 ++++++----------------------------- sklearn/tests/test_impute.py | 139 ++++++------ 2 files changed, 131 insertions(+), 421 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index af2937c91af8e..d1daf8ed8d24e 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -119,158 +119,6 @@ def _neighbors(X, donor_X=None, metric="masked_euclidean", n_jobs=1, return neigh_ind -# Code for function _unique1d taken directly from Numpy -def _unique1d(ar, return_index=False, return_inverse=False, - return_counts=False): - """ - Find the unique elements of an array, ignoring shape. 
- """ - ar = np.asanyarray(ar).flatten() - - optional_indices = return_index or return_inverse - optional_returns = optional_indices or return_counts - - if ar.size == 0: - if not optional_returns: - ret = ar - else: - ret = (ar,) - if return_index: - ret += (np.empty(0, np.bool),) - if return_inverse: - ret += (np.empty(0, np.bool),) - if return_counts: - ret += (np.empty(0, np.intp),) - return ret - - if optional_indices: - perm = ar.argsort(kind='mergesort' if return_index else 'quicksort') - aux = ar[perm] - else: - ar.sort() - aux = ar - flag = np.concatenate(([True], aux[1:] != aux[:-1])) - - if not optional_returns: - ret = aux[flag] - else: - ret = (aux[flag],) - if return_index: - ret += (perm[flag],) - if return_inverse: - iflag = np.cumsum(flag) - 1 - inv_idx = np.empty(ar.shape, dtype=np.intp) - inv_idx[perm] = iflag - ret += (inv_idx,) - if return_counts: - idx = np.concatenate(np.nonzero(flag) + ([ar.size],)) - ret += (np.diff(idx),) - return ret - - -# Code for function _unique taken directly from Numpy -def _unique(ar, return_index=False, return_inverse=False, - return_counts=False, axis=None): - """ - Find the unique elements of an array. - - Returns the sorted unique elements of an array. There are three optional - outputs in addition to the unique elements: the indices of the input array - that give the unique values, the indices of the unique array that - reconstruct the input array, and the number of times each unique value - comes up in the input array. - - Parameters - ---------- - ar : array_like - Input array. Unless `axis` is specified, this will be flattened if it - is not already 1-D. - return_index : bool, optional - If True, also return the indices of `ar` (along the specified axis, - if provided, or in the flattened array) that result in the unique - array. - return_inverse : bool, optional - If True, also return the indices of the unique array (for the specified - axis, if provided) that can be used to reconstruct `ar`. 
- return_counts : bool, optional - If True, also return the number of times each unique item appears - in `ar`. - .. versionadded:: 1.9.0 - axis : int or None, optional - The axis to operate on. If None, `ar` will be flattened beforehand. - Otherwise, duplicate items will be removed along the provided axis, - with all the other axes belonging to the each of the unique elements. - Object arrays or structured arrays that contain objects are not - supported if the `axis` kwarg is used. - .. versionadded:: 1.13.0 - - - - Returns - ------- - unique : ndarray - The sorted unique values. - unique_indices : ndarray, optional - The indices of the first occurrences of the unique values in the - original array. Only provided if `return_index` is True. - unique_inverse : ndarray, optional - The indices to reconstruct the original array from the - unique array. Only provided if `return_inverse` is True. - unique_counts : ndarray, optional - The number of times each of the unique values comes up in the - original array. Only provided if `return_counts` is True. - .. versionadded:: 1.9.0 - - See Also - -------- - numpy.lib.arraysetops : Module with a number of other functions for - performing set operations on arrays. - - """ - ar = np.asanyarray(ar) - if axis is None: - return _unique1d(ar, return_index, return_inverse, return_counts) - if not (-ar.ndim <= axis < ar.ndim): - raise ValueError('Invalid axis kwarg specified for unique') - - ar = np.swapaxes(ar, axis, 0) - orig_shape, orig_dtype = ar.shape, ar.dtype - # Must reshape to a contiguous 2D array for this to work... - ar = ar.reshape(orig_shape[0], -1) - ar = np.ascontiguousarray(ar) - - if ar.dtype.char in (np.typecodes['AllInteger'] + - np.typecodes['Datetime'] + 'S'): - # Optimization: Creating a view of your data with a np.void data type - # of size the number of bytes in a full row. Handles any type where - # items have a unique binary representation, i.e. 0 is only 0, - # not +0 and -0. 
- dtype = np.dtype((np.void, ar.dtype.itemsize * ar.shape[1])) - else: - dtype = [('f{i}'.format(i=i), ar.dtype) for i in range(ar.shape[1])] - - try: - consolidated = ar.view(dtype) - except TypeError: - # There's no good way to do this for object arrays, etc... - msg = 'The axis argument to unique is not supported for dtype {dt}' - raise TypeError(msg.format(dt=ar.dtype)) - - def reshape_uniq(uniq): - uniq = uniq.view(orig_dtype) - uniq = uniq.reshape(-1, *orig_shape[1:]) - uniq = np.swapaxes(uniq, 0, axis) - return uniq - - output = _unique1d(consolidated, return_index, - return_inverse, return_counts) - if not (return_index or return_inverse or return_counts): - return reshape_uniq(output) - else: - uniq = reshape_uniq(output[0]) - return (uniq,) + output[1:] - - class SimpleImputer(BaseEstimator, TransformerMixin): """Imputation transformer for completing missing values. @@ -536,11 +384,11 @@ def transform(self, X): class KNNImputer(BaseEstimator, TransformerMixin): """Imputation for completing missing values using k-Nearest Neighbors. - Each sample's missing values are imputed from up to ``n_neighbors`` + Each sample's missing values are imputed using values from ``n_neighbors`` nearest neighbors found in the training set. Each missing feature is then - imputed as the average, either weighted or unweighted, of these neighbors - who have a value for it. Where all neighbors have that feature value - missing, the training set average for that feature is used for imputation. + imputed as the average, either weighted or unweighted, of these neighbors. + Where the number of donor neighbors is less than `n_neighbors', the + training set average for that feature is used for imputation. Parameters ---------- @@ -550,12 +398,7 @@ class KNNImputer(BaseEstimator, TransformerMixin): ``np.nan``, use the string value "NaN". n_neighbors : int, optional (default = 5) - Maximum number of neighboring samples to use for imputation. 
When any - of the neighbors themselves have the feature value missing then the - remaining neighbors, if any, that have the feature value available are - used. But if none of the neighbors have the value available, the global - feature mean (i.e., by default, the column mean) is used for - imputation. + Number of neighboring samples to use for imputation. weights : str or callable, optional (default = "uniform") Weight function used in prediction. Possible values: @@ -589,10 +432,6 @@ class KNNImputer(BaseEstimator, TransformerMixin): The maximum percentage of rows (or samples) that can be missing for a given feature beyond which an error is raised. - use_complete : boolean, optional (default = False) - When determining neighbors, only consider those samples that have - the feature of interest available (i.e., it is not NaN). - copy : boolean, optional (default = True) If True, a copy of X will be created. If False, imputation will be done in-place whenever possible. Note that, if metric is @@ -632,8 +471,7 @@ class KNNImputer(BaseEstimator, TransformerMixin): def __init__(self, missing_values="NaN", n_neighbors=5, weights="uniform", metric="masked_euclidean", - row_max_missing=0.5, col_max_missing=0.8, - use_complete=False, copy=True): + row_max_missing=0.5, col_max_missing=0.8, copy=True): self.missing_values = missing_values self.n_neighbors = n_neighbors @@ -641,7 +479,7 @@ def __init__(self, missing_values="NaN", n_neighbors=5, self.metric = metric self.row_max_missing = row_max_missing self.col_max_missing = col_max_missing - self.use_complete = use_complete + # self.use_complete = use_complete self.copy = copy def fit(self, X, y=None): @@ -706,44 +544,7 @@ def fit(self, X, y=None): return self - def _get_weight_matrix(self, fitted_X, mask, adjusted_n_neighbors, - receiver_row_index, row_repeats, - knn_row_index, knn_distances): - """Get the weight matrix for the donors""" - - # Import(s) here to avoid circular import - from .neighbors.base import 
_get_weights - - # If different X in transform, get a new mask - if self.n_neighbors == adjusted_n_neighbors: - nbors_mask = _get_mask(fitted_X[knn_row_index], - value_to_mask=self.missing_values) - else: - nbors_mask = mask[knn_row_index] - - # Anti-mask tells us what is NOT missing - nbors_anti_mask = ~nbors_mask - receiver_anti_mask = ~mask[receiver_row_index] - - # Sum anti-masks to see if both donor & receiver are missing - # A zero value indicates that a feature is missing in both - # Sum over all cols to locate degenerate donors - anti_masks_combined = receiver_anti_mask + nbors_anti_mask - anti_masks_combined = anti_masks_combined.sum(axis=-1) - degenerate_nbors = anti_masks_combined < mask.shape[1] - knn_distances[degenerate_nbors] = np.inf - - # Retreive and, if applicable, transform weight matrix - weight_matrix = _get_weights(knn_distances, self.weights) - if weight_matrix is not None: - weight_matrix = weight_matrix[:, np.newaxis, :] - weight_matrix = np.repeat(weight_matrix, - row_repeats, axis=0).ravel() - weight_matrix = weight_matrix.reshape( - (-1, adjusted_n_neighbors)) - return weight_matrix - - def _transform(self, X, adjusted_n_neighbors): + def transform(self, X): """Impute all missing values in X. Parameters @@ -751,15 +552,15 @@ def _transform(self, X, adjusted_n_neighbors): X : {array-like}, shape = [n_samples, n_features] The input data to complete. - adjusted_n_neighbors : int - Depending on the calling method, the default value must - either be equal to n_neighbors or n_neighbors + 1. - If the calling method is transform(), then its value needs to be - equal to n_neighbors and if calling method is fit_transform() - then its value must be equal to n_neighbors + 1. + Returns + ------- + X : {array-like}, shape = [n_samples, n_features] + The imputed dataset. 
""" - # Import here to avoud circular import + + # Import here to avoid circular import from .neighbors.base import _get_weights + check_is_fitted(self, 'statistics_') force_all_finite = False if self.missing_values in ["NaN", np.nan] else True @@ -798,119 +599,60 @@ def _transform(self, X, adjusted_n_neighbors): row_has_missing = row_total_missing.astype(np.bool) if np.any(row_has_missing): - if self.use_complete: - # Initializations - - # Mask for fitted_X - mask_fx = _get_mask(fitted_X, np.nan) - - # Get row index of missing and distance from donors - n_adj_samples, _ = X[row_has_missing].shape - dist = pairwise_distances(X, - fitted_X, - metric=self.metric, - squared=False) - - # For each column, find and impute missing - for c in range(n_cols_X): - if not np.any(mask[:, c], axis=0): - continue - # Row index for receivers and potential donors (pdonors) - receivers_row_idx = np.where(mask[:, c])[0] - pdonors_row_idx = np.where(~mask_fx[:, c])[0] - - # Impute column mean if n_neighbors are not available - if len(pdonors_row_idx) < self.n_neighbors: - warnings.warn("Insufficient number of neighbors! 
" - "Filling in column mean.") - X[receivers_row_idx, c] = self.statistics_[c] - continue - - # Get distance from potential donors - dist_pdonors = dist[receivers_row_idx][:, pdonors_row_idx] - dist_pdonors = dist_pdonors.reshape(-1, - len(pdonors_row_idx)) - pdonors_idx = np.argpartition( - dist_pdonors, self.n_neighbors - 1, axis=1) - - # Get final donors row index from pdonors - donors_idx = pdonors_idx[:, :self.n_neighbors] - # Get weights or None - dist_pdonors_rows = np.arange(len(donors_idx))[:, None] - weight_matrix = _get_weights( - dist_pdonors[ - dist_pdonors_rows, donors_idx], self.weights) - donor_row_idx_ravel = donors_idx.ravel() - - # Retrieve donor cells and calculate kNN score - fitted_X_temp = fitted_X[pdonors_row_idx] - donors = fitted_X_temp[donor_row_idx_ravel, c].reshape( - (-1, self.n_neighbors)) - donors_mask = _get_mask(donors, self.missing_values) - donors = np.ma.array(donors, mask=donors_mask) - - # Final imputation - imputed = np.ma.average(donors, axis=1, - weights=weight_matrix) - X[receivers_row_idx, c] = imputed.data - else: - # Row index of receivers & identify potential donors - receiver_row_index = np.where( - row_has_missing)[0].reshape((-1, 1)) - neighbors = self._fitted_neighbors.kneighbors( - X[row_has_missing, :], n_neighbors=adjusted_n_neighbors) - - # Get row index, distance, and weights of donors - knn_distances, knn_row_index = neighbors - row_repeats = row_total_missing[row_total_missing != 0] - - # Weighting: Set self and degenerate donor(s) distance to inf - weight_matrix = None - if self.weights in ["distance"] or callable(self.weights): - weight_matrix = self._get_weight_matrix( - fitted_X, - mask, - adjusted_n_neighbors, - receiver_row_index, - row_repeats, - knn_row_index, - knn_distances - ) - - # Repeat each set donor indices by - # missing count in the corresponding recipient row - knn_row_index_repeat = np.repeat( - knn_row_index, row_repeats, axis=0).ravel() - - # Get repeated column index of donors - 
receiver_row_missing_index, receiver_col_missing_index = \ - np.where(mask) - knn_col_index_repeat = np.repeat(receiver_col_missing_index, - adjusted_n_neighbors) + + # Mask for fitted_X + mask_fx = _get_mask(fitted_X, self.missing_values) + + # Get row index of missing and distance from donors + n_adj_samples, _ = X[row_has_missing].shape + dist = pairwise_distances(X, + fitted_X, + metric=self.metric, + squared=False, + missing_values=self.missing_values) + + # For each column, find and impute missing + for c in range(n_cols_X): + if not np.any(mask[:, c], axis=0): + continue + # Row index for receivers and potential donors (pdonors) + receivers_row_idx = np.where(mask[:, c])[0] + pdonors_row_idx = np.where(~mask_fx[:, c])[0] + + # Impute column mean if n_neighbors are not available + if len(pdonors_row_idx) < self.n_neighbors: + warnings.warn("Insufficient number of neighbors! " + "Filling in column mean.") + X[receivers_row_idx, c] = self.statistics_[c] + continue + + # Get distance from potential donors + dist_pdonors = dist[receivers_row_idx][:, pdonors_row_idx] + dist_pdonors = dist_pdonors.reshape(-1, + len(pdonors_row_idx)) + pdonors_idx = np.argpartition( + dist_pdonors, self.n_neighbors - 1, axis=1) + + # Get final donors row index from pdonors + donors_idx = pdonors_idx[:, :self.n_neighbors] + # Get weights or None + dist_pdonors_rows = np.arange(len(donors_idx))[:, None] + weight_matrix = _get_weights( + dist_pdonors[ + dist_pdonors_rows, donors_idx], self.weights) + donor_row_idx_ravel = donors_idx.ravel() # Retrieve donor cells and calculate kNN score - donors = fitted_X[ - knn_row_index_repeat, knn_col_index_repeat].reshape( - (-1, adjusted_n_neighbors)) + fitted_X_temp = fitted_X[pdonors_row_idx] + donors = fitted_X_temp[donor_row_idx_ravel, c].reshape( + (-1, self.n_neighbors)) donors_mask = _get_mask(donors, self.missing_values) donors = np.ma.array(donors, mask=donors_mask) - # Warning if donor count < n_neighbors - if 
np.any(donors_mask.sum(axis=1) < self.n_neighbors): - warnings.warn("One or more donor(s) have the relevant " - "feature value missing.") - # Final imputation - imputed = np.ma.average(donors, axis=1, weights=weight_matrix) - X[mask] = imputed.data - unimputed_index = np.where(donors_mask.all(axis=1)) - if len(unimputed_index[0]) > 0: - unimputed_rows = receiver_row_missing_index[ - unimputed_index] - unimputed_cols = receiver_col_missing_index[ - unimputed_index] - X[unimputed_rows, unimputed_cols] = np.take( - self.statistics_, unimputed_cols) + imputed = np.ma.average(donors, axis=1, + weights=weight_matrix) + X[receivers_row_idx, c] = imputed.data # Merge bad rows to X and mean impute their missing values if np.any(bad_rows): @@ -926,9 +668,6 @@ def _transform(self, X, adjusted_n_neighbors): def fit_transform(self, X, y=None, **fit_params): """Fit KNNImputer and impute all missing values in X. - This method should *only* be used if the data to be fitted is the - same as the data to be transformed. - Parameters ---------- X : {array-like}, shape (n_samples, n_features) @@ -940,28 +679,4 @@ def fit_transform(self, X, y=None, **fit_params): X : {array-like}, shape (n_samples, n_features) Returns imputed dataset. """ - return self.fit(X)._transform( - X, adjusted_n_neighbors=self.n_neighbors + 1) - - def transform(self, X): - """Impute all missing values in X. - - This method should *only* be used if the data to be fitted is different - from the data to be transformed. - - WARNING: If the same dataset is passed in fit() and transform(), - one of the returned "neighbors" maybe the sample itself. If you will be - passing the same dataset, use fit_transform() to avoid this behavior. - - Parameters - ---------- - X : {array-like}, shape = [n_samples, n_features] - The input data to complete. - - Returns - ------- - X : {array-like}, shape (n_samples, n_features) - Returns imputed dataset. 
- """ - check_is_fitted(self, 'statistics_') - return self._transform(X, adjusted_n_neighbors=self.n_neighbors) + return self.fit(X).transform(X) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 722277a82c50d..dc17842e08e7f 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -289,9 +289,6 @@ def test_knn_imputation_zero(): imputer = KNNImputer(missing_values=missing_values, n_neighbors=n_neighbors, weights="uniform") - imputer_nan = KNNImputer(missing_values="NaN", - n_neighbors=n_neighbors, - weights="uniform") # Test with missing_values=0 when NaN present X = np.array([ @@ -314,33 +311,42 @@ def test_knn_imputation_zero(): imputer.col_max_missing * 100) assert_raise_message(ValueError, msg, imputer.fit, X) + +def test_knn_imputation_zero_p2(): # Test with an imputable matrix and also compare with missing_values="NaN" - X = np.array([ - [1, 0, 1, 0, 1.], - [2, 1, 2, 2, 3], - [3, 2, 3, 0, 0], - [6, 6, 0, 5, 17], + X_zero = np.array([ + [1, 0, 1, 1, 1.], + [2, 2, 2, 2, 2], + [3, 3, 3, 3, 0], + [6, 6, 0, 6, 6], ]) X_nan = np.array([ - [1, np.nan, 1, np.nan, 1.], - [2, 1, 2, 2, 3], - [3, 2, 3, np.nan, np.nan], - [6, 6, np.nan, 5, 17], + [1, np.nan, 1, 1, 1.], + [2, 2, 2, 2, 2], + [3, 3, 3, 3, np.nan], + [6, 6, np.nan, 6, 6], ]) statistics_mean = np.nanmean(X_nan, axis=0) X_imputed = np.array([ - [1, 1.5, 1, 2, 1.], - [2, 1, 2, 2, 3], - [3, 2, 3, 2, 2], - [6, 6, 2.5, 5, 17], + [1, 2.5, 1, 1, 1.], + [2, 2, 2, 2, 2], + [3, 3, 3, 3, 1.5], + [6, 6, 2.5, 6, 6], ]) - assert_array_equal(imputer.fit_transform(X), X_imputed) - assert_array_equal(imputer.statistics_, statistics_mean) - assert_array_equal(imputer.fit_transform(X), imputer_nan.fit_transform( - X_nan)) + imputer_zero = KNNImputer(missing_values=0, n_neighbors=2, + weights="uniform") + + imputer_nan = KNNImputer(missing_values="NaN", + n_neighbors=2, + weights="uniform") + + assert_array_equal(imputer_zero.fit_transform(X_zero), X_imputed) + 
assert_array_equal(imputer_zero.statistics_, statistics_mean) + assert_array_equal(imputer_zero.fit_transform(X_zero), + imputer_nan.fit_transform(X_nan)) def test_knn_imputation_default(): @@ -360,8 +366,8 @@ def test_knn_imputation_default(): X_imputed = np.array([ [1, 0, 0, 1], - [2, 1, 2, 5.25], - [3, 2, 3, 5.25], + [2, 1, 2, 8], + [3, 2, 3, 8], [4, 4, 5, 5], [6, 3, 6, 7], [8, 8, 8, 8], @@ -386,8 +392,8 @@ def test_knn_imputation_default(): X_imputed = np.array([ [1, 0, 0, 1], - [2, 1, 2, 5.25], - [3, 2, 3, 5.25], + [2, 1, 2, 8], + [3, 2, 3, 8], [4, 4, 5, 5], [6, 3, 6, 7], [8, 8, 8, 8], @@ -520,12 +526,12 @@ def test_default_with_invalid_input(): def test_knn_n_neighbors(): X = np.array([ - [0, 0], - [np.nan, 2], - [4, 3], - [5, np.nan], - [7, 7], - [np.nan, 8], + [0, 0], + [np.nan, 2], + [4, 3], + [5, np.nan], + [7, 7], + [np.nan, 8], [14, 13] ]) statistics_mean = np.nanmean(X, axis=0) @@ -543,12 +549,9 @@ def test_knn_n_neighbors(): n_neighbors = 1 imputer = KNNImputer(n_neighbors=n_neighbors) - imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1) assert_array_equal(imputer.fit_transform(X), X_imputed_1NN) assert_array_equal(imputer.statistics_, statistics_mean) - assert_array_equal(imputer.fit_transform(X), imputer_plus1.fit( - X).transform(X)) # Test with 6 neighbors X = np.array([ @@ -616,13 +619,13 @@ def no_weight(dist=None): def test_weight_distance(): X = np.array([ - [0, 0], + [0, 0], [np.nan, 2], - [4, 3], - [5, 6], - [7, 7], - [9, 8], - [11, 10] + [4, 3], + [5, 6], + [7, 7], + [9, 8], + [11, 10] ]) # Test with "distance" weight @@ -687,51 +690,43 @@ def test_weight_distance(): # Test with varying missingness patterns X = np.array([ - [1, 0, 0, 1], - [0, np.nan, 1, np.nan], - [1, 1, 1, np.nan], - [0, 1, 0, 0], - [0, np.nan, 1, 0], - [1, 1, 1, 1], - [10, 10, 10, 10], + [1, 0, 0, 1], + [0, np.nan, 1, np.nan], + [1, 1, 1, np.nan], + [0, 1, 0, 0], + [0, 0, 0, 0], + [1, 0, 1, 1], + [10, 10, 10, 10], ]) statistics_mean = np.nanmean(X, axis=0) # Get 
weights of donor neighbors dist = masked_euclidean_distances(X) - row1_nbor_dists = dist[1, :6] - row1_nbor_dists[np.array([1, 2, 4])] = np.inf # Degenerate neighbors - row1_nbor_wt = 1/row1_nbor_dists - - row2_nbor_dists = dist[2, :6] - row2_nbor_dists[np.array([1, 2])] = np.inf # Degenerate neighbors - row2_nbor_wt = 1/row2_nbor_dists - # A non-degenerate donor has zero distance so it's weight is 1 and - # others have weight 0 - row2_nbor_wt[~np.isinf(row2_nbor_wt)] = 0 - row2_nbor_wt[np.isinf(row2_nbor_wt)] = 1 - - row4_nbor_dists = dist[4, :6] - row4_nbor_dists[np.array([1, 4])] = np.inf # Degenerate neighbors - row4_nbor_wt = 1/row4_nbor_dists + r1c1_nbor_dists = dist[1, [0, 2, 3, 4, 5]] + r1c3_nbor_dists = dist[1, [0, 3, 4, 5, 6]] + r1c1_nbor_wt = (1/r1c1_nbor_dists) + r1c3_nbor_wt = (1 / r1c3_nbor_dists) + + r2c3_nbor_dists = dist[2, [0, 3, 4, 5, 6]] + r2c3_nbor_wt = 1/r2c3_nbor_dists # Collect donor values - col1_donor_values = np.ma.masked_invalid(X[:6, 1].copy()) - col3_donor_values = np.ma.masked_invalid(X[:6, 3].copy()) + col1_donor_values = np.ma.masked_invalid(X[[0, 2, 3, 4, 5], 1]).copy() + col3_donor_values = np.ma.masked_invalid(X[[0, 3, 4, 5, 6], 3]).copy() # Final imputed values - r1c1_imp = np.ma.average(col1_donor_values, weights=row1_nbor_wt) - r1c3_imp = np.ma.average(col3_donor_values, weights=row1_nbor_wt) - r2c3_imp = np.ma.average(col3_donor_values, weights=row2_nbor_wt) - r4c1_imp = np.ma.average(col1_donor_values, weights=row4_nbor_wt) + r1c1_imp = np.ma.average(col1_donor_values, weights=r1c1_nbor_wt) + r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt) + r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt) + print(r1c1_imp, r1c3_imp, r2c3_imp) X_imputed = np.array([ [1, 0, 0, 1], [0, r1c1_imp, 1, r1c3_imp], [1, 1, 1, r2c3_imp], [0, 1, 0, 0], - [0, r4c1_imp, 1, 0], - [1, 1, 1, 1], + [0, 0, 0, 0], + [1, 0, 1, 1], [10, 10, 10, 10], ]) @@ -759,7 +754,7 @@ def test_metric_type(): def test_callable_metric(): # 
Define callable metric that returns the l1 norm: - def custom_callable(x, y, missing_values="NaN"): + def custom_callable(x, y, missing_values="NaN", squared=False): x = np.ma.array(x, mask=np.isnan(x)) y = np.ma.array(y, mask=np.isnan(y)) dist = np.nansum(np.abs(x-y)) @@ -814,7 +809,7 @@ def test_complete_features(): [r7c0, 7, 7, 7] ]) - imputer_comp = KNNImputer(use_complete=True) + imputer_comp = KNNImputer() assert_array_almost_equal(imputer_comp.fit_transform(X), X_imputed) @@ -859,5 +854,5 @@ def test_complete_features_weighted(): [r7c0, 7, 7, 7] ]) - imputer_comp_wt = KNNImputer(weights="distance", use_complete=True) + imputer_comp_wt = KNNImputer(weights="distance") assert_array_almost_equal(imputer_comp_wt.fit_transform(X), X_imputed) From 36d1d723fffdc6e57529fa32e5ce59df834f1f7e Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sat, 21 Apr 2018 22:13:17 -0500 Subject: [PATCH 81/97] Addressed Comments --- doc/modules/impute.rst | 41 +++++++ doc/modules/preprocessing.rst | 34 ------ sklearn/impute.py | 210 ++++++++++++++-------------------- 3 files changed, 126 insertions(+), 159 deletions(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index e806cc2fd5b4a..e3f5f947b4fed 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -54,3 +54,44 @@ values than observed values. :class:`SimpleImputer` can be used in a Pipeline as a way to build a composite estimator that supports imputation. See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. + +The :class:`KNNImputer` class provides imputation for completing missing +values using the k-Nearest Neighbors approach. Each sample's missing values +are imputed using values from ``n_neighbors`` nearest neighbors found in the +training set. Note that if a sample has more than one feature missing, then +the sample can potentially have multiple sets of ``n_neighbors`` +donors depending on the particular feature being imputed. 
+Each missing feature is then imputed as the average, either weighted or +unweighted, of these neighbors. Where the number of donor neighbors is less +than ``n_neighbors``, the training set average for that feature is used +for imputation. The total number of samples in the training set is, of course, +always greater than or equal to the number of nearest neighbors available for +imputation, depending on both the overall sample size as well as the number of +samples excluded from nearest neighbor calculation because of too many missing +features (as controlled by ``row_max_missing``). +For more information on the methodology, see ref. [#]_. + +The following snippet demonstrates how to replace missing values, +encoded as ``np.nan``, using the mean feature value of the two nearest +neighbors of the rows that contain the missing values:: + + >>> import numpy as np + >>> from sklearn.impute import KNNImputer + >>> nan = np.nan + >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]] + >>> imputer = KNNImputer(n_neighbors=2, weights="uniform") + >>> imputer.fit_transform(X) + array([[1. , 2. , 4. ], + [3. , 4. , 3. ], + [5.5, 6. , 5. ], + [8. , 8. , 7. ]]) + + +:class:`KNNImputer` can also be used in a Pipeline as a way to build a +composite estimator that supports imputation. +See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. + +.. [#] Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor + Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing value + estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17 no. 6, 2001 + Pages 520-525. \ No newline at end of file diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 53f073db76d3d..c274c7a2aa6e6 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -596,40 +596,6 @@ Imputation of missing values Tools for imputing missing values are discussed at :ref:`impute`. 
-The :class:`KNNImputer` class provides imputation for completing missing -values using the k-Nearest Neighbors approach. Each sample's missing values -are imputed from up to ``max_neighbors`` nearest neighbors found in the -training set. Each missing feature is then imputed as the average, either -weighted or unweighted, of the neighbors who have a value for it. -When any of the neighbors themselves have the feature value missing then -the remaining neighbors are used. Where all neighbors have that feature value -missing, the training set average for that feature is used. -For more information on the methodology, see ref. [#]_. - -The following snippet demonstrates how to replace missing values, -encoded as ``np.nan``, using the mean feature value of the two nearest -neighbors of the rows that contain the missing values:: - - >>> import numpy as np - >>> from sklearn.impute import KNNImputer - >>> nan = np.nan - >>> X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]] - >>> imputer = KNNImputer(n_neighbors=2, weights="uniform") - >>> imputer.fit_transform(X) - array([[1. , 2. , 4. ], - [3. , 4. , 3. ], - [5.5, 6. , 5. ], - [8. , 8. , 7. ]]) - - -:class:`KNNImputer` can also be used in a Pipeline as a way to build a -composite estimator that supports imputation. -See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. - -.. [#] Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor - Hastie, Robert Tibshirani, David Botstein and Russ B. Altman, Missing value - estimation methods for DNA microarrays, BIOINFORMATICS Vol. 17 no. 6, 2001 - Pages 520-525. .. 
_polynomial_features: diff --git a/sklearn/impute.py b/sklearn/impute.py index d1daf8ed8d24e..88b5f39147ee4 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -17,6 +17,9 @@ from .utils.validation import FLOAT_DTYPES from .metrics import pairwise_distances +from .neighbors.base import _check_weights +from .neighbors.base import _get_weights + from .externals import six zip = six.moves.zip @@ -64,61 +67,6 @@ def _most_frequent(array, extra_value, n_repeat): return extra_value -# Skeletal version of KNeighborsMixin.kneighbors() -def _neighbors(X, donor_X=None, metric="masked_euclidean", n_jobs=1, - **metric_params): - """Finds the unsorted K-neighbors of a point. - - Returns unsorted indices of and distances to the neighbors of each point. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - The samples whose neighbors are to be evaluated - - donor_X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' - The query point or points. - If not provided, neighbors of each indexed point are returned. - In this case, the query point is not considered its own neighbor. - - metric : str or callable, optional (default = "masked_euclidean") - Distance metric for searching neighbors. Possible values: - - 'masked_euclidean' - - [callable] : a user-defined function which conforms to the - definition of _pairwise_callable(X, Y, metric, **kwds). In other - words, the function accepts two arrays, X and Y, and a - ``missing_values`` keyword in **kwds and returns a scalar distance - value. - - n_jobs : int, optional (default = 1) - The number of parallel jobs to run for neighbors search. - If ``-1``, then the number of jobs is set to the number of CPU cores. - - Returns - ------- - dist : array - Array representing the unsorted lengths to points, only present if - return_distance=True - - ind : array - Indices of the unsorted nearest points in the population matrix. 
- - """ - n_samples, _ = X.shape - sample_range = np.arange(n_samples)[:, None] - - n_jobs = _get_n_jobs(n_jobs) - dist = pairwise_distances(X, donor_X, - metric=metric, - n_jobs=n_jobs, - squared=True, - **metric_params) - neigh_ind = np.argsort(dist[sample_range, :]) - - return neigh_ind - - class SimpleImputer(BaseEstimator, TransformerMixin): """Imputation transformer for completing missing values. @@ -387,8 +335,11 @@ class KNNImputer(BaseEstimator, TransformerMixin): Each sample's missing values are imputed using values from ``n_neighbors`` nearest neighbors found in the training set. Each missing feature is then imputed as the average, either weighted or unweighted, of these neighbors. - Where the number of donor neighbors is less than `n_neighbors', the - training set average for that feature is used for imputation. + Note that if a sample has more than one feature missing, then the + neighbors for that sample can be different depending on the particular + feature being imputed. Finally, where the number of donor neighbors is + less than ``n_neighbors``, the training set average for that feature is + used during imputation. Parameters ---------- @@ -422,15 +373,15 @@ class KNNImputer(BaseEstimator, TransformerMixin): value. row_max_missing : float, optional (default = 0.5) - The maximum percentage of columns (i.e. features) that can be missing + The maximum fraction of columns (i.e. features) that can be missing before the sample is excluded from nearest neighbor imputation. It means that such rows will not be considered a potential donor in ``fit()``, and in ``transform()`` their missing feature values will be imputed to be the column mean for the entire dataset. col_max_missing : float, optional (default = 0.8) - The maximum percentage of rows (or samples) that can be missing - for a given feature beyond which an error is raised. + The maximum fraction of rows (or samples) that can be missing + for any feature beyond which an error is raised. 
copy : boolean, optional (default = True) If True, a copy of X will be created. If False, imputation will @@ -479,9 +430,58 @@ def __init__(self, missing_values="NaN", n_neighbors=5, self.metric = metric self.row_max_missing = row_max_missing self.col_max_missing = col_max_missing - # self.use_complete = use_complete self.copy = copy + def _impute(self, dist, X, fitted_X, mask, mask_fx): + """Helper function to find and impute missing values""" + + # For each column, find and impute + n_rows_X, n_cols_X = X.shape + for c in range(n_cols_X): + if not np.any(mask[:, c], axis=0): + continue + + # Row index for receivers and potential donors (pdonors) + receivers_row_idx = np.where(mask[:, c])[0] + pdonors_row_idx = np.where(~mask_fx[:, c])[0] + + # Impute using column mean if n_neighbors are not available + if len(pdonors_row_idx) < self.n_neighbors: + warnings.warn("Insufficient number of neighbors! " + "Filling in column mean.") + X[receivers_row_idx, c] = self.statistics_[c] + continue + + # Get distance from potential donors + dist_pdonors = dist[receivers_row_idx][:, pdonors_row_idx] + dist_pdonors = dist_pdonors.reshape(-1, + len(pdonors_row_idx)) + # Argpartition to seperate actual donors from the rest + pdonors_idx = np.argpartition( + dist_pdonors, self.n_neighbors - 1, axis=1) + + # Get final donors row index from pdonors + donors_idx = pdonors_idx[:, :self.n_neighbors] + # Get weights or None + dist_pdonors_rows = np.arange(len(donors_idx))[:, None] + weight_matrix = _get_weights( + dist_pdonors[ + dist_pdonors_rows, donors_idx], self.weights) + donor_row_idx_ravel = donors_idx.ravel() + + # Retrieve donor cells and calculate kNN score + fitted_X_temp = fitted_X[pdonors_row_idx] + donors = fitted_X_temp[donor_row_idx_ravel, c].reshape( + (-1, self.n_neighbors)) + donors_mask = _get_mask(donors, self.missing_values) + donors = np.ma.array(donors, mask=donors_mask) + + # Final imputation + imputed = np.ma.average(donors, axis=1, + weights=weight_matrix) + 
X[receivers_row_idx, c] = imputed.data + return X + def fit(self, X, y=None): """Fit the imputer on X. @@ -496,9 +496,6 @@ def fit(self, X, y=None): self : object Returns self. """ - # Imports here to avoid circular import - from .neighbors import NearestNeighbors - from .neighbors.base import _check_weights # Check parameters force_all_finite = False if self.missing_values in ["NaN", @@ -535,11 +532,12 @@ def fit(self, X, y=None): % (X.shape[0], self.n_neighbors)) # Instantiate NN object, get column means, and store in statistics_ - neigh = NearestNeighbors(n_neighbors=self.n_neighbors, - metric=self.metric, - metric_params={"missing_values": - self.missing_values}) - self._fitted_neighbors = neigh.fit(X) + # neigh = NearestNeighbors(n_neighbors=self.n_neighbors, + # metric=self.metric, + # metric_params={"missing_values": + # self.missing_values}) + # self._fitted_neighbors = neigh.fit(X) + self.fitted_X_ = X self.statistics_ = X_col_means return self @@ -558,10 +556,7 @@ def transform(self, X): The imputed dataset. 
""" - # Import here to avoid circular import - from .neighbors.base import _get_weights - - check_is_fitted(self, 'statistics_') + check_is_fitted(self, ["fitted_X_", "statistics_"]) force_all_finite = False if self.missing_values in ["NaN", np.nan] else True X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES, @@ -572,8 +567,8 @@ def transform(self, X): "transformed.") # Get fitted data and ensure correct dimension - fitted_X = self._fitted_neighbors._fit_X - n_rows_fit_X, n_cols_fit_X = fitted_X.shape + # fitted_X = self._fitted_neighbors._fit_X + n_rows_fit_X, n_cols_fit_X = self.fitted_X_.shape n_rows_X, n_cols_X = X.shape if n_cols_X != n_cols_fit_X: @@ -601,58 +596,23 @@ def transform(self, X): if np.any(row_has_missing): # Mask for fitted_X - mask_fx = _get_mask(fitted_X, self.missing_values) + mask_fx = _get_mask(self.fitted_X_, self.missing_values) # Get row index of missing and distance from donors - n_adj_samples, _ = X[row_has_missing].shape - dist = pairwise_distances(X, - fitted_X, - metric=self.metric, - squared=False, - missing_values=self.missing_values) - - # For each column, find and impute missing - for c in range(n_cols_X): - if not np.any(mask[:, c], axis=0): - continue - # Row index for receivers and potential donors (pdonors) - receivers_row_idx = np.where(mask[:, c])[0] - pdonors_row_idx = np.where(~mask_fx[:, c])[0] - - # Impute column mean if n_neighbors are not available - if len(pdonors_row_idx) < self.n_neighbors: - warnings.warn("Insufficient number of neighbors! 
" - "Filling in column mean.") - X[receivers_row_idx, c] = self.statistics_[c] - continue - - # Get distance from potential donors - dist_pdonors = dist[receivers_row_idx][:, pdonors_row_idx] - dist_pdonors = dist_pdonors.reshape(-1, - len(pdonors_row_idx)) - pdonors_idx = np.argpartition( - dist_pdonors, self.n_neighbors - 1, axis=1) - - # Get final donors row index from pdonors - donors_idx = pdonors_idx[:, :self.n_neighbors] - # Get weights or None - dist_pdonors_rows = np.arange(len(donors_idx))[:, None] - weight_matrix = _get_weights( - dist_pdonors[ - dist_pdonors_rows, donors_idx], self.weights) - donor_row_idx_ravel = donors_idx.ravel() - - # Retrieve donor cells and calculate kNN score - fitted_X_temp = fitted_X[pdonors_row_idx] - donors = fitted_X_temp[donor_row_idx_ravel, c].reshape( - (-1, self.n_neighbors)) - donors_mask = _get_mask(donors, self.missing_values) - donors = np.ma.array(donors, mask=donors_mask) - - # Final imputation - imputed = np.ma.average(donors, axis=1, - weights=weight_matrix) - X[receivers_row_idx, c] = imputed.data + dist_temp = pairwise_distances(X[row_has_missing], + self.fitted_X_, + metric=self.metric, + squared=False, + missing_values=self.missing_values) + dist = np.empty((n_rows_X, n_rows_X)) + dist[row_has_missing] = dist_temp.copy() + dist[~row_has_missing] = np.nan + + # Delete temp var binding + del dist_temp + + # Find and impute missing + X = self._impute(dist, X, self.fitted_X_, mask, mask_fx) # Merge bad rows to X and mean impute their missing values if np.any(bad_rows): From 8a16e2884f793330016b0be9fba043dc2299fa12 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sat, 28 Apr 2018 16:34:58 -0500 Subject: [PATCH 82/97] Minor bug fixes --- examples/plot_missing_values.py | 1 + sklearn/impute.py | 31 ++++++++++++++++--------------- sklearn/tests/test_impute.py | 12 ++++++++---- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/examples/plot_missing_values.py 
b/examples/plot_missing_values.py index 8dd45303588d2..72affb35ce91a 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -98,6 +98,7 @@ def get_results(dataset): (mice_impute_scores.mean(), mice_impute_scores.std()), (knn_impute_scores.mean(), knn_impute_scores.std())) + results_diabetes = np.array(get_results(load_diabetes())) mses_diabetes = results_diabetes[:, 0] * -1 stds_diabetes = results_diabetes[:, 1] diff --git a/sklearn/impute.py b/sklearn/impute.py index 405bf852372b3..e04b02e05275a 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -23,6 +23,7 @@ from .utils.validation import check_is_fitted from .utils.validation import FLOAT_DTYPES from .metrics import pairwise_distances +from .metrics.pairwise import _MASKED_METRICS from .neighbors.base import _check_weights from .neighbors.base import _get_weights @@ -1009,12 +1010,13 @@ def _impute(self, dist, X, fitted_X, mask, mask_fx): dist_pdonors = dist_pdonors.reshape(-1, len(pdonors_row_idx)) - # Argpartition to seperate actual donors from the rest + # Argpartition to separate actual donors from the rest pdonors_idx = np.argpartition( dist_pdonors, self.n_neighbors - 1, axis=1) # Get final donors row index from pdonors donors_idx = pdonors_idx[:, :self.n_neighbors] + # Get weights or None dist_pdonors_rows = np.arange(len(donors_idx))[:, None] weight_matrix = _get_weights( @@ -1022,7 +1024,7 @@ def _impute(self, dist, X, fitted_X, mask, mask_fx): dist_pdonors_rows, donors_idx], self.weights) donor_row_idx_ravel = donors_idx.ravel() - # Retrieve donor cells and calculate kNN score + # Retrieve donor values and calculate kNN score fitted_X_temp = fitted_X[pdonors_row_idx] donors = fitted_X_temp[donor_row_idx_ravel, c].reshape( (-1, self.n_neighbors)) @@ -1050,9 +1052,14 @@ def fit(self, X, y=None): Returns self. 
""" - # Check parameters + # Check data integrity and calling arguments force_all_finite = False if self.missing_values in ["NaN", np.nan] else True + if not force_all_finite: + if self.metric not in _MASKED_METRICS and not callable( + self.metric): + raise ValueError( + "The selected metric does not support NaN values.") X = check_array(X, accept_sparse=False, dtype=np.float64, force_all_finite=force_all_finite, copy=self.copy) self.weights = _check_weights(self.weights) @@ -1107,6 +1114,7 @@ def transform(self, X): np.nan] else True X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES, force_all_finite=force_all_finite, copy=self.copy) + # Check for +/- inf if np.any(np.isinf(X)): raise ValueError("+/- inf values are not allowed in data to be " @@ -1143,18 +1151,11 @@ def transform(self, X): # Mask for fitted_X mask_fx = _get_mask(self.fitted_X_, self.missing_values) - # Get row index of missing and distance from donors - dist_temp = pairwise_distances(X[row_has_missing], - self.fitted_X_, - metric=self.metric, - squared=False, - missing_values=self.missing_values) - dist = np.empty((n_rows_X, n_rows_X)) - dist[row_has_missing] = dist_temp.copy() - dist[~row_has_missing] = np.nan - - # Delete temp var binding - del dist_temp + # Pairwise distances between receivers and fitted samples + dist = np.empty((len(X), len(self.fitted_X_))) + dist[row_has_missing] = pairwise_distances( + X[row_has_missing], self.fitted_X_, metric=self.metric, + squared=False, missing_values=self.missing_values) # Find and impute missing X = self._impute(dist, X, self.fitted_X_, mask, mask_fx) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 2a5a7150cf54e..e1838986dd426 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -618,9 +618,11 @@ def test_knn_imputation_default(): [np.nan, 4, 5, 5], [6, np.nan, 6, 7], [8, 8, 8, 8], + [19, 19, 19, 19], [np.nan, np.nan, np.nan, 19], ]) statistics_mean = np.nanmean(X, axis=0) + r7c0, 
r7c1, r7c2, _ = statistics_mean X_imputed = np.array([ [1, 0, 0, 1], @@ -629,12 +631,13 @@ def test_knn_imputation_default(): [4, 4, 5, 5], [6, 3, 6, 7], [8, 8, 8, 8], - [4, 3, 4, 19], + [19, 19, 19, 19], + [r7c0, r7c1, r7c2, 19], ]) imputer = KNNImputer() - assert_array_equal(imputer.fit_transform(X), X_imputed) - assert_array_equal(imputer.statistics_, statistics_mean) + assert_array_almost_equal(imputer.fit_transform(X), X_imputed, decimal=6) + assert_array_almost_equal(imputer.statistics_, statistics_mean, decimal=6) # Test with all neighboring donors also having missing feature values X = np.array([ @@ -980,7 +983,8 @@ def test_metric_type(): # Test with a metric type without NaN support imputer = KNNImputer(metric="euclidean") - assert_raises(ValueError, imputer.fit, X) + bad_metric_msg = "The selected metric does not support NaN values." + assert_raise_message(ValueError, bad_metric_msg, imputer.fit, X) def test_callable_metric(): From a93827c63efb8be2af0501024c5a2321e8e776a2 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sat, 28 Apr 2018 17:41:33 -0500 Subject: [PATCH 83/97] Removing flotsam --- doc/modules/impute.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/impute.rst b/doc/modules/impute.rst index b6eabf43746d5..f1b151767f6cd 100644 --- a/doc/modules/impute.rst +++ b/doc/modules/impute.rst @@ -130,7 +130,7 @@ estimator that supports imputation. ======= :class:`SimpleImputer`, :class:`MICEImputer`, and :class:`KNNImputer` can be used in a Pipelineas a way to build a composite estimator that supports imputation. ->>>>>>> 95f15ffd38f9908ac20cfffb3e6087f9beeb7e8b + See :ref:`sphx_glr_auto_examples_plot_missing_values.py`. .. 
[#] Olga Troyanskaya, Michael Cantor, Gavin Sherlock, Pat Brown, Trevor From 5de5b60db6386b890226396ca75b42cde6891b40 Mon Sep 17 00:00:00 2001 From: harke <22569641+ashimb9@users.noreply.github.com> Date: Sun, 29 Apr 2018 03:25:52 -0500 Subject: [PATCH 84/97] Minor bug fixes --- examples/plot_missing_values.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index 72affb35ce91a..d908fa17462fa 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -84,14 +84,6 @@ def get_results(dataset): knn_impute_scores = cross_val_score(knn_estimator, X_missing, y_missing, scoring='neg_mean_squared_error') - # Estimate the score after kNN-imputation of the missing values - knn_estimator = Pipeline( - [("knnimputer", KNNImputer(missing_values=0, - col_max_missing=0.99)), - ("forest", RandomForestRegressor(random_state=0, n_estimators=100))]) - knn_impute_scores = cross_val_score(knn_estimator, X_missing, y_missing, - scoring='neg_mean_squared_error') - return ((full_scores.mean(), full_scores.std()), (zero_impute_scores.mean(), zero_impute_scores.std()), (mean_impute_scores.mean(), mean_impute_scores.std()), @@ -115,7 +107,7 @@ def get_results(dataset): 'Mean Imputation', 'MICE Imputation', 'KNN Imputation'] -colors = ['r', 'g', 'b', 'orange'] +colors = ['r', 'g', 'b', 'orange', 'black'] # plot diabetes results plt.figure(figsize=(12, 6)) From 20581860c2ba6cabec82094a8a9a9c87f8f3d8b0 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 30 Sep 2018 13:46:52 +1000 Subject: [PATCH 85/97] Revert changes to sklearn/neighbors --- sklearn/neighbors/base.py | 236 ++++++++++++++-------- sklearn/neighbors/tests/test_neighbors.py | 176 ++++++---------- sklearn/neighbors/unsupervised.py | 10 +- 3 files changed, 221 insertions(+), 201 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index c3522b1c7c690..dedcc658c0d2f 100644 --- 
a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -6,6 +6,10 @@ # Multi-output support by Arnaud Joly # # License: BSD 3 clause (C) INRIA, University of Amsterdam +from functools import partial +from distutils.version import LooseVersion + +import sys import warnings from abc import ABCMeta, abstractmethod @@ -15,15 +19,14 @@ from .ball_tree import BallTree from .kd_tree import KDTree from ..base import BaseEstimator -from ..metrics import pairwise_distances +from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS -from ..metrics.pairwise import _MASKED_METRICS -from ..utils import check_X_y, check_array, _get_n_jobs, gen_even_slices +from ..utils import check_X_y, check_array, gen_even_slices from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted from ..externals import six -from ..externals.joblib import Parallel, delayed -from ..exceptions import NotFittedError +from ..utils import Parallel, delayed, effective_n_jobs +from ..utils._joblib import __version__ as joblib_version from ..exceptions import DataConversionWarning VALID_METRICS = dict(ball_tree=BallTree.valid_metrics, @@ -106,7 +109,7 @@ class NeighborsBase(six.with_metaclass(ABCMeta, BaseEstimator)): @abstractmethod def __init__(self, n_neighbors=None, radius=None, algorithm='auto', leaf_size=30, metric='minkowski', - p=2, metric_params=None, n_jobs=1): + p=2, metric_params=None, n_jobs=None): self.n_neighbors = n_neighbors self.radius = radius @@ -141,8 +144,11 @@ def _check_algorithm_metric(self): "kd_tree algorithm does not support callable metric '%s'" % self.metric) elif self.metric not in VALID_METRICS[alg_check]: - raise ValueError("Metric '%s' not valid for algorithm '%s'" - % (self.metric, self.algorithm)) + raise ValueError("Metric '%s' not valid. Use " + "sorted(sklearn.neighbors.VALID_METRICS['%s']) " + "to get valid options. " + "Metric can also be a callable function." 
+ % (self.metric, alg_check)) if self.metric_params is not None and 'p' in self.metric_params: warnings.warn("Parameter p is found in metric_params. " @@ -157,8 +163,6 @@ def _check_algorithm_metric(self): def _fit(self, X): self._check_algorithm_metric() - - allow_nans = self.metric in _MASKED_METRICS or callable(self.metric) if self.metric_params is None: self.effective_metric_params_ = {} else: @@ -202,25 +206,24 @@ def _fit(self, X): self._fit_method = 'kd_tree' return self - X = check_array(X, accept_sparse='csr', - force_all_finite=not allow_nans) + X = check_array(X, accept_sparse='csr') n_samples = X.shape[0] if n_samples == 0: raise ValueError("n_samples must be greater than 0") if issparse(X): - if np.any(np.isnan(X.data)): - raise ValueError( - "kNN does not support sparse matrix with missing data") if self.algorithm not in ('auto', 'brute'): warnings.warn("cannot use tree with sparse input: " "using brute force") if self.effective_metric_ not in VALID_METRICS_SPARSE['brute'] \ and not callable(self.effective_metric_): - - raise ValueError("metric '%s' not valid for sparse input" - % self.effective_metric_) + raise ValueError("Metric '%s' not valid for sparse input. " + "Use sorted(sklearn.neighbors." + "VALID_METRICS_SPARSE['brute']) " + "to get valid options. " + "Metric can also be a callable function." + % (self.effective_metric_)) self._fit_X = X.copy() self._tree = None self._fit_method = 'brute' @@ -283,9 +286,43 @@ def _pairwise(self): class KNeighborsMixin(object): """Mixin for k-neighbors searches""" + def _kneighbors_reduce_func(self, dist, start, + n_neighbors, return_distance): + """Reduce a chunk of distances to the nearest neighbors + + Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked` + + Parameters + ---------- + dist : array of shape (n_samples_chunk, n_samples) + start : int + The index in X which the first row of dist corresponds to. 
+ n_neighbors : int + return_distance : bool + + Returns + ------- + dist : array of shape (n_samples_chunk, n_neighbors), optional + Returned only if return_distance + neigh : array of shape (n_samples_chunk, n_neighbors) + """ + sample_range = np.arange(dist.shape[0])[:, None] + neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1) + neigh_ind = neigh_ind[:, :n_neighbors] + # argpartition doesn't guarantee sorted order, so we sort again + neigh_ind = neigh_ind[ + sample_range, np.argsort(dist[sample_range, neigh_ind])] + if return_distance: + if self.effective_metric_ == 'euclidean': + result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind + else: + result = dist[sample_range, neigh_ind], neigh_ind + else: + result = neigh_ind + return result + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): """Finds the K-neighbors of a point. - Returns indices of and distances to the neighbors of each point. Parameters @@ -354,12 +391,7 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - if self.effective_metric_ in _MASKED_METRICS or callable( - self.effective_metric_): - X = check_array(X, accept_sparse='csr', - force_all_finite=False) - else: - X = check_array(X, accept_sparse='csr') + X = check_array(X, accept_sparse='csr') else: query_is_train = True X = self._fit_X @@ -377,51 +409,50 @@ class from an array representing our data set and ask who's n_samples, _ = X.shape sample_range = np.arange(n_samples)[:, None] - n_jobs = _get_n_jobs(self.n_jobs) + n_jobs = effective_n_jobs(self.n_jobs) if self._fit_method == 'brute': - # for efficiency, use squared euclidean distances - if self.effective_metric_ in ['euclidean', 'masked_euclidean']: - dist = pairwise_distances(X, self._fit_X, - metric=self.effective_metric_, - n_jobs=n_jobs, squared=True, - **self.effective_metric_params_) - else: - dist = pairwise_distances( - X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, - 
**self.effective_metric_params_) - neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1) - neigh_ind = neigh_ind[:, :n_neighbors] - # argpartition doesn't guarantee sorted order, so we sort again - neigh_ind = neigh_ind[ - sample_range, np.argsort(dist[sample_range, neigh_ind])] + reduce_func = partial(self._kneighbors_reduce_func, + n_neighbors=n_neighbors, + return_distance=return_distance) - if return_distance: - if self.effective_metric_ in ['euclidean', 'masked_euclidean']: - result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind - else: - result = dist[sample_range, neigh_ind], neigh_ind - else: - result = neigh_ind + # for efficiency, use squared euclidean distances + kwds = ({'squared': True} if self.effective_metric_ == 'euclidean' + else self.effective_metric_params_) + + result = pairwise_distances_chunked( + X, self._fit_X, reduce_func=reduce_func, + metric=self.effective_metric_, n_jobs=n_jobs, + **kwds) elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): raise ValueError( "%s does not work with sparse matrices. 
Densify the data, " "or set algorithm='brute'" % self._fit_method) - result = Parallel(n_jobs, backend='threading')( - delayed(self._tree.query, check_pickle=False)( + if (sys.version_info < (3,) or + LooseVersion(joblib_version) < LooseVersion('0.12')): + # Deal with change of API in joblib + delayed_query = delayed(self._tree.query, + check_pickle=False) + parallel_kwargs = {"backend": "threading"} + else: + delayed_query = delayed(self._tree.query) + parallel_kwargs = {"prefer": "threads"} + result = Parallel(n_jobs, **parallel_kwargs)( + delayed_query( X[s], n_neighbors, return_distance) for s in gen_even_slices(X.shape[0], n_jobs) ) - if return_distance: - dist, neigh_ind = tuple(zip(*result)) - result = np.vstack(dist), np.vstack(neigh_ind) - else: - result = np.vstack(result) else: raise ValueError("internal: _fit_method not recognized") + if return_distance: + dist, neigh_ind = zip(*result) + result = np.vstack(dist), np.vstack(neigh_ind) + else: + result = np.vstack(result) + if not query_is_train: return result else: @@ -533,6 +564,40 @@ def kneighbors_graph(self, X=None, n_neighbors=None, class RadiusNeighborsMixin(object): """Mixin for radius-based neighbors searches""" + def _radius_neighbors_reduce_func(self, dist, start, + radius, return_distance): + """Reduce a chunk of distances to the nearest neighbors + + Callback to :func:`sklearn.metrics.pairwise.pairwise_distances_chunked` + + Parameters + ---------- + dist : array of shape (n_samples_chunk, n_samples) + start : int + The index in X which the first row of dist corresponds to. 
+ radius : float + return_distance : bool + + Returns + ------- + dist : list of n_samples_chunk 1d arrays, optional + Returned only if return_distance + neigh : list of n_samples_chunk 1d arrays + """ + neigh_ind = [np.where(d <= radius)[0] for d in dist] + + if return_distance: + if self.effective_metric_ == 'euclidean': + dist = [np.sqrt(d[neigh_ind[i]]) + for i, d in enumerate(dist)] + else: + dist = [d[neigh_ind[i]] + for i, d in enumerate(dist)] + results = dist, neigh_ind + else: + results = neigh_ind + return results + def radius_neighbors(self, X=None, radius=None, return_distance=True): """Finds the neighbors within a given radius of a point or points. @@ -611,39 +676,37 @@ class from an array representing our data set and ask who's if radius is None: radius = self.radius - n_samples = X.shape[0] if self._fit_method == 'brute': # for efficiency, use squared euclidean distances if self.effective_metric_ == 'euclidean': - dist = pairwise_distances(X, self._fit_X, 'euclidean', - n_jobs=self.n_jobs, squared=True) radius *= radius + kwds = {'squared': True} else: - dist = pairwise_distances(X, self._fit_X, - self.effective_metric_, - n_jobs=self.n_jobs, - **self.effective_metric_params_) - - neigh_ind_list = [np.where(d <= radius)[0] for d in dist] + kwds = self.effective_metric_params_ - # See https://github.com/numpy/numpy/issues/5456 - # if you want to understand why this is initialized this way. 
- neigh_ind = np.empty(n_samples, dtype='object') - neigh_ind[:] = neigh_ind_list + reduce_func = partial(self._radius_neighbors_reduce_func, + radius=radius, + return_distance=return_distance) + results = pairwise_distances_chunked( + X, self._fit_X, reduce_func=reduce_func, + metric=self.effective_metric_, n_jobs=self.n_jobs, + **kwds) if return_distance: - dist_array = np.empty(n_samples, dtype='object') - if self.effective_metric_ == 'euclidean': - dist_list = [np.sqrt(d[neigh_ind[i]]) - for i, d in enumerate(dist)] - else: - dist_list = [d[neigh_ind[i]] - for i, d in enumerate(dist)] - dist_array[:] = dist_list - - results = dist_array, neigh_ind + dist_chunks, neigh_ind_chunks = zip(*results) + dist_list = sum(dist_chunks, []) + neigh_ind_list = sum(neigh_ind_chunks, []) + # See https://github.com/numpy/numpy/issues/5456 + # if you want to understand why this is initialized this way. + dist = np.empty(len(dist_list), dtype='object') + dist[:] = dist_list + neigh_ind = np.empty(len(neigh_ind_list), dtype='object') + neigh_ind[:] = neigh_ind_list + results = dist, neigh_ind else: - results = neigh_ind + neigh_ind_list = sum(results, []) + results = np.empty(len(neigh_ind_list), dtype='object') + results[:] = neigh_ind_list elif self._fit_method in ['ball_tree', 'kd_tree']: if issparse(X): @@ -651,10 +714,17 @@ class from an array representing our data set and ask who's "%s does not work with sparse matrices. 
Densify the data, " "or set algorithm='brute'" % self._fit_method) - n_jobs = _get_n_jobs(self.n_jobs) - results = Parallel(n_jobs, backend='threading')( - delayed(self._tree.query_radius, check_pickle=False)( - X[s], radius, return_distance) + n_jobs = effective_n_jobs(self.n_jobs) + if LooseVersion(joblib_version) < LooseVersion('0.12'): + # Deal with change of API in joblib + delayed_query = delayed(self._tree.query_radius, + check_pickle=False) + parallel_kwargs = {"backend": "threading"} + else: + delayed_query = delayed(self._tree.query_radius) + parallel_kwargs = {"prefer": "threads"} + results = Parallel(n_jobs, **parallel_kwargs)( + delayed_query(X[s], radius, return_distance) for s in gen_even_slices(X.shape[0], n_jobs) ) if return_distance: diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index a2769c4ef0230..160f3dc5c5eed 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -4,6 +4,8 @@ from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix, issparse) +import pytest + from sklearn import metrics from sklearn import neighbors, datasets from sklearn.exceptions import DataConversionWarning @@ -18,7 +20,6 @@ from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_in from sklearn.utils.testing import assert_raises -from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_warns @@ -26,6 +27,8 @@ from sklearn.utils.testing import ignore_warnings from sklearn.utils.validation import check_random_state +from sklearn.externals.joblib import parallel_backend + rng = np.random.RandomState(0) # load and shuffle iris dataset iris = datasets.load_iris() @@ -91,72 +94,6 @@ def test_unsupervised_kneighbors(n_samples=20, n_features=5, 
assert_array_almost_equal(results[i][1], results[i + 1][1]) -def test_masked_unsupervised_kneighbors(): - # Test 1 - X = np.array([[np.nan, 3., 7., np.nan], - [6., 3., 7., 2.], - [7., 3., 4., 4.], - [2., 7., 7., 1.], - [np.nan, 2., np.nan, 4.]], dtype=np.float32) - - Y = np.array([[3., 1., 7., np.nan], - [1., 3., 1., 6.], - [np.nan, 1., np.nan, 5.], - [3., 1., 3., 3.], - [2., 3., 1., 9.]], dtype=np.float32) - - neigh = neighbors.NearestNeighbors(2, metric="masked_euclidean") - neigh.fit(X) - X_neigh = neigh.kneighbors(n_neighbors=2, return_distance=False) - XY_neigh = neigh.kneighbors(Y, 2, return_distance=False) - # Expected outcome - N1 = np.array( - [[1, 4], - [0, 4], - [4, 1], - [0, 1], - [2, 0]]) - - N2 = np.array( - [[4, 0], - [4, 2], - [4, 2], - [4, 2], - [4, 2]]) - - assert_array_equal(X_neigh, N1) - assert_array_equal(XY_neigh, N2) - - # Test 2 - nan = float("nan") - samples = [[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]] - neigh = neighbors.NearestNeighbors(n_neighbors=2, - metric="masked_euclidean") - neigh.fit(samples) - - X2_neigh = neigh.kneighbors(n_neighbors=2, return_distance=False) - XY2_neigh = neigh.kneighbors([[0, nan, 1]], 2, return_distance=False) - - # Expected outcome - N3 = np.array( - [[3, 1], - [3, 2], - [3, 1], - [2, 1]]) - N4 = np.array([[1, 3]]) - - assert_array_equal(X2_neigh, N3) - assert_array_equal(XY2_neigh, N4) - - # Test 3 - nan = float("nan") - samples = csc_matrix([[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]]) - neigh = neighbors.NearestNeighbors(n_neighbors=2, - metric="masked_euclidean") - msg = "kNN does not support sparse matrix with missing data" - assert_raise_message(ValueError, msg, neigh.fit, samples) - - def test_unsupervised_inputs(): # test the types of valid input into NearestNeighbors X = rng.random_sample((10, 3)) @@ -246,6 +183,7 @@ def test_precomputed(random_state=42): assert_array_almost_equal(pred_X, pred_D) +@pytest.mark.filterwarnings('ignore: You should specify a value') # 0.22 def 
test_precomputed_cross_validation(): # Ensure array is split correctly rng = np.random.RandomState(0) @@ -748,7 +686,7 @@ def test_radius_neighbors_regressor(n_samples=40, weights=weights, algorithm='auto') neigh.fit(X, y) - X_test_nan = np.ones((1, n_features))*-1 + X_test_nan = np.full((1, n_features), -1.) empty_warning_msg = ("One or more samples have no neighbors " "within specified radius; predicting NaN.") pred = assert_warns_message(UserWarning, @@ -1130,9 +1068,6 @@ def test_valid_brute_metric_for_auto_algorithm(): nb_p.kneighbors(DYX) for metric in VALID_METRICS_SPARSE['brute']: - # TODO: Remove after adding sparse support for masked_euclidean - if metric == "masked_euclidean": - continue if metric != 'precomputed' and metric not in require_params: nn = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto', metric=metric).fit(Xcsr) @@ -1330,63 +1265,76 @@ def test_include_self_neighbors_graph(): assert_array_equal(rng_not_self, [[0., 1.], [1., 0.]]) -def test_same_knn_parallel(): +@pytest.mark.parametrize('algorithm', ALGORITHMS) +def test_same_knn_parallel(algorithm): X, y = datasets.make_classification(n_samples=30, n_features=5, n_redundant=0, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y) - def check_same_knn_parallel(algorithm): - clf = neighbors.KNeighborsClassifier(n_neighbors=3, - algorithm=algorithm) - clf.fit(X_train, y_train) - y = clf.predict(X_test) - dist, ind = clf.kneighbors(X_test) - graph = clf.kneighbors_graph(X_test, mode='distance').toarray() - - clf.set_params(n_jobs=3) - clf.fit(X_train, y_train) - y_parallel = clf.predict(X_test) - dist_parallel, ind_parallel = clf.kneighbors(X_test) - graph_parallel = \ - clf.kneighbors_graph(X_test, mode='distance').toarray() + clf = neighbors.KNeighborsClassifier(n_neighbors=3, + algorithm=algorithm) + clf.fit(X_train, y_train) + y = clf.predict(X_test) + dist, ind = clf.kneighbors(X_test) + graph = clf.kneighbors_graph(X_test, mode='distance').toarray() - 
assert_array_equal(y, y_parallel) - assert_array_almost_equal(dist, dist_parallel) - assert_array_equal(ind, ind_parallel) - assert_array_almost_equal(graph, graph_parallel) + clf.set_params(n_jobs=3) + clf.fit(X_train, y_train) + y_parallel = clf.predict(X_test) + dist_parallel, ind_parallel = clf.kneighbors(X_test) + graph_parallel = \ + clf.kneighbors_graph(X_test, mode='distance').toarray() - for algorithm in ALGORITHMS: - yield check_same_knn_parallel, algorithm + assert_array_equal(y, y_parallel) + assert_array_almost_equal(dist, dist_parallel) + assert_array_equal(ind, ind_parallel) + assert_array_almost_equal(graph, graph_parallel) -def test_same_radius_neighbors_parallel(): +@pytest.mark.parametrize('algorithm', ALGORITHMS) +def test_same_radius_neighbors_parallel(algorithm): X, y = datasets.make_classification(n_samples=30, n_features=5, n_redundant=0, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y) - def check_same_radius_neighbors_parallel(algorithm): - clf = neighbors.RadiusNeighborsClassifier(radius=10, - algorithm=algorithm) - clf.fit(X_train, y_train) - y = clf.predict(X_test) - dist, ind = clf.radius_neighbors(X_test) - graph = clf.radius_neighbors_graph(X_test, mode='distance').toarray() + clf = neighbors.RadiusNeighborsClassifier(radius=10, + algorithm=algorithm) + clf.fit(X_train, y_train) + y = clf.predict(X_test) + dist, ind = clf.radius_neighbors(X_test) + graph = clf.radius_neighbors_graph(X_test, mode='distance').toarray() + + clf.set_params(n_jobs=3) + clf.fit(X_train, y_train) + y_parallel = clf.predict(X_test) + dist_parallel, ind_parallel = clf.radius_neighbors(X_test) + graph_parallel = \ + clf.radius_neighbors_graph(X_test, mode='distance').toarray() + + assert_array_equal(y, y_parallel) + for i in range(len(dist)): + assert_array_almost_equal(dist[i], dist_parallel[i]) + assert_array_equal(ind[i], ind_parallel[i]) + assert_array_almost_equal(graph, graph_parallel) + + +@pytest.mark.parametrize('backend', 
['loky', 'multiprocessing', 'threading']) +@pytest.mark.parametrize('algorithm', ALGORITHMS) +def test_knn_forcing_backend(backend, algorithm): + # Non-regression test which ensure the knn methods are properly working + # even when forcing the global joblib backend. + with parallel_backend(backend): + X, y = datasets.make_classification(n_samples=30, n_features=5, + n_redundant=0, random_state=0) + X_train, X_test, y_train, y_test = train_test_split(X, y) - clf.set_params(n_jobs=3) + clf = neighbors.KNeighborsClassifier(n_neighbors=3, + algorithm=algorithm, + n_jobs=3) clf.fit(X_train, y_train) - y_parallel = clf.predict(X_test) - dist_parallel, ind_parallel = clf.radius_neighbors(X_test) - graph_parallel = \ - clf.radius_neighbors_graph(X_test, mode='distance').toarray() - - assert_array_equal(y, y_parallel) - for i in range(len(dist)): - assert_array_almost_equal(dist[i], dist_parallel[i]) - assert_array_equal(ind[i], ind_parallel[i]) - assert_array_almost_equal(graph, graph_parallel) - - for algorithm in ALGORITHMS: - yield check_same_radius_neighbors_parallel, algorithm + clf.predict(X_test) + clf.kneighbors(X_test) + clf.kneighbors_graph(X_test, mode='distance').toarray() def test_dtype_convert(): diff --git a/sklearn/neighbors/unsupervised.py b/sklearn/neighbors/unsupervised.py index 31bad8be91c2e..9d41b640f9e17 100644 --- a/sklearn/neighbors/unsupervised.py +++ b/sklearn/neighbors/unsupervised.py @@ -54,7 +54,7 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, Valid values for metric are: - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', - 'manhattan', 'masked_euclidean] + 'manhattan'] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', @@ -74,9 +74,11 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, metric_params : dict, optional (default = None) Additional keyword arguments for the metric function. 
- n_jobs : int, optional (default = 1) + n_jobs : int or None, optional (default=None) The number of parallel jobs to run for neighbors search. - If ``-1``, then the number of jobs is set to the number of CPU cores. + ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. + ``-1`` means using all processors. See :term:`Glossary ` + for more details. Examples -------- @@ -114,7 +116,7 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, def __init__(self, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', - p=2, metric_params=None, n_jobs=1, **kwargs): + p=2, metric_params=None, n_jobs=None, **kwargs): super(NearestNeighbors, self).__init__( n_neighbors=n_neighbors, radius=radius, From 202cd37ae68576a539a252f0e84e0a9abaac9677 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 30 Sep 2018 13:50:54 +1000 Subject: [PATCH 86/97] Revert changes to deprecated file --- .../preprocessing/tests/test_imputation.py | 24 ++++++------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/sklearn/preprocessing/tests/test_imputation.py b/sklearn/preprocessing/tests/test_imputation.py index 0fd79de827b9a..663262b50289b 100644 --- a/sklearn/preprocessing/tests/test_imputation.py +++ b/sklearn/preprocessing/tests/test_imputation.py @@ -1,4 +1,4 @@ -from __future__ import division + import numpy as np from scipy import sparse @@ -10,7 +10,6 @@ from sklearn.utils.testing import ignore_warnings from sklearn.preprocessing.imputation import Imputer -from sklearn.impute import KNNImputer from sklearn.pipeline import Pipeline from sklearn.model_selection import GridSearchCV from sklearn import tree @@ -311,23 +310,14 @@ def test_imputation_pickle(): @ignore_warnings def test_imputation_copy(): # Test imputation with copy - X_orig = sparse_random_matrix(10, 10, density=0.75, random_state=0) - imputers = {Imputer: {"missing_values": 0, "strategy": "mean"}, - KNNImputer: {"missing_values": 0}} + X_orig = sparse_random_matrix(5, 
5, density=0.75, random_state=0) # copy=True, dense => copy - # copy=False, dense => no copy - for imputer_cls, params in imputers.items(): - for copy in [True, False]: - X = X_orig.copy().toarray() - params["copy"] = copy - imputer = imputer_cls(**params) - Xt = imputer.fit(X).transform(X) - Xt[0, 0] = -1 - if copy: - assert_false(np.all(X == Xt)) - else: - assert_array_almost_equal(X, Xt) + X = X_orig.copy().toarray() + imputer = Imputer(missing_values=0, strategy="mean", copy=True) + Xt = imputer.fit(X).transform(X) + Xt[0, 0] = -1 + assert_false(np.all(X == Xt)) # copy=True, sparse csr => copy X = X_orig.copy() From 6414081c6b64ccc7e9f0de42a143f68ff201dc65 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 30 Sep 2018 13:52:25 +1000 Subject: [PATCH 87/97] COSMIT _MASKED_METRICS -> _NAN_METRICS --- sklearn/impute.py | 4 ++-- sklearn/metrics/pairwise.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 7aa08bca88fcc..70f7a377930bc 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -18,7 +18,7 @@ from .utils.validation import check_is_fitted from .utils.validation import FLOAT_DTYPES from .metrics import pairwise_distances -from .metrics.pairwise import _MASKED_METRICS +from .metrics.pairwise import _NAN_METRICS from .neighbors.base import _check_weights from .neighbors.base import _get_weights @@ -812,7 +812,7 @@ def fit(self, X, y=None): force_all_finite = False if self.missing_values in ["NaN", np.nan] else True if not force_all_finite: - if self.metric not in _MASKED_METRICS and not callable( + if self.metric not in _NAN_METRICS and not callable( self.metric): raise ValueError( "The selected metric does not support NaN values.") diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 7bc2cfaf54114..4580751dcaffa 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1266,7 +1266,7 @@ def _pairwise_callable(X, Y, metric, **kwds): 
'sokalsneath', 'sqeuclidean', 'yule', "wminkowski", 'masked_euclidean'] -_MASKED_METRICS = ['masked_euclidean'] +_NAN_METRICS = ['masked_euclidean'] def _check_chunk_size(reduced, chunk_size): @@ -1533,7 +1533,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, **kwds): "Valid metrics are %s, or 'precomputed', or a " "callable" % (metric, _VALID_METRICS)) - if metric in _MASKED_METRICS or callable(metric): + if metric in _NAN_METRICS or callable(metric): missing_values = kwds.get("missing_values") if kwds.get( "missing_values") is not None else np.nan From 2825fccc2938406c18742b26ab323e0404cda987 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sun, 30 Sep 2018 14:10:28 +1000 Subject: [PATCH 88/97] 'NaN' no longer stands for NaN --- sklearn/impute.py | 15 +++++++-------- sklearn/tests/test_impute.py | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 70f7a377930bc..9a67586b815e0 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -652,10 +652,9 @@ class KNNImputer(BaseEstimator, TransformerMixin): Parameters ---------- - missing_values : integer or "NaN", optional (default = "NaN") + missing_values : number, string, np.nan (default) or None The placeholder for the missing values. All occurrences of - `missing_values` will be imputed. For missing values encoded as - ``np.nan``, use the string value "NaN". + `missing_values` will be imputed. n_neighbors : int, optional (default = 5) Number of neighboring samples to use for imputation. @@ -729,7 +728,7 @@ class KNNImputer(BaseEstimator, TransformerMixin): [8. , 8. , 7. 
]]) """ - def __init__(self, missing_values="NaN", n_neighbors=5, + def __init__(self, missing_values=np.nan, n_neighbors=5, weights="uniform", metric="masked_euclidean", row_max_missing=0.5, col_max_missing=0.8, copy=True): @@ -809,8 +808,7 @@ def fit(self, X, y=None): """ # Check data integrity and calling arguments - force_all_finite = False if self.missing_values in ["NaN", - np.nan] else True + force_all_finite = not is_scalar_nan(self.missing_values) if not force_all_finite: if self.metric not in _NAN_METRICS and not callable( self.metric): @@ -825,7 +823,9 @@ def fit(self, X, y=None): raise ValueError("+/- inf values are not allowed.") # Check if % missing in any column > col_max_missing + print(X, self.missing_values, type(self.missing_values)) mask = _get_mask(X, self.missing_values) + print(mask) if np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)): raise ValueError("Some column(s) have more than {}% missing values" .format(self.col_max_missing * 100)) @@ -866,8 +866,7 @@ def transform(self, X): """ check_is_fitted(self, ["fitted_X_", "statistics_"]) - force_all_finite = False if self.missing_values in ["NaN", - np.nan] else True + force_all_finite = not is_scalar_nan(self.missing_values) X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES, force_all_finite=force_all_finite, copy=self.copy) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 25d27e863e4de..73abc08fb4b8c 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -1219,7 +1219,7 @@ def test_complete_features_weighted(): @pytest.mark.parametrize("imputer_constructor", - [SimpleImputer, KNNImputer]) + [SimpleImputer]) @pytest.mark.parametrize( "imputer_missing_values, missing_value, err_msg", [("NaN", np.nan, "Input contains NaN"), From 745fa2dde582f0b3fdd067bcef1002487a75ed55 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 3 Oct 2018 12:33:36 +1000 Subject: [PATCH 89/97] Fix missing_values validation --- 
sklearn/impute.py | 10 ++++++++-- sklearn/tests/test_impute.py | 7 +++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 9a67586b815e0..888395b858310 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -808,7 +808,10 @@ def fit(self, X, y=None): """ # Check data integrity and calling arguments - force_all_finite = not is_scalar_nan(self.missing_values) + if not is_scalar_nan(self.missing_values): + force_all_finite = True + else: + force_all_finite = "allow-nan" if not force_all_finite: if self.metric not in _NAN_METRICS and not callable( self.metric): @@ -866,7 +869,10 @@ def transform(self, X): """ check_is_fitted(self, ["fitted_X_", "statistics_"]) - force_all_finite = not is_scalar_nan(self.missing_values) + if not is_scalar_nan(self.missing_values): + force_all_finite = True + else: + force_all_finite = "allow-nan" X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES, force_all_finite=force_all_finite, copy=self.copy) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 73abc08fb4b8c..0a87f2ca58732 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -669,7 +669,7 @@ def test_knn_imputation_zero(): def test_knn_imputation_zero_p2(): - # Test with an imputable matrix and also compare with missing_values="NaN" + # Test with an imputable matrix and also compare with missing_values=np.NaN X_zero = np.array([ [1, 0, 1, 1, 1.], [2, 2, 2, 2, 2], @@ -695,7 +695,7 @@ def test_knn_imputation_zero_p2(): imputer_zero = KNNImputer(missing_values=0, n_neighbors=2, weights="uniform") - imputer_nan = KNNImputer(missing_values="NaN", + imputer_nan = KNNImputer(missing_values=np.nan, n_neighbors=2, weights="uniform") @@ -1078,7 +1078,6 @@ def test_weight_distance(): r1c3_imp = np.ma.average(col3_donor_values, weights=r1c3_nbor_wt) r2c3_imp = np.ma.average(col3_donor_values, weights=r2c3_nbor_wt) - print(r1c1_imp, r1c3_imp, r2c3_imp) X_imputed = 
np.array([ [1, 0, 0, 1], [0, r1c1_imp, 1, r1c3_imp], @@ -1114,7 +1113,7 @@ def test_metric_type(): def test_callable_metric(): # Define callable metric that returns the l1 norm: - def custom_callable(x, y, missing_values="NaN", squared=False): + def custom_callable(x, y, missing_values=np.nan, squared=False): x = np.ma.array(x, mask=np.isnan(x)) y = np.ma.array(y, mask=np.isnan(y)) dist = np.nansum(np.abs(x-y)) From 44f021066ce089c977aacd38aa29b66513f105bd Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 3 Oct 2018 12:42:05 +1000 Subject: [PATCH 90/97] Attempt to reinstate neighbors changes --- sklearn/neighbors/base.py | 15 ++++- sklearn/neighbors/tests/test_neighbors.py | 69 +++++++++++++++++++++++ sklearn/neighbors/unsupervised.py | 2 +- 3 files changed, 82 insertions(+), 4 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index dedcc658c0d2f..1465fb7d835dc 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -22,6 +22,7 @@ from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS from ..utils import check_X_y, check_array, gen_even_slices +from ..metrics.pairwise import _NAN_METRICS from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted from ..externals import six @@ -163,6 +164,8 @@ def _check_algorithm_metric(self): def _fit(self, X): self._check_algorithm_metric() + + allow_nans = self.metric in _NAN_METRICS or callable(self.metric) if self.metric_params is None: self.effective_metric_params_ = {} else: @@ -206,7 +209,8 @@ def _fit(self, X): self._fit_method = 'kd_tree' return self - X = check_array(X, accept_sparse='csr') + X = check_array(X, accept_sparse='csr', + force_all_finite=not allow_nans) n_samples = X.shape[0] if n_samples == 0: @@ -391,7 +395,10 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - X = check_array(X, accept_sparse='csr') + 
force_all_finite = (not callable(self.effective_metric_) + and self.effective_metric_ not in _NAN_METRICS) + X = check_array(X, accept_sparse='csr', + force_all_finite=force_all_finite) else: query_is_train = True X = self._fit_X @@ -417,7 +424,9 @@ class from an array representing our data set and ask who's return_distance=return_distance) # for efficiency, use squared euclidean distances - kwds = ({'squared': True} if self.effective_metric_ == 'euclidean' + kwds = ({'squared': True} + if self.effective_metric_ in ('euclidean', + 'masked_euclidean') else self.effective_metric_params_) result = pairwise_distances_chunked( diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 160f3dc5c5eed..80f07a2c65862 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -94,6 +94,72 @@ def test_unsupervised_kneighbors(n_samples=20, n_features=5, assert_array_almost_equal(results[i][1], results[i + 1][1]) +def test_masked_unsupervised_kneighbors(): + # Test 1 + X = np.array([[np.nan, 3., 7., np.nan], + [6., 3., 7., 2.], + [7., 3., 4., 4.], + [2., 7., 7., 1.], + [np.nan, 2., np.nan, 4.]], dtype=np.float32) + + Y = np.array([[3., 1., 7., np.nan], + [1., 3., 1., 6.], + [np.nan, 1., np.nan, 5.], + [3., 1., 3., 3.], + [2., 3., 1., 9.]], dtype=np.float32) + + neigh = neighbors.NearestNeighbors(2, metric="masked_euclidean") + neigh.fit(X) + X_neigh = neigh.kneighbors(n_neighbors=2, return_distance=False) + XY_neigh = neigh.kneighbors(Y, 2, return_distance=False) + # Expected outcome + N1 = np.array( + [[1, 4], + [0, 4], + [4, 1], + [0, 1], + [2, 0]]) + + N2 = np.array( + [[4, 0], + [4, 2], + [4, 2], + [4, 2], + [4, 2]]) + + assert_array_equal(X_neigh, N1) + assert_array_equal(XY_neigh, N2) + + # Test 2 + nan = float("nan") + samples = [[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]] + neigh = neighbors.NearestNeighbors(n_neighbors=2, + metric="masked_euclidean") + neigh.fit(samples) + 
+ X2_neigh = neigh.kneighbors(n_neighbors=2, return_distance=False) + XY2_neigh = neigh.kneighbors([[0, nan, 1]], 2, return_distance=False) + + # Expected outcome + N3 = np.array( + [[3, 1], + [3, 2], + [3, 1], + [2, 1]]) + N4 = np.array([[1, 3]]) + + assert_array_equal(X2_neigh, N3) + assert_array_equal(XY2_neigh, N4) + + # Test 3 + nan = float("nan") + samples = csc_matrix([[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]]) + neigh = neighbors.NearestNeighbors(n_neighbors=2, + metric="masked_euclidean") + msg = "kNN does not support sparse matrix with missing data" + assert_raise_message(ValueError, msg, neigh.fit, samples) + + def test_unsupervised_inputs(): # test the types of valid input into NearestNeighbors X = rng.random_sample((10, 3)) @@ -1068,6 +1134,9 @@ def test_valid_brute_metric_for_auto_algorithm(): nb_p.kneighbors(DYX) for metric in VALID_METRICS_SPARSE['brute']: + # TODO: Remove after adding sparse support for masked_euclidean + if metric == "masked_euclidean": + continue if metric != 'precomputed' and metric not in require_params: nn = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto', metric=metric).fit(Xcsr) diff --git a/sklearn/neighbors/unsupervised.py b/sklearn/neighbors/unsupervised.py index 9d41b640f9e17..40f2270ee13c8 100644 --- a/sklearn/neighbors/unsupervised.py +++ b/sklearn/neighbors/unsupervised.py @@ -54,7 +54,7 @@ class NearestNeighbors(NeighborsBase, KNeighborsMixin, Valid values for metric are: - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', - 'manhattan'] + 'manhattan', 'masked_euclidean] - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', From 82d5d203faf3dbc02988f9e4d0c8943b3dd871b6 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 3 Oct 2018 20:06:15 +1000 Subject: [PATCH 91/97] Fix up test failures --- sklearn/impute.py | 20 ++++---------------- sklearn/neighbors/base.py | 5 +++-- 
sklearn/neighbors/tests/test_neighbors.py | 4 ++-- sklearn/tests/test_impute.py | 18 ++++++------------ 4 files changed, 15 insertions(+), 32 deletions(-) diff --git a/sklearn/impute.py b/sklearn/impute.py index 888395b858310..6311e659c4433 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -675,10 +675,10 @@ class KNNImputer(BaseEstimator, TransformerMixin): Distance metric for searching neighbors. Possible values: - 'masked_euclidean' - [callable] : a user-defined function which conforms to the - definition of _pairwise_callable(X, Y, metric, **kwds). In other - words, the function accepts two arrays, X and Y, and a - ``missing_values`` keyword in **kwds and returns a scalar distance - value. + definition of _pairwise_callable(X, Y, metric, **kwds). In other + words, the function accepts two arrays, X and Y, and a + ``missing_values`` keyword in **kwds and returns a scalar distance + value. row_max_missing : float, optional (default = 0.5) The maximum fraction of columns (i.e. features) that can be missing @@ -812,7 +812,6 @@ def fit(self, X, y=None): force_all_finite = True else: force_all_finite = "allow-nan" - if not force_all_finite: if self.metric not in _NAN_METRICS and not callable( self.metric): raise ValueError( @@ -821,14 +820,8 @@ def fit(self, X, y=None): force_all_finite=force_all_finite, copy=self.copy) self.weights = _check_weights(self.weights) - # Check for +/- inf - if np.any(np.isinf(X)): - raise ValueError("+/- inf values are not allowed.") - # Check if % missing in any column > col_max_missing - print(X, self.missing_values, type(self.missing_values)) mask = _get_mask(X, self.missing_values) - print(mask) if np.any(mask.sum(axis=0) > (X.shape[0] * self.col_max_missing)): raise ValueError("Some column(s) have more than {}% missing values" .format(self.col_max_missing * 100)) @@ -876,11 +869,6 @@ def transform(self, X): X = check_array(X, accept_sparse=False, dtype=FLOAT_DTYPES, force_all_finite=force_all_finite, copy=self.copy) - # 
Check for +/- inf - if np.any(np.isinf(X)): - raise ValueError("+/- inf values are not allowed in data to be " - "transformed.") - # Get fitted data and ensure correct dimension n_rows_fit_X, n_cols_fit_X = self.fitted_X_.shape n_rows_X, n_cols_X = X.shape diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 1465fb7d835dc..2f0ba75ee1f4f 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -46,7 +46,8 @@ VALID_METRICS_SPARSE = dict(ball_tree=[], kd_tree=[], - brute=PAIRWISE_DISTANCE_FUNCTIONS.keys()) + brute=(PAIRWISE_DISTANCE_FUNCTIONS.keys() - + {'masked_euclidean'})) def _check_weights(weights): @@ -210,7 +211,7 @@ def _fit(self, X): return self X = check_array(X, accept_sparse='csr', - force_all_finite=not allow_nans) + force_all_finite='allow-nan' if allow_nans else False) n_samples = X.shape[0] if n_samples == 0: diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 80f07a2c65862..6b7f31f52b7cd 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -156,8 +156,8 @@ def test_masked_unsupervised_kneighbors(): samples = csc_matrix([[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]]) neigh = neighbors.NearestNeighbors(n_neighbors=2, metric="masked_euclidean") - msg = "kNN does not support sparse matrix with missing data" - assert_raise_message(ValueError, msg, neigh.fit, samples) + msg = "Metric 'masked_euclidean' not valid for sparse input.*" + assert_raises_regex(ValueError, msg, neigh.fit, samples) def test_unsupervised_inputs(): diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 0a87f2ca58732..0a0428c02fbc9 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -21,7 +21,7 @@ from sklearn.random_projection import sparse_random_matrix from sklearn.metrics.pairwise import masked_euclidean_distances from sklearn.metrics.pairwise import pairwise_distances -from 
sklearn.neighbors import NearestNeighbors +from sklearn.neighbors import KNeighborsRegressor def _check_statistics(X, X_true, @@ -857,7 +857,7 @@ def test_default_with_invalid_input(): [np.nan, 7, 0, 7, 8], [6, 6, 2, 5, 7], ]) - msg = "+/- inf values are not allowed." + msg = "Input contains infinity" assert_raise_message(ValueError, msg, KNNImputer().fit, X) # Test with inf present in matrix passed in transform() @@ -878,7 +878,7 @@ def test_default_with_invalid_input(): [np.nan, 7, 0, 7, 8], [6, 6, 2, 5, 7], ]) - msg = "+/- inf values are not allowed in data to be transformed." + msg = "Input contains infinity" assert_raise_message(ValueError, msg, KNNImputer().fit(X_fit).transform, X) @@ -988,15 +988,9 @@ def test_weight_distance(): ]) # Test with "distance" weight - nn = NearestNeighbors(metric="masked_euclidean") - nn.fit(X) - # Get distance of "n_neighbors" neighbors of row 1 - dist, index = nn.kneighbors() - dist = dist[1, :] - index = index[1, :] - weights = 1 / dist - values = X[index, 0] - imputed = np.dot(values, weights) / np.sum(weights) + nn = KNeighborsRegressor(metric="euclidean", weights="distance") + nn.fit(np.delete(X, 1, axis=0)[:, 1:], np.delete(X, 1, axis=0)[:, 0]) + imputed = nn.predict(X[1:2, 1:]) # Manual calculation X_imputed_distance1 = np.array([ From d8b23e60a20a83edb971765ca007c35bfe152677 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 4 Oct 2018 07:45:15 +1000 Subject: [PATCH 92/97] Fix flake8 issues in example --- examples/plot_missing_values.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/examples/plot_missing_values.py b/examples/plot_missing_values.py index 90359d0affa1c..62f9c5aee08a2 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -74,20 +74,16 @@ def get_results(dataset): scoring='neg_mean_squared_error', cv=5) - # Estimate the score after kNN-imputation of the missing values - knn_estimator = Pipeline( - [("knnimputer", KNNImputer(missing_values=0, - 
col_max_missing=0.99)), - ("forest", RandomForestRegressor(random_state=0, n_estimators=100))]) + knn_estimator = make_pipeline( + [KNNImputer(missing_values=0, col_max_missing=0.99), + RandomForestRegressor(random_state=0, n_estimators=100)]) knn_impute_scores = cross_val_score(knn_estimator, X_missing, y_missing, scoring='neg_mean_squared_error') return ((full_scores.mean(), full_scores.std()), (zero_impute_scores.mean(), zero_impute_scores.std()), (mean_impute_scores.mean(), mean_impute_scores.std()), - (mice_impute_scores.mean(), mice_impute_scores.std()), - (mean_impute_scores.mean(), mean_impute_scores.std()), (knn_impute_scores.mean(), knn_impute_scores.std()), ) From c682361937835e4364e4c5a4546439de7f08fd8f Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 4 Oct 2018 13:57:13 +1000 Subject: [PATCH 93/97] Default force_all_finite to True rather than False --- sklearn/neighbors/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 2f0ba75ee1f4f..3f44128cee577 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -166,7 +166,6 @@ def _check_algorithm_metric(self): def _fit(self, X): self._check_algorithm_metric() - allow_nans = self.metric in _NAN_METRICS or callable(self.metric) if self.metric_params is None: self.effective_metric_params_ = {} else: @@ -210,8 +209,9 @@ def _fit(self, X): self._fit_method = 'kd_tree' return self + allow_nan = self.metric in _NAN_METRICS or callable(self.metric) X = check_array(X, accept_sparse='csr', - force_all_finite='allow-nan' if allow_nans else False) + force_all_finite='allow-nan' if allow_nan else True) n_samples = X.shape[0] if n_samples == 0: From 1912611b093c162b4d7e967fe9129874dfacd0f7 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 4 Oct 2018 14:19:27 +1000 Subject: [PATCH 94/97] Fix example usage --- examples/plot_missing_values.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/examples/plot_missing_values.py b/examples/plot_missing_values.py index 62f9c5aee08a2..3c672bb5b2b87 100644 --- a/examples/plot_missing_values.py +++ b/examples/plot_missing_values.py @@ -76,8 +76,8 @@ def get_results(dataset): # Estimate the score after kNN-imputation of the missing values knn_estimator = make_pipeline( - [KNNImputer(missing_values=0, col_max_missing=0.99), - RandomForestRegressor(random_state=0, n_estimators=100)]) + KNNImputer(missing_values=0, col_max_missing=0.99), + RandomForestRegressor(random_state=0, n_estimators=100)) knn_impute_scores = cross_val_score(knn_estimator, X_missing, y_missing, scoring='neg_mean_squared_error') From 607ff7f6c0113b7ea629fa69e97cbedc34174d0a Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 4 Oct 2018 20:35:15 +1000 Subject: [PATCH 95/97] Fix masked_euclidean testing in nearest neighbors --- sklearn/neighbors/base.py | 26 ++++++----- sklearn/neighbors/tests/test_neighbors.py | 56 +++++++++++++++++++++-- 2 files changed, 65 insertions(+), 17 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 3f44128cee577..356840c332ac4 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -25,6 +25,7 @@ from ..metrics.pairwise import _NAN_METRICS from ..utils.multiclass import check_classification_targets from ..utils.validation import check_is_fitted +from ..utils.validation import _num_samples from ..externals import six from ..utils import Parallel, delayed, effective_n_jobs from ..utils._joblib import __version__ as joblib_version @@ -318,7 +319,7 @@ def _kneighbors_reduce_func(self, dist, start, neigh_ind = neigh_ind[ sample_range, np.argsort(dist[sample_range, neigh_ind])] if return_distance: - if self.effective_metric_ == 'euclidean': + if self.effective_metric_ in ('euclidean', 'masked_euclidean'): result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind else: result = dist[sample_range, neigh_ind], neigh_ind @@ -396,10 +397,11 @@ class from an array 
representing our data set and ask who's if X is not None: query_is_train = False - force_all_finite = (not callable(self.effective_metric_) - and self.effective_metric_ not in _NAN_METRICS) + allow_nan = (callable(self.effective_metric_) + or self.effective_metric_ in _NAN_METRICS) X = check_array(X, accept_sparse='csr', - force_all_finite=force_all_finite) + force_all_finite=('allow-nan' if allow_nan + else True)) else: query_is_train = True X = self._fit_X @@ -541,8 +543,7 @@ def kneighbors_graph(self, X=None, n_neighbors=None, # kneighbors does the None handling. if X is not None: - X = check_array(X, accept_sparse='csr') - n_samples1 = X.shape[0] + n_samples1 = _num_samples(X) else: n_samples1 = self._fit_X.shape[0] @@ -597,7 +598,7 @@ def _radius_neighbors_reduce_func(self, dist, start, neigh_ind = [np.where(d <= radius)[0] for d in dist] if return_distance: - if self.effective_metric_ == 'euclidean': + if self.effective_metric_ in ('masked_euclidean', 'euclidean'): dist = [np.sqrt(d[neigh_ind[i]]) for i, d in enumerate(dist)] else: @@ -678,7 +679,11 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - X = check_array(X, accept_sparse='csr') + allow_nan = (callable(self.effective_metric_) + or self.effective_metric_ in _NAN_METRICS) + X = check_array(X, accept_sparse='csr', + force_all_finite=('allow-nan' if allow_nan + else True)) else: query_is_train = True X = self._fit_X @@ -688,7 +693,7 @@ class from an array representing our data set and ask who's if self._fit_method == 'brute': # for efficiency, use squared euclidean distances - if self.effective_metric_ == 'euclidean': + if self.effective_metric_ in ('euclidean', 'masked_euclidean'): radius *= radius kwds = {'squared': True} else: @@ -811,9 +816,6 @@ def radius_neighbors_graph(self, X=None, radius=None, mode='connectivity'): -------- kneighbors_graph """ - if X is not None: - X = check_array(X, accept_sparse=['csr', 'csc', 'coo']) - n_samples2 = 
self._fit_X.shape[0] if radius is None: radius = self.radius diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 6b7f31f52b7cd..2b01707d3dec0 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -108,10 +108,11 @@ def test_masked_unsupervised_kneighbors(): [3., 1., 3., 3.], [2., 3., 1., 9.]], dtype=np.float32) - neigh = neighbors.NearestNeighbors(2, metric="masked_euclidean") + neigh = neighbors.NearestNeighbors(n_neighbors=2, + metric="masked_euclidean") neigh.fit(X) - X_neigh = neigh.kneighbors(n_neighbors=2, return_distance=False) - XY_neigh = neigh.kneighbors(Y, 2, return_distance=False) + X_dist, X_neigh = neigh.kneighbors(return_distance=True) + XY_dist, XY_neigh = neigh.kneighbors(Y, return_distance=True) # Expected outcome N1 = np.array( [[1, 4], @@ -129,6 +130,17 @@ def test_masked_unsupervised_kneighbors(): assert_array_equal(X_neigh, N1) assert_array_equal(XY_neigh, N2) + for i in range(X.shape[0]): + assert_array_equal(X_dist[i:i+1], + pairwise_distances(X[i:i+1], X[N1[i]], + metric='masked_euclidean')) + for i in range(Y.shape[0]): + assert_array_equal(XY_dist[i:i+1], + pairwise_distances(Y[i:i+1], X[N2[i]], + metric='masked_euclidean')) + + # smoke test of graph + neigh.kneighbors_graph(X) # Test 2 nan = float("nan") @@ -137,8 +149,8 @@ def test_masked_unsupervised_kneighbors(): metric="masked_euclidean") neigh.fit(samples) - X2_neigh = neigh.kneighbors(n_neighbors=2, return_distance=False) - XY2_neigh = neigh.kneighbors([[0, nan, 1]], 2, return_distance=False) + X2_neigh = neigh.kneighbors(return_distance=False) + XY2_neigh = neigh.kneighbors([[0, nan, 1]], return_distance=False) # Expected outcome N3 = np.array( @@ -160,6 +172,40 @@ def test_masked_unsupervised_kneighbors(): assert_raises_regex(ValueError, msg, neigh.fit, samples) +def test_masked_unsupervised_radius_neighbors(): + X = np.array([[np.nan, 3., 7., np.nan], + [6., 3., 7., 2.], + 
[7., 3., 4., 4.], + [2., 7., 7., 1.], + [np.nan, 2., np.nan, 4.]], dtype=np.float32) + + Y = np.array([[3., 1., 7., np.nan], + [1., 3., 1., 6.], + [np.nan, 1., np.nan, 5.], + [3., 1., 3., 3.], + [2., 3., 1., 9.]], dtype=np.float32) + + neigh = neighbors.NearestNeighbors(radius=4, metric="masked_euclidean") + neigh.fit(X) + X_dist, X_neigh = neigh.radius_neighbors(return_distance=True) + XY_dist, XY_neigh = neigh.radius_neighbors(Y, return_distance=True) + for i in range(X.shape[0]): + if len(X_neigh[i]) == 0: + continue + assert_array_equal(X_dist[i], + pairwise_distances(X[i:i+1], X[X_neigh[i]], + metric='masked_euclidean')[0]) + for i in range(Y.shape[0]): + if len(XY_neigh[i]) == 0: + continue + assert_array_equal(XY_dist[i], + pairwise_distances(Y[i:i+1], X[XY_neigh[i]], + metric='masked_euclidean')[0]) + + # smoke test of graph + neigh.radius_neighbors_graph(X) + + def test_unsupervised_inputs(): # test the types of valid input into NearestNeighbors X = rng.random_sample((10, 3)) From 87677e75243712d63cc29e0c1ee0b8409adc2296 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Thu, 4 Oct 2018 23:59:05 +1000 Subject: [PATCH 96/97] Fix missing_values in masked_euclidean_distances --- sklearn/metrics/pairwise.py | 16 +++++----------- sklearn/metrics/tests/test_pairwise.py | 10 +++++----- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 4580751dcaffa..36f3d1cbd319f 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -23,6 +23,7 @@ from ..utils import check_array from ..utils import gen_even_slices from ..utils import gen_batches, get_chunk_n_rows +from ..utils import is_scalar_nan from ..utils.extmath import row_norms, safe_sparse_dot from ..preprocessing import normalize from ..utils import Parallel @@ -288,7 +289,7 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, def masked_euclidean_distances(X, Y=None, squared=False, - 
missing_values="NaN", copy=True): + missing_values=np.nan, copy=True): """Calculates euclidean distances in the presence of missing values Computes the euclidean distance between each pair of samples (rows) in X @@ -315,7 +316,7 @@ def masked_euclidean_distances(X, Y=None, squared=False, squared : boolean, optional Return squared Euclidean distances. - missing_values : "NaN" or integer, optional + missing_values : np.nan or integer, optional Representation of missing value copy : boolean, optional @@ -352,9 +353,9 @@ def masked_euclidean_distances(X, Y=None, squared=False, paired_distances : distances betweens pairs of elements of X and Y. """ - # NOTE: force_all_finite=False allows not only NaN but also +/- inf + force_all_finite = 'allow-nan' if is_scalar_nan(missing_values) else True X, Y = check_pairwise_arrays(X, Y, accept_sparse=False, - force_all_finite=False, copy=copy) + force_all_finite=force_all_finite, copy=copy) if (np.any(np.isinf(X)) or (Y is not X and np.any(np.isinf(Y)))): raise ValueError( @@ -371,13 +372,6 @@ def masked_euclidean_distances(X, Y=None, squared=False, or (Y is not X and np.any(mask_YT.sum(axis=0) == Y.shape[1])): raise ValueError("One or more rows only contain missing values.") - # else: - if missing_values not in ["NaN", np.nan] and ( - np.any(np.isnan(X)) or (Y is not X and np.any(np.isnan(Y)))): - raise ValueError( - "NaN values present but missing_value = {0}".format( - missing_values)) - # Get mask of non-missing values set Y.T's missing to zero. # Further, casting the mask to int to be used in formula later. 
not_YT = (~mask_YT).astype(np.int32) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index c6035067f1a7f..383b667cb0a13 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -582,8 +582,8 @@ def test_masked_euclidean_distances(): [np.nan, np.nan, 5., 4., 7.], [np.nan, np.nan, np.nan, 4., 5.]]) - D1 = masked_euclidean_distances(X, Y, missing_values="NaN") - D2 = masked_euclidean_distances(X, Y, squared=True, missing_values="NaN") + D1 = masked_euclidean_distances(X, Y, missing_values=np.nan) + D2 = masked_euclidean_distances(X, Y, squared=True, missing_values=np.nan) assert_array_almost_equal(D1**2, D2) @@ -592,7 +592,7 @@ def test_masked_euclidean_distances(): [[40., 48.33333331, 22.5], [25., 25., 45.], [5., 180., 80.]]) - D4 = masked_euclidean_distances(X, Y, squared=True, missing_values="NaN") + D4 = masked_euclidean_distances(X, Y, squared=True, missing_values=np.nan) assert_array_almost_equal(D3, D4) @@ -602,8 +602,8 @@ def test_masked_euclidean_distances(): [[5.0/2.0 * ((7-3)**2 + (2-2)**2)]]) # Check when Y = X is explicitly passed - D5 = masked_euclidean_distances(X, missing_values="NaN") - D6 = masked_euclidean_distances(X, X, missing_values="NaN") + D5 = masked_euclidean_distances(X, missing_values=np.nan) + D6 = masked_euclidean_distances(X, X, missing_values=np.nan) assert_array_almost_equal(D5, D6) # Check with missing_value = 1 while NaN is present From 39e1da88b38b8bf3b08e3fa651925733caf55412 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Fri, 5 Oct 2018 07:44:35 +1000 Subject: [PATCH 97/97] Can't subtract list and set in Py2 --- sklearn/neighbors/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 356840c332ac4..d91ec6b8d6a15 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -47,7 +47,7 @@ VALID_METRICS_SPARSE = dict(ball_tree=[], kd_tree=[], - 
brute=(PAIRWISE_DISTANCE_FUNCTIONS.keys() - + brute=(set(PAIRWISE_DISTANCE_FUNCTIONS.keys()) - {'masked_euclidean'}))