From d707dcd11d8114aacff61e56f5f972205de106dc Mon Sep 17 00:00:00 2001 From: harke Date: Thu, 13 Jul 2017 04:12:24 -0500 Subject: [PATCH 01/19] Modified metrics to enable euclidean distance calculation with missing (NaN) values --- sklearn/metrics/pairwise.py | 142 ++++++++++++++++++++----- sklearn/metrics/tests/test_pairwise.py | 31 ++++++ sklearn/neighbors/base.py | 35 ++++-- 3 files changed, 176 insertions(+), 32 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 0b63653672f51..f83baa2d330b7 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -28,6 +28,13 @@ from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan +# Get mask for missing values +def _get_mask(X, value_to_mask): + """Compute the boolean mask X == missing_values.""" + if value_to_mask == "NaN" or np.isnan(value_to_mask): + return np.isnan(X) + else: + return X == value_to_mask # Utility Functions def _return_float_dtype(X, Y): @@ -54,7 +61,8 @@ def _return_float_dtype(X, Y): return X, Y, dtype -def check_pairwise_arrays(X, Y, precomputed=False, dtype=None): +def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, + copy=False, force_all_finite=True): """ Set X and Y appropriately and checks inputs If Y is None, it is set as a pointer to X (i.e. not a copy). @@ -103,11 +111,14 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None): if Y is X or Y is None: X = Y = check_array(X, accept_sparse='csr', dtype=dtype, + copy=copy, force_all_finite=force_all_finite, warn_on_dtype=warn_on_dtype, estimator=estimator) else: X = check_array(X, accept_sparse='csr', dtype=dtype, + copy=copy, force_all_finite=force_all_finite, warn_on_dtype=warn_on_dtype, estimator=estimator) Y = check_array(Y, accept_sparse='csr', dtype=dtype, + copy=copy, force_all_finite=force_all_finite, warn_on_dtype=warn_on_dtype, estimator=estimator) if precomputed: @@ -160,7 +171,8 @@ def check_paired_arrays(X, Y): # Pairwise distances def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, - X_norm_squared=None): + X_norm_squared=None, kill_missing=True, + missing_values=None, copy=False): """ Considering the rows of X (and Y=X) as vectors, compute the distance matrix between each pair of vectors. @@ -179,6 +191,19 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, the distance matrix returned by this function may not be exactly symmetric as required by, e.g., ``scipy.spatial.distance`` functions. + Additionally, euclidean_distances() can also compute pairwise euclidean + distance for vectors in dense matrices X and Y with missing values in + arbitrary coordinates. The following formula is used for this: + + dist(X, Y) = (X.shape[1] * 1 / ((dot(NX, NYT)))) * + (dot((X * X), NYT) - 2 * (dot(X, Y.T)) + + dot(NX, (Y.T * Y.T))) + + where NX and NYT represent the logical-not of the missing masks of + X and Y.T, respectively.This formulation zero-weights coordinates with + missing value in either vector in the pair and up-weights the remaining + coordinates. + Read more in the :ref:`User Guide `. Parameters @@ -198,6 +223,15 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, Pre-computed dot-products of vectors in X (e.g., ``(X**2).sum(axis=1)``) + kill_missing : boolean, optional + Allow missing values (e.g., NaN) + + missing_values : String, optional + String representation of missing value + + copy : boolean, optional + Make and use a deep copy of X and Y (if it exists) + Returns ------- distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2) @@ -219,34 +253,75 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, -------- paired_distances : distances betweens pairs of elements of X and Y. """ - X, Y = check_pairwise_arrays(X, Y) - if X_norm_squared is not None: - XX = check_array(X_norm_squared) - if XX.shape == (1, X.shape[0]): - XX = XX.T - elif XX.shape != (X.shape[0], 1): - raise ValueError( - "Incompatible dimensions for X and X_norm_squared") + #NOTE: force_all_finite=False allows not only NaN but also inf/-inf + X, Y = check_pairwise_arrays(X, Y, force_all_finite=kill_missing, copy=copy) + if kill_missing is False and \ + (np.any(np.isinf(X.data)) or (Y is not None and np.any(np.isinf(Y.data)))): + raise ValueError( + "+/- Infinite values are not allowed.") + + if kill_missing: + if X_norm_squared is not None: + XX = check_array(X_norm_squared) + if XX.shape == (1, X.shape[0]): + XX = XX.T + elif XX.shape != (X.shape[0], 1): + raise ValueError( + "Incompatible dimensions for X and X_norm_squared") + else: + XX = row_norms(X, squared=True)[:, np.newaxis] + + if X is Y: # shortcut in the common case euclidean_distances(X, X) + YY = XX.T + elif Y_norm_squared is not None: + YY = np.atleast_2d(Y_norm_squared) + + if YY.shape != (1, Y.shape[0]): + raise ValueError( + "Incompatible dimensions for Y and Y_norm_squared") + else: + YY = row_norms(Y, squared=True)[np.newaxis, :] + + distances = safe_sparse_dot(X, Y.T, dense_output=True) + distances *= -2 + distances += XX + distances += YY + np.maximum(distances, 0, out=distances) + else: - XX = row_norms(X, squared=True)[:, np.newaxis] + if missing_values!="NaN" and \ + (np.any(_get_mask(X.data, "NaN")) or np.any(_get_mask(Y.data, "NaN"))): + raise ValueError( + "NaN values present but missing_value = {0}".format(missing_values)) - if X is Y: # shortcut in the common case euclidean_distances(X, X) - YY = XX.T - elif Y_norm_squared is not None: - YY = np.atleast_2d(Y_norm_squared) + # ValueError if X and Y have incompatible dimensions + # if X.shape[1] != Y.shape[1]: + # raise ValueError("The search dimension of the matrices " + # "are not equal: [{0}] versus [{1}]". + # format(X.shape[1], Y.shape[1])) - if YY.shape != (1, Y.shape[0]): - raise ValueError( - "Incompatible dimensions for Y and Y_norm_squared") - else: - YY = row_norms(Y, squared=True)[np.newaxis, :] + # Get missing mask for X + mask_X = _get_mask(X, missing_values) + + # Get Y.T mask and anti-mask and set Y.T's missing to zero + YT = Y.T + mask_YT = _get_mask(YT, missing_values) + NYT = (~mask_YT).astype("int") + YT[mask_YT] = 0 + + #Get X anti-mask and set X's missing to zero + NX = (~mask_X).astype("int") + X[mask_X] = 0 - distances = safe_sparse_dot(X, Y.T, dense_output=True) - distances *= -2 - distances += XX - distances += YY - np.maximum(distances, 0, out=distances) + # Matrix formula to calculate pair-wise distance between all vectors in a + # matrix X to vectors in matrix Y. It zero-weights coordinates with missing value + # in either vector in the pair and up-weights the remaining coordinates. + # Matrix formula derived by: Shreya Bhattarai + + distances = (X.shape[1] * 1 / ((np.dot(NX, NYT)))) * \ + (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + + np.dot(NX, (YT * YT))) if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. @@ -1130,6 +1205,7 @@ def _pairwise_callable(X, Y, metric, **kwds): 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"] +_MISSING_SUPPORTED_METRICS = ['euclidean'] def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): """ Compute the distance matrix from a vector array X and optional Y. @@ -1216,6 +1292,22 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): "Valid metrics are %s, or 'precomputed', or a " "callable" % (metric, _VALID_METRICS)) + if (kwds.get("kill_missing") is False): + if (metric not in _MISSING_SUPPORTED_METRICS): + raise ValueError( + "Metric {0} does not have missing value support ".format( + metric) + ) + if issparse(X) or (Y is not None and issparse(Y)): + raise ValueError( + "Missing value support for sparse matrices not added yet") + if (kwds.get("missing_values") is None): + raise ValueError("Missing value is not defined") + if(np.any(_get_mask(X.data, kwds.get("missing_values")). + sum(axis=1) == X.data.shape[1])): + raise ValueError( + "One or more samples(s) only have missing values.") + if metric == "precomputed": X, _ = check_pairwise_arrays(X, Y, precomputed=True) return X diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index d8b64b58ca481..8b0a42a528810 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -406,6 +406,37 @@ def test_euclidean_distances(): Y_norm_squared=np.zeros_like(Y_norm_sq)) assert_greater(np.max(np.abs(wrong_D - D1)), .01) +def test_euclidean_distances_with_missing(): + # first check that we get right answer with missing values for X + X = np.array([[1., 5., 7., 5., 10.], + [8., 2., 4., np.nan, 8.], + [5., np.nan, 5., np.nan, 1.], + [8., np.nan, np.nan, np.nan, np.nan]]) + D1 = euclidean_distances(X, kill_missing=False, missing_values="NaN") + + D2 = np.array([[0., 9.42072184, 12.97433364, 15.65247584], + [9.42072184, 0., 9.91631652, 0.], + [12.97433364, 9.91631652, 0., 6.70820393], + [15.65247584, 0., 6.70820393, 0.]]) + + assert_array_almost_equal(D1, D2) + + # check with pairs of matrices with missing values + X = np.array([[1., np.nan, 3., 4., 2.], + [np.nan, 4., 6., 1., np.nan], + [3., np.nan, np.nan, np.nan, 1.]]) + + Y = np.array([[np.nan, 7., 7., np.nan, 2.], + [np.nan, np.nan, 5., 4., 7.], + [np.nan, np.nan, np.nan, 4., 5.]]) + + D3 = np.array([[6.32455532, 6.95221787, 4.74341649], + [5., 5., 6.70820393], + [2.23606798, 13.41640786, 8.94427191]]) + + D4 = euclidean_distances(X, Y, kill_missing=False, missing_values="NaN") + + assert_array_almost_equal(D3, D4) def test_cosine_distances(): # Check the pairwise Cosine distances computation diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index e14da8bbc2e97..72f11f608cbcc 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -157,7 +157,7 @@ def _init_params(self, n_neighbors=None, radius=None, self._tree = None self._fit_method = None - def _fit(self, X): + def _fit(self, X, kill_missing=True): if self.metric_params is None: self.effective_metric_params_ = {} else: @@ -201,7 +201,9 @@ def _fit(self, X): self._fit_method = 'kd_tree' return self - X = check_array(X, accept_sparse='csr') + # # copy=True if missing accepted as they will be replaced by 0 + # copy = True if kill_missing is False else False + X = check_array(X, accept_sparse='csr', force_all_finite=kill_missing) n_samples = X.shape[0] if n_samples == 0: @@ -270,7 +272,8 @@ def _pairwise(self): class KNeighborsMixin(object): """Mixin for k-neighbors searches""" - def kneighbors(self, X=None, n_neighbors=None, return_distance=True): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True, + kill_missing=True, missing_values="NaN", copy=None): """Finds the K-neighbors of a point. Returns indices of and distances to the neighbors of each point. @@ -290,6 +293,15 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True): return_distance : boolean, optional. Defaults to True. If False, distances will not be returned + kill_missing : boolean, optional + Allow missing values (e.g., NaN) + + missing_values : String, optional + String representation of missing value + + copy : boolean, optional + Make and use a deep copy of X + Returns ------- dist : array @@ -331,7 +343,9 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - X = check_array(X, accept_sparse='csr') + # copy=True if missing accepted as they will be replaced by 0 + # copy = True if kill_missing is False else False + X = check_array(X, accept_sparse='csr', force_all_finite=kill_missing) else: query_is_train = True X = self._fit_X @@ -349,12 +363,19 @@ class from an array representing our data set and ask who's n_samples, _ = X.shape sample_range = np.arange(n_samples)[:, None] + # copy=True if missing accepted and copy is None + if copy is None: + copy = True if kill_missing is False else False + n_jobs = _get_n_jobs(self.n_jobs) if self._fit_method == 'brute': # for efficiency, use squared euclidean distances if self.effective_metric_ == 'euclidean': dist = pairwise_distances(X, self._fit_X, 'euclidean', - n_jobs=n_jobs, squared=True) + n_jobs=n_jobs, squared=True, + kill_missing=kill_missing, + missing_values=missing_values, + copy=copy) else: dist = pairwise_distances( X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, @@ -791,7 +812,7 @@ def fit(self, X, y): class UnsupervisedMixin(object): - def fit(self, X, y=None): + def fit(self, X, y=None, kill_missing=True): """Fit the model using X as training data Parameters @@ -800,4 +821,4 @@ def fit(self, X, y=None): Training data. If array or matrix, shape [n_samples, n_features], or [n_samples, n_samples] if metric='precomputed'. """ - return self._fit(X) + return self._fit(X, kill_missing) From b4b5ae9a4775d7ceb420fc3bac54994820483897 Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 17 Jul 2017 21:21:43 -0500 Subject: [PATCH 02/19] Changes to ensure Python 2.x compatibility --- sklearn/metrics/pairwise.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index f83baa2d330b7..f9c46b1eb109e 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -307,11 +307,13 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, # Get Y.T mask and anti-mask and set Y.T's missing to zero YT = Y.T mask_YT = _get_mask(YT, missing_values) - NYT = (~mask_YT).astype("int") + NYT = (~mask_YT).astype(np.int8) + # NYT = (~mask_YT) YT[mask_YT] = 0 - #Get X anti-mask and set X's missing to zero - NX = (~mask_X).astype("int") + # Get X anti-mask and set X's missing to zero + NX = (~mask_X).astype(np.int8) + # NX = (~mask_X) X[mask_X] = 0 # Matrix formula to calculate pair-wise distance between all vectors in a @@ -319,9 +321,15 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, # in either vector in the pair and up-weights the remaining coordinates. # Matrix formula derived by: Shreya Bhattarai - distances = (X.shape[1] * 1 / ((np.dot(NX, NYT)))) * \ - (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + - np.dot(NX, (YT * YT))) + # distances = (X.shape[1] * 1 / ((np.dot(NX, NYT)))) * \ + # (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + + # np.dot(NX, (YT * YT))) + + # Above is faster but following for Python 2.x support + distances = np.multiply(np.multiply(X.shape[1], (1.0 / np.dot(NX, NYT))), + (np.dot(np.multiply(X, X), NYT) - + (2.0 * (np.dot(X, YT))) + + np.dot(NX, (np.multiply(YT, YT))))) if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. From 04ed4a0aa452c03d7b09a6a07e820307964d37c9 Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 17 Jul 2017 22:24:15 -0500 Subject: [PATCH 03/19] Fixed pep8 issues --- sklearn/metrics/pairwise.py | 35 ++++++++++++++++---------- sklearn/metrics/tests/test_pairwise.py | 2 ++ sklearn/neighbors/base.py | 3 ++- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index f9c46b1eb109e..a855aa0ad583a 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -28,6 +28,7 @@ from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan + # Get mask for missing values def _get_mask(X, value_to_mask): """Compute the boolean mask X == missing_values.""" @@ -36,6 +37,7 @@ def _get_mask(X, value_to_mask): else: return X == value_to_mask + # Utility Functions def _return_float_dtype(X, Y): """ @@ -254,10 +256,12 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, paired_distances : distances betweens pairs of elements of X and Y. """ - #NOTE: force_all_finite=False allows not only NaN but also inf/-inf - X, Y = check_pairwise_arrays(X, Y, force_all_finite=kill_missing, copy=copy) + # NOTE: force_all_finite=False allows not only NaN but also inf/-inf + X, Y = check_pairwise_arrays(X, Y, + force_all_finite=kill_missing, copy=copy) if kill_missing is False and \ - (np.any(np.isinf(X.data)) or (Y is not None and np.any(np.isinf(Y.data)))): + (np.any(np.isinf(X.data)) or + (Y is not None and np.any(np.isinf(Y.data)))): raise ValueError( "+/- Infinite values are not allowed.") @@ -290,10 +294,12 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, np.maximum(distances, 0, out=distances) else: - if missing_values!="NaN" and \ - (np.any(_get_mask(X.data, "NaN")) or np.any(_get_mask(Y.data, "NaN"))): + if missing_values != "NaN" and \ + (np.any(_get_mask(X.data, "NaN")) or + np.any(_get_mask(Y.data, "NaN"))): raise ValueError( - "NaN values present but missing_value = {0}".format(missing_values)) + "NaN values present but missing_value = {0}". + format(missing_values)) # ValueError if X and Y have incompatible dimensions # if X.shape[1] != Y.shape[1]: @@ -316,17 +322,19 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, # NX = (~mask_X) X[mask_X] = 0 - # Matrix formula to calculate pair-wise distance between all vectors in a - # matrix X to vectors in matrix Y. It zero-weights coordinates with missing value - # in either vector in the pair and up-weights the remaining coordinates. - # Matrix formula derived by: Shreya Bhattarai + # Matrix formula to calculate pair-wise distance between all vectors + # in a matrix X to vectors in matrix Y. It zero-weights coordinates + # with missing value in either vector in the pair and up-weights the + # remaining coordinates. + # Formula derived by: Shreya Bhattarai # distances = (X.shape[1] * 1 / ((np.dot(NX, NYT)))) * \ # (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + # np.dot(NX, (YT * YT))) # Above is faster but following for Python 2.x support - distances = np.multiply(np.multiply(X.shape[1], (1.0 / np.dot(NX, NYT))), + distances = np.multiply(np.multiply(X.shape[1], + (1.0 / np.dot(NX, NYT))), (np.dot(np.multiply(X, X), NYT) - (2.0 * (np.dot(X, YT))) + np.dot(NX, (np.multiply(YT, YT))))) @@ -1215,6 +1223,7 @@ def _pairwise_callable(X, Y, metric, **kwds): _MISSING_SUPPORTED_METRICS = ['euclidean'] + def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): """ Compute the distance matrix from a vector array X and optional Y. @@ -1311,8 +1320,8 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): "Missing value support for sparse matrices not added yet") if (kwds.get("missing_values") is None): raise ValueError("Missing value is not defined") - if(np.any(_get_mask(X.data, kwds.get("missing_values")). - sum(axis=1) == X.data.shape[1])): + if(np.any(_get_mask(X.data, kwds.get("missing_values")).sum( + axis=1) == X.data.shape[1])): raise ValueError( "One or more samples(s) only have missing values.") diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 8b0a42a528810..ddb4e795dd1bc 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -406,6 +406,7 @@ def test_euclidean_distances(): Y_norm_squared=np.zeros_like(Y_norm_sq)) assert_greater(np.max(np.abs(wrong_D - D1)), .01) + def test_euclidean_distances_with_missing(): # first check that we get right answer with missing values for X X = np.array([[1., 5., 7., 5., 10.], @@ -438,6 +439,7 @@ def test_euclidean_distances_with_missing(): assert_array_almost_equal(D3, D4) + def test_cosine_distances(): # Check the pairwise Cosine distances computation rng = np.random.RandomState(1337) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 72f11f608cbcc..e4fa7ffe6db03 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -345,7 +345,8 @@ class from an array representing our data set and ask who's query_is_train = False # copy=True if missing accepted as they will be replaced by 0 # copy = True if kill_missing is False else False - X = check_array(X, accept_sparse='csr', force_all_finite=kill_missing) + X = check_array(X, accept_sparse='csr', + force_all_finite=kill_missing) else: query_is_train = True X = self._fit_X From a6d8ef66b5fcdda9144b04d997b3e8ba8ad4d441 Mon Sep 17 00:00:00 2001 From: harke Date: Wed, 19 Jul 2017 06:22:32 -0500 Subject: [PATCH 04/19] Addressed comments from review --- sklearn/metrics/pairwise.py | 321 ++++++++++++++-------- sklearn/metrics/tests/test_pairwise.py | 18 +- sklearn/neighbors/base.py | 238 +++++++++++++--- sklearn/neighbors/tests/test_neighbors.py | 60 ++++ 4 files changed, 486 insertions(+), 151 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index a855aa0ad583a..f01f4fd36ab33 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -94,6 +94,13 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, .. versionadded:: 0.18 + copy : bool + Create and return a deep copy of X and Y (if Y exists) + + force_all_finite : bool + Throw a ValueError exception if either X or Y (if Y exists) + contains any NaN or +/- inf values + Returns ------- safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features) @@ -173,38 +180,119 @@ def check_paired_arrays(X, Y): # Pairwise distances def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, - X_norm_squared=None, kill_missing=True, - missing_values=None, copy=False): + X_norm_squared=None): """ Considering the rows of X (and Y=X) as vectors, compute the distance matrix between each pair of vectors. - For efficiency reasons, the euclidean distance between a pair of row vector x and y is computed as:: - dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) - This formulation has two advantages over other ways of computing distances. First, it is computationally efficient when dealing with sparse data. Second, if one argument varies but the other remains unchanged, then `dot(x, x)` and/or `dot(y, y)` can be pre-computed. - However, this is not the most precise way of doing this computation, and the distance matrix returned by this function may not be exactly symmetric as required by, e.g., ``scipy.spatial.distance`` functions. + Read more in the :ref:`User Guide `. + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples_1, n_features) + Y : {array-like, sparse matrix}, shape (n_samples_2, n_features) + Y_norm_squared : array-like, shape (n_samples_2, ), optional + Pre-computed dot-products of vectors in Y (e.g., + ``(Y**2).sum(axis=1)``) + squared : boolean, optional + Return squared Euclidean distances. + X_norm_squared : array-like, shape = [n_samples_1], optional + Pre-computed dot-products of vectors in X (e.g., + ``(X**2).sum(axis=1)``) + Returns + ------- + distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2) + Examples + -------- + >>> from sklearn.metrics.pairwise import euclidean_distances + >>> X = [[0, 1], [1, 1]] + >>> # distance between rows of X + >>> euclidean_distances(X, X) + array([[ 0., 1.], + [ 1., 0.]]) + >>> # get distance to origin + >>> euclidean_distances(X, [[0, 0]]) + array([[ 1. ], + [ 1.41421356]]) + See also + -------- + paired_distances : distances betweens pairs of elements of X and Y. + """ + X, Y = check_pairwise_arrays(X, Y) + + if X_norm_squared is not None: + XX = check_array(X_norm_squared) + if XX.shape == (1, X.shape[0]): + XX = XX.T + elif XX.shape != (X.shape[0], 1): + raise ValueError( + "Incompatible dimensions for X and X_norm_squared") + else: + XX = row_norms(X, squared=True)[:, np.newaxis] + + if X is Y: # shortcut in the common case euclidean_distances(X, X) + YY = XX.T + elif Y_norm_squared is not None: + YY = np.atleast_2d(Y_norm_squared) + + if YY.shape != (1, Y.shape[0]): + raise ValueError( + "Incompatible dimensions for Y and Y_norm_squared") + else: + YY = row_norms(Y, squared=True)[np.newaxis, :] + + distances = safe_sparse_dot(X, Y.T, dense_output=True) + distances *= -2 + distances += XX + distances += YY + np.maximum(distances, 0, out=distances) + + if X is Y: + # Ensure that distances between vectors and themselves are set to 0.0. + # This may not be the case due to floating point rounding errors. + distances.flat[::distances.shape[0] + 1] = 0.0 + + return distances if squared else np.sqrt(distances, out=distances) + - Additionally, euclidean_distances() can also compute pairwise euclidean - distance for vectors in dense matrices X and Y with missing values in - arbitrary coordinates. The following formula is used for this: +# Pairwise distances in the presence of missing values +def masked_euclidean_distances(X, Y=None, squared=False, + missing_values="NaN", copy=True, **kwargs): + """ + Considering the rows of X (and Y=X) as vectors, compute the + distance matrix between each pair of vectors. Similarly, if + Y is not X, then compute the distance matrix between each + pair of vectors (i.e., each row pair) in X and Y. + + This function computes pairwise euclidean distance for vectors + in dense matrices X and Y with missing values in arbitrary + coordinates. The following formula is used for this: dist(X, Y) = (X.shape[1] * 1 / ((dot(NX, NYT)))) * (dot((X * X), NYT) - 2 * (dot(X, Y.T)) + dot(NX, (Y.T * Y.T))) where NX and NYT represent the logical-not of the missing masks of - X and Y.T, respectively.This formulation zero-weights coordinates with - missing value in either vector in the pair and up-weights the remaining - coordinates. + X and Y.T, respectively.This formulation zero-weights feature coordinates + with missing value in either vector in the pair and up-weights the + remaining coordinates. + Formula derived by: Shreya Bhattarai + + Breakdown of euclidean distance calculation between a vector pair x,y: + + weight = Total # of coordinates / # of non-missing coordinates + dist(x,y) = sqrt(weight * sq. distance from non-missing coordinates) + + This of course implies that if all coordinates are missing in either + vector in the pair then NaN is returned for that pair. Read more in the :ref:`User Guide `. @@ -214,25 +302,14 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, Y : {array-like, sparse matrix}, shape (n_samples_2, n_features) - Y_norm_squared : array-like, shape (n_samples_2, ), optional - Pre-computed dot-products of vectors in Y (e.g., - ``(Y**2).sum(axis=1)``) - squared : boolean, optional Return squared Euclidean distances. - X_norm_squared : array-like, shape = [n_samples_1], optional - Pre-computed dot-products of vectors in X (e.g., - ``(X**2).sum(axis=1)``) - - kill_missing : boolean, optional - Allow missing values (e.g., NaN) - - missing_values : String, optional - String representation of missing value + missing_values : "NaN" or integer, optional (default=”NaN”) + Representation of missing value copy : boolean, optional - Make and use a deep copy of X and Y (if it exists) + Make and use a deep copy of X and Y (if Y exists) Returns ------- @@ -240,14 +317,15 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, Examples -------- - >>> from sklearn.metrics.pairwise import euclidean_distances - >>> X = [[0, 1], [1, 1]] + >>> from sklearn.metrics.pairwise import masked_euclidean_distances + >>> nan = float("NaN") + >>> X = [[0, 1], [1, nan]] >>> # distance between rows of X - >>> euclidean_distances(X, X) - array([[ 0., 1.], - [ 1., 0.]]) + >>> masked_euclidean_distances(X, X) + array([[ 0., 1.41421356], + [ 1.41421356, 0.]]) >>> # get distance to origin - >>> euclidean_distances(X, [[0, 0]]) + >>> masked_euclidean_distances(X, [[0, 0]]) array([[ 1. ], [ 1.41421356]]) @@ -255,89 +333,86 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, -------- paired_distances : distances betweens pairs of elements of X and Y. """ + # Check and except sparse matrices + if issparse(X) or (Y is not None and issparse(Y)): + raise ValueError( + "Missing value support for sparse matrices not added yet") # NOTE: force_all_finite=False allows not only NaN but also inf/-inf X, Y = check_pairwise_arrays(X, Y, - force_all_finite=kill_missing, copy=copy) - if kill_missing is False and \ - (np.any(np.isinf(X.data)) or - (Y is not None and np.any(np.isinf(Y.data)))): + force_all_finite=False, copy=copy) + if (np.any(np.isinf(X.data)) or + (Y is not None and np.any(np.isinf(Y.data)))): raise ValueError( "+/- Infinite values are not allowed.") - if kill_missing: - if X_norm_squared is not None: - XX = check_array(X_norm_squared) - if XX.shape == (1, X.shape[0]): - XX = XX.T - elif XX.shape != (X.shape[0], 1): - raise ValueError( - "Incompatible dimensions for X and X_norm_squared") - else: - XX = row_norms(X, squared=True)[:, np.newaxis] - - if X is Y: # shortcut in the common case euclidean_distances(X, X) - YY = XX.T - elif Y_norm_squared is not None: - YY = np.atleast_2d(Y_norm_squared) - - if YY.shape != (1, Y.shape[0]): - raise ValueError( - "Incompatible dimensions for Y and Y_norm_squared") - else: - YY = row_norms(Y, squared=True)[np.newaxis, :] - - distances = safe_sparse_dot(X, Y.T, dense_output=True) - distances *= -2 - distances += XX - distances += YY - np.maximum(distances, 0, out=distances) + # Check if any rows have only missing value + if np.any(_get_mask(X, missing_values).sum(axis=1) == X.shape[1])\ + or (Y is not None and np.any(_get_mask(Y, missing_values).sum( + axis=1) == Y.shape[1])): + raise ValueError("One or more rows only contain missing values.") + # + # if kill_missing: + # if X_norm_squared is not None: + # XX = check_array(X_norm_squared) + # if XX.shape == (1, X.shape[0]): + # XX = XX.T + # elif XX.shape != (X.shape[0], 1): + # raise ValueError( + # "Incompatible dimensions for X and X_norm_squared") + # else: + # XX = row_norms(X, squared=True)[:, np.newaxis] + # + # if X is Y: # shortcut in the common case euclidean_distances(X, X) + # YY = XX.T + # elif Y_norm_squared is not None: + # YY = np.atleast_2d(Y_norm_squared) + # + # if YY.shape != (1, Y.shape[0]): + # raise ValueError( + # "Incompatible dimensions for Y and Y_norm_squared") + # else: + # YY = row_norms(Y, squared=True)[np.newaxis, :] + # + # distances = safe_sparse_dot(X, Y.T, dense_output=True) + # distances *= -2 + # distances += XX + # distances += YY + # np.maximum(distances, 0, out=distances) + + # else: + if missing_values != "NaN" and \ + (np.any(_get_mask(X.data, "NaN")) or + np.any(_get_mask(Y.data, "NaN"))): + raise ValueError( + "NaN values present but missing_value = {0}".format( + missing_values)) - else: - if missing_values != "NaN" and \ - (np.any(_get_mask(X.data, "NaN")) or - np.any(_get_mask(Y.data, "NaN"))): - raise ValueError( - "NaN values present but missing_value = {0}". - format(missing_values)) - - # ValueError if X and Y have incompatible dimensions - # if X.shape[1] != Y.shape[1]: - # raise ValueError("The search dimension of the matrices " - # "are not equal: [{0}] versus [{1}]". - # format(X.shape[1], Y.shape[1])) - - # Get missing mask for X - mask_X = _get_mask(X, missing_values) - - # Get Y.T mask and anti-mask and set Y.T's missing to zero - YT = Y.T - mask_YT = _get_mask(YT, missing_values) - NYT = (~mask_YT).astype(np.int8) - # NYT = (~mask_YT) - YT[mask_YT] = 0 - - # Get X anti-mask and set X's missing to zero - NX = (~mask_X).astype(np.int8) - # NX = (~mask_X) - X[mask_X] = 0 - - # Matrix formula to calculate pair-wise distance between all vectors - # in a matrix X to vectors in matrix Y. It zero-weights coordinates - # with missing value in either vector in the pair and up-weights the - # remaining coordinates. - # Formula derived by: Shreya Bhattarai - - # distances = (X.shape[1] * 1 / ((np.dot(NX, NYT)))) * \ - # (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + - # np.dot(NX, (YT * YT))) - - # Above is faster but following for Python 2.x support - distances = np.multiply(np.multiply(X.shape[1], - (1.0 / np.dot(NX, NYT))), - (np.dot(np.multiply(X, X), NYT) - - (2.0 * (np.dot(X, YT))) + - np.dot(NX, (np.multiply(YT, YT))))) + # Get missing mask for X + mask_X = _get_mask(X, missing_values) + + # Get Y.T mask and anti-mask and set Y.T's missing to zero + YT = Y.T + mask_YT = _get_mask(YT, missing_values) + NYT = (~mask_YT).astype(np.int8) + YT[mask_YT] = 0 + + # Get X anti-mask and set X's missing to zero + NX = (~mask_X).astype(np.int8) + X[mask_X] = 0 + + # Calculate distances + + # distances = (X.shape[1] * 1 / ((np.dot(NX, NYT)))) * \ + # (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + + # np.dot(NX, (YT * YT))) + + # Above is faster but following for Python 2.x support + distances = np.multiply(np.multiply(X.shape[1], + (1.0 / np.dot(NX, NYT))), + (np.dot(np.multiply(X, X), NYT) - + (2.0 * (np.dot(X, YT))) + + np.dot(NX, (np.multiply(YT, YT))))) if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. @@ -1133,6 +1208,11 @@ def chi2_kernel(X, Y=None, gamma=1.): 'precomputed': None, # HACK: precomputed is always allowed, never called } +# Helper functions with missing value support - distance +MASKED_PAIRWISE_DISTANCE_FUNCTIONS = { + 'euclidean': masked_euclidean_distances, +} + def distance_metrics(): """Valid metrics for pairwise_distances. @@ -1221,7 +1301,7 @@ def _pairwise_callable(X, Y, metric, **kwds): 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"] -_MISSING_SUPPORTED_METRICS = ['euclidean'] +_MASKED_SUPPORTED_METRICS = ['euclidean'] def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): @@ -1309,8 +1389,13 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): "Valid metrics are %s, or 'precomputed', or a " "callable" % (metric, _VALID_METRICS)) - if (kwds.get("kill_missing") is False): - if (metric not in _MISSING_SUPPORTED_METRICS): + # To handle kill_missing = False + kill_missing = kwds.get("kill_missing") + if not kill_missing and kill_missing is not None: + missing_values = kwds.get("missing_values") if kwds.get( + "missing_values") is not None else np.nan + + if (metric not in _MASKED_SUPPORTED_METRICS): raise ValueError( "Metric {0} does not have missing value support ".format( metric) @@ -1318,18 +1403,22 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): if issparse(X) or (Y is not None and issparse(Y)): raise ValueError( "Missing value support for sparse matrices not added yet") - if (kwds.get("missing_values") is None): - raise ValueError("Missing value is not defined") - if(np.any(_get_mask(X.data, kwds.get("missing_values")).sum( - axis=1) == X.data.shape[1])): + # if (kwds.get("missing_values") is None): + # raise ValueError("Missing value is not defined") + if(np.any(_get_mask(X, missing_values).sum(axis=1) == X.shape[1])): raise ValueError( "One or more samples(s) only have missing values.") + # if type(metric) is str: + # metric = "masked_" + metric if metric == "precomputed": X, _ = check_pairwise_arrays(X, Y, precomputed=True) return X + elif kill_missing is False and metric in \ + MASKED_PAIRWISE_DISTANCE_FUNCTIONS: + func = MASKED_PAIRWISE_DISTANCE_FUNCTIONS[metric] elif metric in PAIRWISE_DISTANCE_FUNCTIONS: - func = PAIRWISE_DISTANCE_FUNCTIONS[metric] + func = PAIRWISE_DISTANCE_FUNCTIONS[metric] elif callable(metric): func = partial(_pairwise_callable, metric=metric, **kwds) else: diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index ddb4e795dd1bc..8732bf5e6d70a 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -17,6 +17,7 @@ from sklearn.externals.six import iteritems from sklearn.metrics.pairwise import euclidean_distances +from sklearn.metrics.pairwise import masked_euclidean_distances from sklearn.metrics.pairwise import manhattan_distances from sklearn.metrics.pairwise import linear_kernel from sklearn.metrics.pairwise import chi2_kernel, additive_chi2_kernel @@ -56,6 +57,17 @@ def test_pairwise_distances(): S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) + # Euclidean dist. (masked) should be equivalent to calling the function. + X = rng.random_sample((5, 4)) + S = pairwise_distances(X, metric="euclidean", kill_missing=False) + S2 = masked_euclidean_distances(X) + assert_array_almost_equal(S, S2) + # Euclidean distance, with Y != X. + Y = rng.random_sample((2, 4)) + S = pairwise_distances(X, Y, metric="euclidean", + kill_missing=False) + S2 = masked_euclidean_distances(X, Y) + assert_array_almost_equal(S, S2) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) @@ -407,13 +419,13 @@ def test_euclidean_distances(): assert_greater(np.max(np.abs(wrong_D - D1)), .01) -def test_euclidean_distances_with_missing(): +def test_masked_euclidean_distances(): # first check that we get right answer with missing values for X X = np.array([[1., 5., 7., 5., 10.], [8., 2., 4., np.nan, 8.], [5., np.nan, 5., np.nan, 1.], [8., np.nan, np.nan, np.nan, np.nan]]) - D1 = euclidean_distances(X, kill_missing=False, missing_values="NaN") + D1 = masked_euclidean_distances(X, missing_values="NaN") D2 = np.array([[0., 9.42072184, 12.97433364, 15.65247584], [9.42072184, 0., 9.91631652, 0.], @@ -435,7 +447,7 @@ def test_euclidean_distances_with_missing(): [5., 5., 6.70820393], [2.23606798, 13.41640786, 8.94427191]]) - D4 = euclidean_distances(X, Y, kill_missing=False, missing_values="NaN") + D4 = masked_euclidean_distances(X, Y, missing_values="NaN") assert_array_almost_equal(D3, D4) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index e4fa7ffe6db03..b21c058202ecf 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -17,6 +17,7 @@ from ..base import BaseEstimator from ..metrics import pairwise_distances from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS +from ..metrics.pairwise import _MASKED_SUPPORTED_METRICS from ..utils import check_X_y, check_array, _get_n_jobs, gen_even_slices from ..utils.multiclass import check_classification_targets from ..externals import six @@ -158,6 +159,25 @@ def _init_params(self, n_neighbors=None, radius=None, self._fit_method = None def _fit(self, X, kill_missing=True): + if not kill_missing: + if self.metric not in _MASKED_SUPPORTED_METRICS: + raise ValueError( + "Metric {0} is currently not supported for " + "data containing missing values.".format(self.metric) + ) + + _MASKED_SUPPORTED_ALGORITHMS = ["brute"] + if self.algorithm not in _MASKED_SUPPORTED_ALGORITHMS: + if self.algorithm == "auto": + pass + else: + warnings.warn( + "{0} algorithm is currently not supported for " + "data containing missing values. " + "Reverting to a supported algorithm.". + format(self.algorithm)) + self.algorithm = _MASKED_SUPPORTED_ALGORITHMS[0] + if self.metric_params is None: self.effective_metric_params_ = {} else: @@ -203,23 +223,30 @@ def _fit(self, X, kill_missing=True): # # copy=True if missing accepted as they will be replaced by 0 # copy = True if kill_missing is False else False - X = check_array(X, accept_sparse='csr', force_all_finite=kill_missing) + X = check_array(X, accept_sparse='csr', + force_all_finite=kill_missing) n_samples = X.shape[0] if n_samples == 0: raise ValueError("n_samples must be greater than 0") if issparse(X): - if self.algorithm not in ('auto', 'brute'): - warnings.warn("cannot use tree with sparse input: " - "using brute force") - if self.effective_metric_ not in VALID_METRICS_SPARSE['brute']: - raise ValueError("metric '%s' not valid for sparse input" - % self.effective_metric_) - self._fit_X = X.copy() - self._tree = None - self._fit_method = 'brute' - return self + if not kill_missing: + raise ValueError( + "Nearest neighbor algorithm does not currently support" + "the use of sparse matrices." + ) + else: + if self.algorithm not in ('auto', 'brute'): + warnings.warn("cannot use tree with sparse input: " + "using brute force") + if self.effective_metric_ not in VALID_METRICS_SPARSE['brute']: + raise ValueError("metric '%s' not valid for sparse input" + % self.effective_metric_) + self._fit_X = X.copy() + self._tree = None + self._fit_method = 'brute' + return self self._fit_method = self.algorithm self._fit_X = X @@ -272,8 +299,7 @@ def _pairwise(self): class KNeighborsMixin(object): """Mixin for k-neighbors searches""" - def kneighbors(self, X=None, n_neighbors=None, return_distance=True, - kill_missing=True, missing_values="NaN", copy=None): + def kneighbors(self, X=None, n_neighbors=None, return_distance=True): """Finds the K-neighbors of a point. Returns indices of and distances to the neighbors of each point. @@ -293,15 +319,6 @@ def kneighbors(self, X=None, n_neighbors=None, return_distance=True, return_distance : boolean, optional. Defaults to True. If False, distances will not be returned - kill_missing : boolean, optional - Allow missing values (e.g., NaN) - - missing_values : String, optional - String representation of missing value - - copy : boolean, optional - Make and use a deep copy of X - Returns ------- dist : array @@ -345,8 +362,7 @@ class from an array representing our data set and ask who's query_is_train = False # copy=True if missing accepted as they will be replaced by 0 # copy = True if kill_missing is False else False - X = check_array(X, accept_sparse='csr', - force_all_finite=kill_missing) + X = check_array(X, accept_sparse='csr') else: query_is_train = True X = self._fit_X @@ -364,19 +380,12 @@ class from an array representing our data set and ask who's n_samples, _ = X.shape sample_range = np.arange(n_samples)[:, None] - # copy=True if missing accepted and copy is None - if copy is None: - copy = True if kill_missing is False else False - n_jobs = _get_n_jobs(self.n_jobs) if self._fit_method == 'brute': # for efficiency, use squared euclidean distances if self.effective_metric_ == 'euclidean': dist = pairwise_distances(X, self._fit_X, 'euclidean', - n_jobs=n_jobs, squared=True, - kill_missing=kill_missing, - missing_values=missing_values, - copy=copy) + n_jobs=n_jobs, squared=True) else: dist = pairwise_distances( X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, @@ -443,6 +452,171 @@ class from an array representing our data set and ask who's return dist, neigh_ind return neigh_ind + def masked_kneighbors(self, X=None, n_neighbors=None, return_distance=True, + missing_values="NaN", copy=True): + """Finds the K-neighbors of a point, even when they contain NaN values. + + Returns indices of and distances to the neighbors of each point. + + Parameters + ---------- + X : array-like, shape (n_query, n_features), \ + or (n_query, n_indexed) if metric == 'precomputed' + The query point or points. + If not provided, neighbors of each indexed point are returned. + In this case, the query point is not considered its own neighbor. + + n_neighbors : int + Number of neighbors to get (default is the value + passed to the constructor). + + return_distance : boolean, optional. Defaults to True. + If False, distances will not be returned + + missing_values : "NaN" or integer, optional. Default is "NaN". + Representation of missing value + + copy : boolean, optional. Default is True. + Create and use a deep copy of X + + Returns + ------- + dist : array + Array representing the lengths to points, only present if + return_distance=True + + ind : array + Indices of the nearest points in the population matrix. + + Examples + -------- + In the following example, we construct a NeighborsClassifier + class from an array representing our data set and ask who's + the closest point to [0, nan, 1], where "nan" represents a + missing value. + >>> nan = float("nan") + >>> samples = [[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]] + >>> from sklearn.neighbors import NearestNeighbors + >>> neigh = NearestNeighbors(n_neighbors=2, metric="euclidean") + >>> neigh.fit(samples, kill_missing=False) # doctest: +ELLIPSIS + NearestNeighbors(algorithm='auto', leaf_size=30,...) + >>> print(neigh.masked_kneighbors(n_neighbors=2, + >>> return_distance=False)) # doctest: +ELLIPSIS + (array([[3, 1], [3, 2], [3, 1], [2, 1]])...) + + >>> X = [[0, nan, 1]] + >>> neigh.masked_kneighbors([[0, nan, 1]], 2, + >>> return_distance=False) # doctest: +ELLIPSIS + (array([[1, 3]])...) + """ + if self._fit_method is None: + raise NotFittedError("Must fit neighbors before querying.") + + if n_neighbors is None: + n_neighbors = self.n_neighbors + + if X is not None: + query_is_train = False + X = check_array(X, accept_sparse='csr', + force_all_finite=False, copy=copy) + else: + query_is_train = True + X = self._fit_X + # Include an extra neighbor to account for the sample itself being + # returned, which is removed later + n_neighbors += 1 + + train_size = self._fit_X.shape[0] + if n_neighbors > train_size: + raise ValueError( + "Expected n_neighbors <= n_samples, " + " but n_samples = %d, n_neighbors = %d" % + (train_size, n_neighbors) + ) + n_samples, _ = X.shape + sample_range = np.arange(n_samples)[:, None] + + n_jobs = _get_n_jobs(self.n_jobs) + if self._fit_method == 'brute': + # for efficiency, use squared euclidean distances + if self.effective_metric_ == 'euclidean': + dist = pairwise_distances(X, self._fit_X, 'euclidean', + n_jobs=n_jobs, squared=True, + kill_missing=False, + missing_values=missing_values, + copy=copy) + else: + # dist = pairwise_distances( + # X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, + # **self.effective_metric_params_) + raise ValueError( + "Only the following metrics are currently supported for " + "data with missing values:{0}". + format(_MASKED_SUPPORTED_METRICS) + ) + neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1) + neigh_ind = neigh_ind[:, :n_neighbors] + # argpartition doesn't guarantee sorted order, so we sort again + neigh_ind = neigh_ind[ + sample_range, np.argsort(dist[sample_range, neigh_ind])] + + if return_distance: + if self.effective_metric_ == 'euclidean': + result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind + else: + result = dist[sample_range, neigh_ind], neigh_ind + else: + result = neigh_ind + + # elif self._fit_method in ['ball_tree', 'kd_tree']: + # if issparse(X): + # raise ValueError( + # "%s does not work with sparse matrices." + # "Densify the data, " + # "or set algorithm='brute'" % self._fit_method) + # result = Parallel(n_jobs, backend='threading')( + # delayed(self._tree.query, check_pickle=False)( + # X[s], n_neighbors, return_distance) + # for s in gen_even_slices(X.shape[0], n_jobs) + # ) + # if return_distance: + # dist, neigh_ind = tuple(zip(*result)) + # result = np.vstack(dist), np.vstack(neigh_ind) + # else: + # result = np.vstack(result) + else: + raise ValueError("internal: _fit_method not recognized for data " + "containing missing") + + if not query_is_train: + return result + else: + # If the query data is the same as the indexed data, we would like + # to ignore the first nearest neighbor of every sample, i.e + # the sample itself. + if return_distance: + dist, neigh_ind = result + else: + neigh_ind = result + + sample_mask = neigh_ind != sample_range + + # Corner case: When the number of duplicates are more + # than the number of neighbors, the first NN will not + # be the sample, but a duplicate. + # In that case mask the first duplicate. + dup_gr_nbrs = np.all(sample_mask, axis=1) + sample_mask[:, 0][dup_gr_nbrs] = False + + neigh_ind = np.reshape( + neigh_ind[sample_mask], (n_samples, n_neighbors - 1)) + + if return_distance: + dist = np.reshape( + dist[sample_mask], (n_samples, n_neighbors - 1)) + return dist, neigh_ind + return neigh_ind + def kneighbors_graph(self, X=None, n_neighbors=None, mode='connectivity'): """Computes the (weighted) graph of k-Neighbors for points in X diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 052c83c71d2e7..f772e8b963e14 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -88,6 +88,66 @@ def test_unsupervised_kneighbors(n_samples=20, n_features=5, assert_array_almost_equal(results[i][1], results[i + 1][1]) +def test_masked_unsupervised_kneighbors(): + # Test 1 + X = np.array([[np.nan, 3., 7., np.nan], + [6., 3., 7., 2.], + [7., 3., 4., 4.], + [2., 7., 7., 1.], + [np.nan, 2., np.nan, 4.]], dtype=np.float32) + + Y = np.array([[3., 1., 7., np.nan], + [1., 3., 1., 6.], + [np.nan, 1., np.nan, 5.], + [3., 1., 3., 3.], + [2., 3., 1., 9.]], dtype=np.float32) + + neigh = neighbors.NearestNeighbors(2, metric="euclidean") + neigh.fit(X, kill_missing=False) + X_neigh = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) + XY_neigh = neigh.masked_kneighbors(Y, 2, return_distance=False) + + # Expected outcome + N1 = np.array( + [[1, 4], + [0, 4], + [4, 1], + [0, 1], + [2, 0]]) + + N2 = np.array( + [[4, 0], + [4, 2], + [4, 2], + [4, 2], + [4, 2]]) + + assert_array_equal(X_neigh, N1) + assert_array_equal(XY_neigh, N2) + + # Test 2 + nan = float("nan") + samples = [[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]] + neigh = neighbors.NearestNeighbors(n_neighbors=2, metric="euclidean") + + neigh.fit(samples, kill_missing=False) + X2_neigh = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) + + XY2_neigh = neigh.masked_kneighbors([[0, nan, 1]], 2, + return_distance=False) + + # Expected outcome + N3 = np.array( + [[3, 1], + [3, 2], + [3, 1], + [2, 1]]) + N4 = np.array([[1, 3]]) + + assert_array_equal(X2_neigh, N3) + assert_array_equal(XY2_neigh, N4) + + def test_unsupervised_inputs(): # test the types of valid input into NearestNeighbors X = rng.random_sample((10, 3)) From e4f8612ffa61d176059f52f1beca42a674ffb94b Mon Sep 17 00:00:00 2001 From: harke Date: Wed, 19 Jul 2017 07:12:43 -0500 Subject: [PATCH 05/19] Docstring example issues --- sklearn/metrics/pairwise.py | 4 +++- sklearn/neighbors/base.py | 9 +++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index f01f4fd36ab33..aa8d3c4140bd9 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -222,9 +222,10 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, >>> euclidean_distances(X, [[0, 0]]) array([[ 1. ], [ 1.41421356]]) + See also -------- - paired_distances : distances betweens pairs of elements of X and Y. + paired_distances : distances between pairs of elements of X and Y. """ X, Y = check_pairwise_arrays(X, Y) @@ -324,6 +325,7 @@ def masked_euclidean_distances(X, Y=None, squared=False, >>> masked_euclidean_distances(X, X) array([[ 0., 1.41421356], [ 1.41421356, 0.]]) + >>> # get distance to origin >>> masked_euclidean_distances(X, [[0, 0]]) array([[ 1. ], diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index b21c058202ecf..cf3189afac42a 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -500,14 +500,15 @@ class from an array representing our data set and ask who's >>> neigh = NearestNeighbors(n_neighbors=2, metric="euclidean") >>> neigh.fit(samples, kill_missing=False) # doctest: +ELLIPSIS NearestNeighbors(algorithm='auto', leaf_size=30,...) - >>> print(neigh.masked_kneighbors(n_neighbors=2, - >>> return_distance=False)) # doctest: +ELLIPSIS + >>> N = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) + >>> print(N) # doctest: +ELLIPSIS (array([[3, 1], [3, 2], [3, 1], [2, 1]])...) >>> X = [[0, nan, 1]] - >>> neigh.masked_kneighbors([[0, nan, 1]], 2, - >>> return_distance=False) # doctest: +ELLIPSIS + >>> N2 = neigh.masked_kneighbors(X, 2, return_distance=False) + >>> print(N2) # doctest: +ELLIPSIS (array([[1, 3]])...) + """ if self._fit_method is None: raise NotFittedError("Must fit neighbors before querying.") From daf247f7db0f3e4e8fca798cb18a070a78bcdc2a Mon Sep 17 00:00:00 2001 From: harke Date: Wed, 19 Jul 2017 14:05:09 -0500 Subject: [PATCH 06/19] Formatting fixes on docstring --- sklearn/metrics/pairwise.py | 4 ++-- sklearn/neighbors/base.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index aa8d3c4140bd9..423a082085648 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -323,8 +323,8 @@ def masked_euclidean_distances(X, Y=None, squared=False, >>> X = [[0, 1], [1, nan]] >>> # distance between rows of X >>> masked_euclidean_distances(X, X) - array([[ 0., 1.41421356], - [ 1.41421356, 0.]]) + array([[ 0. , 1.41421356], + [ 1.41421356, 0. ]]) >>> # get distance to origin >>> masked_euclidean_distances(X, [[0, 0]]) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index cf3189afac42a..79bda40e4f1c4 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -502,12 +502,15 @@ class from an array representing our data set and ask who's NearestNeighbors(algorithm='auto', leaf_size=30,...) >>> N = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) >>> print(N) # doctest: +ELLIPSIS - (array([[3, 1], [3, 2], [3, 1], [2, 1]])...) + [[3 1] + [3 2] + [3 1] + [2 1]] >>> X = [[0, nan, 1]] >>> N2 = neigh.masked_kneighbors(X, 2, return_distance=False) >>> print(N2) # doctest: +ELLIPSIS - (array([[1, 3]])...) + [[1 3]] """ if self._fit_method is None: From 10f5adb99a5ae7b681203e19d58888eeb055f6cf Mon Sep 17 00:00:00 2001 From: harke Date: Wed, 19 Jul 2017 15:25:44 -0500 Subject: [PATCH 07/19] And yet more fixes --- sklearn/neighbors/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 79bda40e4f1c4..0a1abc006697e 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -503,9 +503,9 @@ class from an array representing our data set and ask who's >>> N = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) >>> print(N) # doctest: +ELLIPSIS [[3 1] - [3 2] - [3 1] - [2 1]] + [3 2] + [3 1] + [2 1]] >>> X = [[0, nan, 1]] >>> N2 = neigh.masked_kneighbors(X, 2, return_distance=False) From 22cf9ef72b2a5a317f4e754d22bad6c89962c055 Mon Sep 17 00:00:00 2001 From: harke Date: Sun, 23 Jul 2017 00:07:15 -0500 Subject: [PATCH 08/19] Addressed review comments (Part 2) --- sklearn/metrics/pairwise.py | 190 +++++++++++----------- sklearn/metrics/tests/test_pairwise.py | 5 +- sklearn/neighbors/base.py | 61 +++---- sklearn/neighbors/tests/test_neighbors.py | 12 +- 4 files changed, 135 insertions(+), 133 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 423a082085648..05fe4bd3dbb23 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -8,7 +8,7 @@ # Lars Buitinck # Joel Nothman # License: BSD 3 clause - +from __future__ import division import itertools from functools import partial @@ -64,7 +64,8 @@ def _return_float_dtype(X, Y): def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, - copy=False, force_all_finite=True): + accept_sparse='csr', force_all_finite=True, + copy=False): """ Set X and Y appropriately and checks inputs If Y is None, it is set as a pointer to X (i.e. not a copy). @@ -94,12 +95,24 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, .. versionadded:: 0.18 - copy : bool - Create and return a deep copy of X and Y (if Y exists) + accept_sparse : string, boolean or list/tuple of strings + String[s] representing allowed sparse matrix formats, such as 'csc', + 'csr', etc. If the input is sparse but not in the allowed format, + it will be converted to the first listed format. True allows the input + to be any format. False means that a sparse matrix input will + raise an error. + + .. deprecated:: 0.19 + Passing 'None' to parameter ``accept_sparse`` in methods is + deprecated in version 0.19 "and will be removed in 0.21. Use + ``accept_sparse=False`` instead. force_all_finite : bool - Throw a ValueError exception if either X or Y (if Y exists) - contains any NaN or +/- inf values + Whether to raise an error on np.inf and np.nan in X (or Y if it exists) + + copy : bool + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. Returns ------- @@ -119,14 +132,14 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, dtype = dtype_float if Y is X or Y is None: - X = Y = check_array(X, accept_sparse='csr', dtype=dtype, + X = Y = check_array(X, accept_sparse=accept_sparse, dtype=dtype, copy=copy, force_all_finite=force_all_finite, warn_on_dtype=warn_on_dtype, estimator=estimator) else: - X = check_array(X, accept_sparse='csr', dtype=dtype, + X = check_array(X, accept_sparse=accept_sparse, dtype=dtype, copy=copy, force_all_finite=force_all_finite, warn_on_dtype=warn_on_dtype, estimator=estimator) - Y = check_array(Y, accept_sparse='csr', dtype=dtype, + Y = check_array(Y, accept_sparse=accept_sparse, dtype=dtype, copy=copy, force_all_finite=force_all_finite, warn_on_dtype=warn_on_dtype, estimator=estimator) @@ -266,8 +279,9 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, # Pairwise distances in the presence of missing values def masked_euclidean_distances(X, Y=None, squared=False, - missing_values="NaN", copy=True, **kwargs): - """ + missing_values="NaN", copy=True): + """Calculates euclidean distances in the presence of missing values + Considering the rows of X (and Y=X) as vectors, compute the distance matrix between each pair of vectors. Similarly, if Y is not X, then compute the distance matrix between each @@ -275,25 +289,47 @@ def masked_euclidean_distances(X, Y=None, squared=False, This function computes pairwise euclidean distance for vectors in dense matrices X and Y with missing values in arbitrary - coordinates. The following formula is used for this: + coordinates. + + The following formula is used for this: - dist(X, Y) = (X.shape[1] * 1 / ((dot(NX, NYT)))) * + dist(X, Y) = (X.shape[1] / ((dot(NX, NYT)))) * (dot((X * X), NYT) - 2 * (dot(X, Y.T)) + dot(NX, (Y.T * Y.T))) where NX and NYT represent the logical-not of the missing masks of - X and Y.T, respectively.This formulation zero-weights feature coordinates - with missing value in either vector in the pair and up-weights the - remaining coordinates. - Formula derived by: Shreya Bhattarai + X and Y.T, respectively. + Formula in matrix form derived by: + Shreya Bhattarai + + This formulation zero-weights feature coordinates with missing value in + either vector in the pair and up-weights the remaining coordinates. + For instance, say we have two sample points (x1, y1) and (x2, NaN): + + To calculate the euclidean distance between these, first the square + "distance" is calculated based only on the first feature coordinate + as the second coordinate is missing in one of the samples, + i.e., we have (x2-x1)**2. This squared distance is scaled-up by the ratio + of total number of coordinates to the number of available coordinates, + which in this case is 2/1 = 2. Now, we are left with 2*((x2-x1)**2). + Finally, if squared=False then the square root of this is evaluated + and returned otherwise the value is returned as is. Breakdown of euclidean distance calculation between a vector pair x,y: weight = Total # of coordinates / # of non-missing coordinates dist(x,y) = sqrt(weight * sq. distance from non-missing coordinates) - This of course implies that if all coordinates are missing in either - vector in the pair then NaN is returned for that pair. + This formulation implies that if all coordinates are missing in either + vector in the pair or if there are no common non-missing coordinates then + NaN is returned for that pair. + + References + ---------- + John K. Dixon, "Pattern Recognition with Partly Missing Data", + IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue: + 10, pp. 617 - 621, Oct. 1979. + http://ieeexplore.ieee.org/abstract/document/4310090/ Read more in the :ref:`User Guide `. @@ -306,7 +342,7 @@ def masked_euclidean_distances(X, Y=None, squared=False, squared : boolean, optional Return squared Euclidean distances. - missing_values : "NaN" or integer, optional (default=”NaN”) + missing_values : "NaN" or integer, optional Representation of missing value copy : boolean, optional @@ -335,13 +371,9 @@ def masked_euclidean_distances(X, Y=None, squared=False, -------- paired_distances : distances betweens pairs of elements of X and Y. """ - # Check and except sparse matrices - if issparse(X) or (Y is not None and issparse(Y)): - raise ValueError( - "Missing value support for sparse matrices not added yet") - # NOTE: force_all_finite=False allows not only NaN but also inf/-inf - X, Y = check_pairwise_arrays(X, Y, + # NOTE: force_all_finite=False allows not only NaN but also +/- inf + X, Y = check_pairwise_arrays(X, Y, accept_sparse=False, force_all_finite=False, copy=copy) if (np.any(np.isinf(X.data)) or (Y is not None and np.any(np.isinf(Y.data)))): @@ -353,34 +385,6 @@ def masked_euclidean_distances(X, Y=None, squared=False, or (Y is not None and np.any(_get_mask(Y, missing_values).sum( axis=1) == Y.shape[1])): raise ValueError("One or more rows only contain missing values.") - # - # if kill_missing: - # if X_norm_squared is not None: - # XX = check_array(X_norm_squared) - # if XX.shape == (1, X.shape[0]): - # XX = XX.T - # elif XX.shape != (X.shape[0], 1): - # raise ValueError( - # "Incompatible dimensions for X and X_norm_squared") - # else: - # XX = row_norms(X, squared=True)[:, np.newaxis] - # - # if X is Y: # shortcut in the common case euclidean_distances(X, X) - # YY = XX.T - # elif Y_norm_squared is not None: - # YY = np.atleast_2d(Y_norm_squared) - # - # if YY.shape != (1, Y.shape[0]): - # raise ValueError( - # "Incompatible dimensions for Y and Y_norm_squared") - # else: - # YY = row_norms(Y, squared=True)[np.newaxis, :] - # - # distances = safe_sparse_dot(X, Y.T, dense_output=True) - # distances *= -2 - # distances += XX - # distances += YY - # np.maximum(distances, 0, out=distances) # else: if missing_values != "NaN" and \ @@ -405,16 +409,9 @@ def masked_euclidean_distances(X, Y=None, squared=False, # Calculate distances - # distances = (X.shape[1] * 1 / ((np.dot(NX, NYT)))) * \ - # (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + - # np.dot(NX, (YT * YT))) - - # Above is faster but following for Python 2.x support - distances = np.multiply(np.multiply(X.shape[1], - (1.0 / np.dot(NX, NYT))), - (np.dot(np.multiply(X, X), NYT) - - (2.0 * (np.dot(X, YT))) + - np.dot(NX, (np.multiply(YT, YT))))) + distances = (X.shape[1] / ((np.dot(NX, NYT)))) * \ + (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + + np.dot(NX, (YT * YT))) if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. @@ -1208,12 +1205,13 @@ def chi2_kernel(X, Y=None, gamma=1.): 'l1': manhattan_distances, 'manhattan': manhattan_distances, 'precomputed': None, # HACK: precomputed is always allowed, never called + 'masked_euclidean': masked_euclidean_distances, } # Helper functions with missing value support - distance -MASKED_PAIRWISE_DISTANCE_FUNCTIONS = { - 'euclidean': masked_euclidean_distances, -} +# MASKED_PAIRWISE_DISTANCE_FUNCTIONS = { +# 'euclidean': masked_euclidean_distances, +# } def distance_metrics(): @@ -1225,16 +1223,17 @@ def distance_metrics(): The valid distance metrics, and the function they map to, are: - ============ ==================================== - metric Function - ============ ==================================== - 'cityblock' metrics.pairwise.manhattan_distances - 'cosine' metrics.pairwise.cosine_distances - 'euclidean' metrics.pairwise.euclidean_distances - 'l1' metrics.pairwise.manhattan_distances - 'l2' metrics.pairwise.euclidean_distances - 'manhattan' metrics.pairwise.manhattan_distances - ============ ==================================== + ============ ==================================== + metric Function + ============ ==================================== + 'cityblock' metrics.pairwise.manhattan_distances + 'cosine' metrics.pairwise.cosine_distances + 'euclidean' metrics.pairwise.euclidean_distances + 'l1' metrics.pairwise.manhattan_distances + 'l2' metrics.pairwise.euclidean_distances + 'manhattan' metrics.pairwise.manhattan_distances + 'masked_euclidean' metrics.pairwise.masked_euclidean_distances + ============ ==================================== Read more in the :ref:`User Guide `. @@ -1301,9 +1300,10 @@ def _pairwise_callable(X, Y, metric, **kwds): 'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', - 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"] + 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski", + 'masked_euclidean'] -_MASKED_SUPPORTED_METRICS = ['euclidean'] +_MASKED_SUPPORTED_METRICS = ['masked_euclidean'] def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): @@ -1323,7 +1323,9 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): Valid values for metric are: - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', - 'manhattan']. These metrics support sparse matrix inputs. + 'manhattan']. These metrics support sparse matrix + inputs. + Also, ['masked_euclidean'] but it does not yet support sparse matrices. - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', @@ -1392,33 +1394,27 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): "callable" % (metric, _VALID_METRICS)) # To handle kill_missing = False - kill_missing = kwds.get("kill_missing") - if not kill_missing and kill_missing is not None: + # kill_missing = kwds.get("kill_missing") + # if not kill_missing and kill_missing is not None: + if metric in _MASKED_SUPPORTED_METRICS: missing_values = kwds.get("missing_values") if kwds.get( "missing_values") is not None else np.nan - if (metric not in _MASKED_SUPPORTED_METRICS): - raise ValueError( - "Metric {0} does not have missing value support ".format( - metric) - ) - if issparse(X) or (Y is not None and issparse(Y)): - raise ValueError( - "Missing value support for sparse matrices not added yet") - # if (kwds.get("missing_values") is None): - # raise ValueError("Missing value is not defined") + # if (metric not in _MASKED_SUPPORTED_METRICS): + # raise ValueError( + # "Metric {0} does not have missing value support ".format( + # metric) + # ) if(np.any(_get_mask(X, missing_values).sum(axis=1) == X.shape[1])): raise ValueError( "One or more samples(s) only have missing values.") - # if type(metric) is str: - # metric = "masked_" + metric if metric == "precomputed": X, _ = check_pairwise_arrays(X, Y, precomputed=True) return X - elif kill_missing is False and metric in \ - MASKED_PAIRWISE_DISTANCE_FUNCTIONS: - func = MASKED_PAIRWISE_DISTANCE_FUNCTIONS[metric] + # elif kill_missing is False and metric in \ + # MASKED_PAIRWISE_DISTANCE_FUNCTIONS: + # func = MASKED_PAIRWISE_DISTANCE_FUNCTIONS[metric] elif metric in PAIRWISE_DISTANCE_FUNCTIONS: func = PAIRWISE_DISTANCE_FUNCTIONS[metric] elif callable(metric): diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 8732bf5e6d70a..774bf66957935 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -59,13 +59,12 @@ def test_pairwise_distances(): assert_array_almost_equal(S, S2) # Euclidean dist. (masked) should be equivalent to calling the function. X = rng.random_sample((5, 4)) - S = pairwise_distances(X, metric="euclidean", kill_missing=False) + S = pairwise_distances(X, metric="masked_euclidean") S2 = masked_euclidean_distances(X) assert_array_almost_equal(S, S2) # Euclidean distance, with Y != X. Y = rng.random_sample((2, 4)) - S = pairwise_distances(X, Y, metric="euclidean", - kill_missing=False) + S = pairwise_distances(X, Y, metric="masked_euclidean") S2 = masked_euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Test with tuples as X and Y diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 0a1abc006697e..78f319716ec85 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -158,25 +158,29 @@ def _init_params(self, n_neighbors=None, radius=None, self._tree = None self._fit_method = None - def _fit(self, X, kill_missing=True): - if not kill_missing: - if self.metric not in _MASKED_SUPPORTED_METRICS: - raise ValueError( - "Metric {0} is currently not supported for " - "data containing missing values.".format(self.metric) - ) - - _MASKED_SUPPORTED_ALGORITHMS = ["brute"] - if self.algorithm not in _MASKED_SUPPORTED_ALGORITHMS: - if self.algorithm == "auto": - pass - else: - warnings.warn( - "{0} algorithm is currently not supported for " - "data containing missing values. " - "Reverting to a supported algorithm.". - format(self.algorithm)) - self.algorithm = _MASKED_SUPPORTED_ALGORITHMS[0] + def _fit(self, X): + if self.metric in _MASKED_SUPPORTED_METRICS: + kill_missing = False + # if not kill_missing: + # if self.metric not in _MASKED_SUPPORTED_METRICS: + # raise ValueError( + # "Metric {0} is currently not supported for " + # "data containing missing values.".format(self.metric) + # ) + # + # _MASKED_SUPPORTED_ALGORITHMS = ["brute"] + # if self.algorithm not in _MASKED_SUPPORTED_ALGORITHMS: + # if self.algorithm == "auto": + # pass + # else: + # warnings.warn( + # "{0} algorithm is currently not supported for " + # "data containing missing values. " + # "Reverting to a supported algorithm.". + # format(self.algorithm)) + # self.algorithm = _MASKED_SUPPORTED_ALGORITHMS[0] + else: + kill_missing = True if self.metric_params is None: self.effective_metric_params_ = {} @@ -360,8 +364,6 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - # copy=True if missing accepted as they will be replaced by 0 - # copy = True if kill_missing is False else False X = check_array(X, accept_sparse='csr') else: query_is_train = True @@ -497,8 +499,8 @@ class from an array representing our data set and ask who's >>> nan = float("nan") >>> samples = [[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]] >>> from sklearn.neighbors import NearestNeighbors - >>> neigh = NearestNeighbors(n_neighbors=2, metric="euclidean") - >>> neigh.fit(samples, kill_missing=False) # doctest: +ELLIPSIS + >>> neigh = NearestNeighbors(n_neighbors=2, metric="masked_euclidean") + >>> neigh.fit(samples) # doctest: +ELLIPSIS NearestNeighbors(algorithm='auto', leaf_size=30,...) >>> N = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) >>> print(N) # doctest: +ELLIPSIS @@ -543,10 +545,9 @@ class from an array representing our data set and ask who's n_jobs = _get_n_jobs(self.n_jobs) if self._fit_method == 'brute': # for efficiency, use squared euclidean distances - if self.effective_metric_ == 'euclidean': - dist = pairwise_distances(X, self._fit_X, 'euclidean', + if self.effective_metric_ == 'masked_euclidean': + dist = pairwise_distances(X, self._fit_X, 'masked_euclidean', n_jobs=n_jobs, squared=True, - kill_missing=False, missing_values=missing_values, copy=copy) else: @@ -565,7 +566,7 @@ class from an array representing our data set and ask who's sample_range, np.argsort(dist[sample_range, neigh_ind])] if return_distance: - if self.effective_metric_ == 'euclidean': + if self.effective_metric_ == 'masked_euclidean': result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind else: result = dist[sample_range, neigh_ind], neigh_ind @@ -991,7 +992,8 @@ def fit(self, X, y): class UnsupervisedMixin(object): - def fit(self, X, y=None, kill_missing=True): + # def fit(self, X, y=None, kill_missing=True): + def fit(self, X, y=None): """Fit the model using X as training data Parameters @@ -1000,4 +1002,5 @@ def fit(self, X, y=None, kill_missing=True): Training data. If array or matrix, shape [n_samples, n_features], or [n_samples, n_samples] if metric='precomputed'. """ - return self._fit(X, kill_missing) + # return self._fit(X, kill_missing) + return self._fit(X) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index f772e8b963e14..be457fdcf4b5a 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -102,8 +102,8 @@ def test_masked_unsupervised_kneighbors(): [3., 1., 3., 3.], [2., 3., 1., 9.]], dtype=np.float32) - neigh = neighbors.NearestNeighbors(2, metric="euclidean") - neigh.fit(X, kill_missing=False) + neigh = neighbors.NearestNeighbors(2, metric="masked_euclidean") + neigh.fit(X) X_neigh = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) XY_neigh = neigh.masked_kneighbors(Y, 2, return_distance=False) @@ -128,9 +128,10 @@ def test_masked_unsupervised_kneighbors(): # Test 2 nan = float("nan") samples = [[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]] - neigh = neighbors.NearestNeighbors(n_neighbors=2, metric="euclidean") + neigh = neighbors.NearestNeighbors(n_neighbors=2, + metric="masked_euclidean") - neigh.fit(samples, kill_missing=False) + neigh.fit(samples) X2_neigh = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) XY2_neigh = neigh.masked_kneighbors([[0, nan, 1]], 2, @@ -1079,6 +1080,9 @@ def test_valid_brute_metric_for_auto_algorithm(): nb_p.kneighbors(DYX) for metric in VALID_METRICS_SPARSE['brute']: + # TODO: Remove after adding sparse support for masked_euclidean + if metric == "masked_euclidean": + continue if metric != 'precomputed' and metric not in require_params: nn = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto', metric=metric).fit(Xcsr) From 2482c8ad3a7dd9acbf6c3a4921c4eb9c3807d830 Mon Sep 17 00:00:00 2001 From: harke Date: Sun, 23 Jul 2017 00:38:04 -0500 Subject: [PATCH 09/19] Changed nan-mask from int8 to int32 --- sklearn/metrics/pairwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 05fe4bd3dbb23..a34ddb56d7d9f 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -400,11 +400,11 @@ def masked_euclidean_distances(X, Y=None, squared=False, # Get Y.T mask and anti-mask and set Y.T's missing to zero YT = Y.T mask_YT = _get_mask(YT, missing_values) - NYT = (~mask_YT).astype(np.int8) + NYT = (~mask_YT).astype(np.int32) YT[mask_YT] = 0 # Get X anti-mask and set X's missing to zero - NX = (~mask_X).astype(np.int8) + NX = (~mask_X).astype(np.int32) X[mask_X] = 0 # Calculate distances From 66527cd81f4da7a1c02c7bf98a774d9733fbf76d Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 24 Jul 2017 00:52:24 -0500 Subject: [PATCH 10/19] Addressed review comments (#3) --- sklearn/metrics/pairwise.py | 42 ++--- sklearn/metrics/tests/test_pairwise.py | 53 +++--- sklearn/neighbors/base.py | 201 ++-------------------- sklearn/neighbors/tests/test_neighbors.py | 11 +- 4 files changed, 56 insertions(+), 251 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index a34ddb56d7d9f..ebbdb1ad7e8ae 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -324,13 +324,6 @@ def masked_euclidean_distances(X, Y=None, squared=False, vector in the pair or if there are no common non-missing coordinates then NaN is returned for that pair. - References - ---------- - John K. Dixon, "Pattern Recognition with Partly Missing Data", - IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue: - 10, pp. 617 - 621, Oct. 1979. - http://ieeexplore.ieee.org/abstract/document/4310090/ - Read more in the :ref:`User Guide `. Parameters @@ -367,6 +360,13 @@ def masked_euclidean_distances(X, Y=None, squared=False, array([[ 1. ], [ 1.41421356]]) + References + ---------- + * John K. Dixon, "Pattern Recognition with Partly Missing Data", + IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue: + 10, pp. 617 - 621, Oct. 1979. + http://ieeexplore.ieee.org/abstract/document/4310090/ + See also -------- paired_distances : distances betweens pairs of elements of X and Y. @@ -409,9 +409,9 @@ def masked_euclidean_distances(X, Y=None, squared=False, # Calculate distances - distances = (X.shape[1] / ((np.dot(NX, NYT)))) * \ - (np.dot((X * X), NYT) - 2 * (np.dot(X, YT)) + - np.dot(NX, (YT * YT))) + distances = (X.shape[1] / (np.dot(NX, NYT))) * \ + (np.dot(X * X, NYT) - 2 * (np.dot(X, YT)) + + np.dot(NX, YT * YT)) if X is Y: # Ensure that distances between vectors and themselves are set to 0.0. @@ -1208,11 +1208,6 @@ def chi2_kernel(X, Y=None, gamma=1.): 'masked_euclidean': masked_euclidean_distances, } -# Helper functions with missing value support - distance -# MASKED_PAIRWISE_DISTANCE_FUNCTIONS = { -# 'euclidean': masked_euclidean_distances, -# } - def distance_metrics(): """Valid metrics for pairwise_distances. @@ -1223,9 +1218,9 @@ def distance_metrics(): The valid distance metrics, and the function they map to, are: - ============ ==================================== + =================== ============================================ metric Function - ============ ==================================== + =================== ============================================ 'cityblock' metrics.pairwise.manhattan_distances 'cosine' metrics.pairwise.cosine_distances 'euclidean' metrics.pairwise.euclidean_distances @@ -1233,7 +1228,7 @@ def distance_metrics(): 'l2' metrics.pairwise.euclidean_distances 'manhattan' metrics.pairwise.manhattan_distances 'masked_euclidean' metrics.pairwise.masked_euclidean_distances - ============ ==================================== + =================== ============================================ Read more in the :ref:`User Guide `. @@ -1393,18 +1388,10 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): "Valid metrics are %s, or 'precomputed', or a " "callable" % (metric, _VALID_METRICS)) - # To handle kill_missing = False - # kill_missing = kwds.get("kill_missing") - # if not kill_missing and kill_missing is not None: if metric in _MASKED_SUPPORTED_METRICS: missing_values = kwds.get("missing_values") if kwds.get( "missing_values") is not None else np.nan - # if (metric not in _MASKED_SUPPORTED_METRICS): - # raise ValueError( - # "Metric {0} does not have missing value support ".format( - # metric) - # ) if(np.any(_get_mask(X, missing_values).sum(axis=1) == X.shape[1])): raise ValueError( "One or more samples(s) only have missing values.") @@ -1412,9 +1399,6 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): if metric == "precomputed": X, _ = check_pairwise_arrays(X, Y, precomputed=True) return X - # elif kill_missing is False and metric in \ - # MASKED_PAIRWISE_DISTANCE_FUNCTIONS: - # func = MASKED_PAIRWISE_DISTANCE_FUNCTIONS[metric] elif metric in PAIRWISE_DISTANCE_FUNCTIONS: func = PAIRWISE_DISTANCE_FUNCTIONS[metric] elif callable(metric): diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 774bf66957935..c92c41532fca5 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -58,15 +58,13 @@ def test_pairwise_distances(): S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Euclidean dist. (masked) should be equivalent to calling the function. - X = rng.random_sample((5, 4)) - S = pairwise_distances(X, metric="masked_euclidean") - S2 = masked_euclidean_distances(X) - assert_array_almost_equal(S, S2) - # Euclidean distance, with Y != X. - Y = rng.random_sample((2, 4)) - S = pairwise_distances(X, Y, metric="masked_euclidean") - S2 = masked_euclidean_distances(X, Y) - assert_array_almost_equal(S, S2) + X_masked = rng.random_sample((5, 4)) + Y_masked = rng.random_sample((2, 4)) + X_masked[0, 0] = np.nan + Y_masked[0, 0] = np.nan + S_masked = pairwise_distances(X_masked, Y_masked, metric="masked_euclidean") + S2_masked = masked_euclidean_distances(X_masked, Y_masked) + assert_array_almost_equal(S_masked, S2_masked) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) @@ -419,20 +417,6 @@ def test_euclidean_distances(): def test_masked_euclidean_distances(): - # first check that we get right answer with missing values for X - X = np.array([[1., 5., 7., 5., 10.], - [8., 2., 4., np.nan, 8.], - [5., np.nan, 5., np.nan, 1.], - [8., np.nan, np.nan, np.nan, np.nan]]) - D1 = masked_euclidean_distances(X, missing_values="NaN") - - D2 = np.array([[0., 9.42072184, 12.97433364, 15.65247584], - [9.42072184, 0., 9.91631652, 0.], - [12.97433364, 9.91631652, 0., 6.70820393], - [15.65247584, 0., 6.70820393, 0.]]) - - assert_array_almost_equal(D1, D2) - # check with pairs of matrices with missing values X = np.array([[1., np.nan, 3., 4., 2.], [np.nan, 4., 6., 1., np.nan], @@ -442,14 +426,33 @@ def test_masked_euclidean_distances(): [np.nan, np.nan, 5., 4., 7.], [np.nan, np.nan, np.nan, 4., 5.]]) - D3 = np.array([[6.32455532, 6.95221787, 4.74341649], + D1 = np.array([[6.32455532, 6.95221787, 4.74341649], [5., 5., 6.70820393], [2.23606798, 13.41640786, 8.94427191]]) - D4 = masked_euclidean_distances(X, Y, missing_values="NaN") + D2 = masked_euclidean_distances(X, Y, missing_values="NaN") + + assert_array_almost_equal(D1, D2) + + # check when squared = True + D3 = np.array( + [[40., 48.33333331, 22.5], + [25., 25., 45.], + [5., 180., 80.]]) + D4 = masked_euclidean_distances(X, Y, squared=True, missing_values="NaN") assert_array_almost_equal(D3, D4) + # Check with explicit formula and square=True + assert_array_almost_equal( + masked_euclidean_distances(X[:1], Y[:1], squared=True), + [[5.0/2.0 * ((7-3)**2 + (2-2)**2)]]) + + # Check when Y = X is explicitly passed + D5 = masked_euclidean_distances(X, missing_values="NaN") + D6 = masked_euclidean_distances(X, X, missing_values="NaN") + assert_array_almost_equal(D5, D6) + def test_cosine_distances(): # Check the pairwise Cosine distances computation diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 78f319716ec85..e173868f19040 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -161,24 +161,6 @@ def _init_params(self, n_neighbors=None, radius=None, def _fit(self, X): if self.metric in _MASKED_SUPPORTED_METRICS: kill_missing = False - # if not kill_missing: - # if self.metric not in _MASKED_SUPPORTED_METRICS: - # raise ValueError( - # "Metric {0} is currently not supported for " - # "data containing missing values.".format(self.metric) - # ) - # - # _MASKED_SUPPORTED_ALGORITHMS = ["brute"] - # if self.algorithm not in _MASKED_SUPPORTED_ALGORITHMS: - # if self.algorithm == "auto": - # pass - # else: - # warnings.warn( - # "{0} algorithm is currently not supported for " - # "data containing missing values. " - # "Reverting to a supported algorithm.". - # format(self.algorithm)) - # self.algorithm = _MASKED_SUPPORTED_ALGORITHMS[0] else: kill_missing = True @@ -225,8 +207,6 @@ def _fit(self, X): self._fit_method = 'kd_tree' return self - # # copy=True if missing accepted as they will be replaced by 0 - # copy = True if kill_missing is False else False X = check_array(X, accept_sparse='csr', force_all_finite=kill_missing) @@ -364,7 +344,11 @@ class from an array representing our data set and ask who's if X is not None: query_is_train = False - X = check_array(X, accept_sparse='csr') + if self.effective_metric_ in _MASKED_SUPPORTED_METRICS: + X = check_array(X, accept_sparse='csr', + force_all_finite=False) + else: + X = check_array(X, accept_sparse='csr') else: query_is_train = True X = self._fit_X @@ -388,6 +372,10 @@ class from an array representing our data set and ask who's if self.effective_metric_ == 'euclidean': dist = pairwise_distances(X, self._fit_X, 'euclidean', n_jobs=n_jobs, squared=True) + elif self.effective_metric_ == 'masked_euclidean': + dist = pairwise_distances(X, self._fit_X, + 'masked_euclidean', + n_jobs=n_jobs, squared=True) else: dist = pairwise_distances( X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, @@ -400,7 +388,8 @@ class from an array representing our data set and ask who's sample_range, np.argsort(dist[sample_range, neigh_ind])] if return_distance: - if self.effective_metric_ == 'euclidean': + if self.effective_metric_ == 'euclidean' or self.\ + effective_metric_ == 'masked_euclidean': result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind else: result = dist[sample_range, neigh_ind], neigh_ind @@ -454,174 +443,6 @@ class from an array representing our data set and ask who's return dist, neigh_ind return neigh_ind - def masked_kneighbors(self, X=None, n_neighbors=None, return_distance=True, - missing_values="NaN", copy=True): - """Finds the K-neighbors of a point, even when they contain NaN values. - - Returns indices of and distances to the neighbors of each point. - - Parameters - ---------- - X : array-like, shape (n_query, n_features), \ - or (n_query, n_indexed) if metric == 'precomputed' - The query point or points. - If not provided, neighbors of each indexed point are returned. - In this case, the query point is not considered its own neighbor. - - n_neighbors : int - Number of neighbors to get (default is the value - passed to the constructor). - - return_distance : boolean, optional. Defaults to True. - If False, distances will not be returned - - missing_values : "NaN" or integer, optional. Default is "NaN". - Representation of missing value - - copy : boolean, optional. Default is True. - Create and use a deep copy of X - - Returns - ------- - dist : array - Array representing the lengths to points, only present if - return_distance=True - - ind : array - Indices of the nearest points in the population matrix. - - Examples - -------- - In the following example, we construct a NeighborsClassifier - class from an array representing our data set and ask who's - the closest point to [0, nan, 1], where "nan" represents a - missing value. - >>> nan = float("nan") - >>> samples = [[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]] - >>> from sklearn.neighbors import NearestNeighbors - >>> neigh = NearestNeighbors(n_neighbors=2, metric="masked_euclidean") - >>> neigh.fit(samples) # doctest: +ELLIPSIS - NearestNeighbors(algorithm='auto', leaf_size=30,...) - >>> N = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) - >>> print(N) # doctest: +ELLIPSIS - [[3 1] - [3 2] - [3 1] - [2 1]] - - >>> X = [[0, nan, 1]] - >>> N2 = neigh.masked_kneighbors(X, 2, return_distance=False) - >>> print(N2) # doctest: +ELLIPSIS - [[1 3]] - - """ - if self._fit_method is None: - raise NotFittedError("Must fit neighbors before querying.") - - if n_neighbors is None: - n_neighbors = self.n_neighbors - - if X is not None: - query_is_train = False - X = check_array(X, accept_sparse='csr', - force_all_finite=False, copy=copy) - else: - query_is_train = True - X = self._fit_X - # Include an extra neighbor to account for the sample itself being - # returned, which is removed later - n_neighbors += 1 - - train_size = self._fit_X.shape[0] - if n_neighbors > train_size: - raise ValueError( - "Expected n_neighbors <= n_samples, " - " but n_samples = %d, n_neighbors = %d" % - (train_size, n_neighbors) - ) - n_samples, _ = X.shape - sample_range = np.arange(n_samples)[:, None] - - n_jobs = _get_n_jobs(self.n_jobs) - if self._fit_method == 'brute': - # for efficiency, use squared euclidean distances - if self.effective_metric_ == 'masked_euclidean': - dist = pairwise_distances(X, self._fit_X, 'masked_euclidean', - n_jobs=n_jobs, squared=True, - missing_values=missing_values, - copy=copy) - else: - # dist = pairwise_distances( - # X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, - # **self.effective_metric_params_) - raise ValueError( - "Only the following metrics are currently supported for " - "data with missing values:{0}". - format(_MASKED_SUPPORTED_METRICS) - ) - neigh_ind = np.argpartition(dist, n_neighbors - 1, axis=1) - neigh_ind = neigh_ind[:, :n_neighbors] - # argpartition doesn't guarantee sorted order, so we sort again - neigh_ind = neigh_ind[ - sample_range, np.argsort(dist[sample_range, neigh_ind])] - - if return_distance: - if self.effective_metric_ == 'masked_euclidean': - result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind - else: - result = dist[sample_range, neigh_ind], neigh_ind - else: - result = neigh_ind - - # elif self._fit_method in ['ball_tree', 'kd_tree']: - # if issparse(X): - # raise ValueError( - # "%s does not work with sparse matrices." - # "Densify the data, " - # "or set algorithm='brute'" % self._fit_method) - # result = Parallel(n_jobs, backend='threading')( - # delayed(self._tree.query, check_pickle=False)( - # X[s], n_neighbors, return_distance) - # for s in gen_even_slices(X.shape[0], n_jobs) - # ) - # if return_distance: - # dist, neigh_ind = tuple(zip(*result)) - # result = np.vstack(dist), np.vstack(neigh_ind) - # else: - # result = np.vstack(result) - else: - raise ValueError("internal: _fit_method not recognized for data " - "containing missing") - - if not query_is_train: - return result - else: - # If the query data is the same as the indexed data, we would like - # to ignore the first nearest neighbor of every sample, i.e - # the sample itself. - if return_distance: - dist, neigh_ind = result - else: - neigh_ind = result - - sample_mask = neigh_ind != sample_range - - # Corner case: When the number of duplicates are more - # than the number of neighbors, the first NN will not - # be the sample, but a duplicate. - # In that case mask the first duplicate. - dup_gr_nbrs = np.all(sample_mask, axis=1) - sample_mask[:, 0][dup_gr_nbrs] = False - - neigh_ind = np.reshape( - neigh_ind[sample_mask], (n_samples, n_neighbors - 1)) - - if return_distance: - dist = np.reshape( - dist[sample_mask], (n_samples, n_neighbors - 1)) - return dist, neigh_ind - return neigh_ind - def kneighbors_graph(self, X=None, n_neighbors=None, mode='connectivity'): """Computes the (weighted) graph of k-Neighbors for points in X diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index be457fdcf4b5a..b9390883f9903 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -104,9 +104,8 @@ def test_masked_unsupervised_kneighbors(): neigh = neighbors.NearestNeighbors(2, metric="masked_euclidean") neigh.fit(X) - X_neigh = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) - XY_neigh = neigh.masked_kneighbors(Y, 2, return_distance=False) - + X_neigh = neigh.kneighbors(n_neighbors=2, return_distance=False) + XY_neigh = neigh.kneighbors(Y, 2, return_distance=False) # Expected outcome N1 = np.array( [[1, 4], @@ -130,12 +129,10 @@ def test_masked_unsupervised_kneighbors(): samples = [[0, 5, 5], [1, 0, nan], [4, 1, 1], [nan, 2, 3]] neigh = neighbors.NearestNeighbors(n_neighbors=2, metric="masked_euclidean") - neigh.fit(samples) - X2_neigh = neigh.masked_kneighbors(n_neighbors=2, return_distance=False) - XY2_neigh = neigh.masked_kneighbors([[0, nan, 1]], 2, - return_distance=False) + X2_neigh = neigh.kneighbors(n_neighbors=2, return_distance=False) + XY2_neigh = neigh.kneighbors([[0, nan, 1]], 2, return_distance=False) # Expected outcome N3 = np.array( From a968b1e27af4b71990d8a7e117f4f56b67dd7833 Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 24 Jul 2017 00:57:34 -0500 Subject: [PATCH 11/19] Pep8 fix --- sklearn/metrics/tests/test_pairwise.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index c92c41532fca5..3247862726a01 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -62,7 +62,8 @@ def test_pairwise_distances(): Y_masked = rng.random_sample((2, 4)) X_masked[0, 0] = np.nan Y_masked[0, 0] = np.nan - S_masked = pairwise_distances(X_masked, Y_masked, metric="masked_euclidean") + S_masked = pairwise_distances(X_masked, Y_masked, + metric="masked_euclidean") S2_masked = masked_euclidean_distances(X_masked, Y_masked) assert_array_almost_equal(S_masked, S2_masked) # Test with tuples as X and Y From 356c8e86278f03d02bbe3288520a25c55f6c9762 Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 24 Jul 2017 02:35:10 -0500 Subject: [PATCH 12/19] Comment edit on test_pairwise --- sklearn/metrics/tests/test_pairwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 3247862726a01..ee0020ee4e1d0 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -57,7 +57,7 @@ def test_pairwise_distances(): S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) - # Euclidean dist. (masked) should be equivalent to calling the function. + # Check to ensure NaNs work with pairwise_distances. X_masked = rng.random_sample((5, 4)) Y_masked = rng.random_sample((2, 4)) X_masked[0, 0] = np.nan From d6aeaf31f8524dde4bd82a18c4e42ea79d96f1cf Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 25 Jul 2017 01:15:46 -0500 Subject: [PATCH 13/19] Addressed review comments #4 --- sklearn/metrics/pairwise.py | 40 ++++++++++++------------ sklearn/metrics/tests/test_pairwise.py | 11 +++---- sklearn/neighbors/base.py | 43 +++++++++++--------------- 3 files changed, 43 insertions(+), 51 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index ebbdb1ad7e8ae..58dd5cde301e0 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -197,32 +197,44 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, """ Considering the rows of X (and Y=X) as vectors, compute the distance matrix between each pair of vectors. + For efficiency reasons, the euclidean distance between a pair of row vector x and y is computed as:: + dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) + This formulation has two advantages over other ways of computing distances. First, it is computationally efficient when dealing with sparse data. Second, if one argument varies but the other remains unchanged, then `dot(x, x)` and/or `dot(y, y)` can be pre-computed. + However, this is not the most precise way of doing this computation, and the distance matrix returned by this function may not be exactly symmetric as required by, e.g., ``scipy.spatial.distance`` functions. + Read more in the :ref:`User Guide `. + Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples_1, n_features) + Y : {array-like, sparse matrix}, shape (n_samples_2, n_features) + Y_norm_squared : array-like, shape (n_samples_2, ), optional Pre-computed dot-products of vectors in Y (e.g., ``(Y**2).sum(axis=1)``) + squared : boolean, optional Return squared Euclidean distances. + X_norm_squared : array-like, shape = [n_samples_1], optional Pre-computed dot-products of vectors in X (e.g., ``(X**2).sum(axis=1)``) + Returns ------- distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2) + Examples -------- >>> from sklearn.metrics.pairwise import euclidean_distances @@ -277,7 +289,6 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False, return distances if squared else np.sqrt(distances, out=distances) -# Pairwise distances in the presence of missing values def masked_euclidean_distances(X, Y=None, squared=False, missing_values="NaN", copy=True): """Calculates euclidean distances in the presence of missing values @@ -302,27 +313,18 @@ def masked_euclidean_distances(X, Y=None, squared=False, Formula in matrix form derived by: Shreya Bhattarai - This formulation zero-weights feature coordinates with missing value in - either vector in the pair and up-weights the remaining coordinates. - For instance, say we have two sample points (x1, y1) and (x2, NaN): + When calculating the distance between a pair of samples, this formulation + essentially zero-weights feature coordinates with a missing value in either + sample and scales up the weight of the remaining coordinates: - To calculate the euclidean distance between these, first the square - "distance" is calculated based only on the first feature coordinate - as the second coordinate is missing in one of the samples, - i.e., we have (x2-x1)**2. This squared distance is scaled-up by the ratio - of total number of coordinates to the number of available coordinates, - which in this case is 2/1 = 2. Now, we are left with 2*((x2-x1)**2). - Finally, if squared=False then the square root of this is evaluated - and returned otherwise the value is returned as is. - - Breakdown of euclidean distance calculation between a vector pair x,y: - - weight = Total # of coordinates / # of non-missing coordinates dist(x,y) = sqrt(weight * sq. distance from non-missing coordinates) + where, + weight = Total # of coordinates / # of non-missing coordinates - This formulation implies that if all coordinates are missing in either - vector in the pair or if there are no common non-missing coordinates then - NaN is returned for that pair. + For instance, the distance between sample points (x1, y1) and (x2, NaN) + would result in sqrt(2*((x2-x1)**2). Note that if all the coordinates are + missing or if there are no common non-missing coordinates then NaN is + returned for that pair. Read more in the :ref:`User Guide `. diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index ee0020ee4e1d0..812d35eace9cd 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -427,13 +427,10 @@ def test_masked_euclidean_distances(): [np.nan, np.nan, 5., 4., 7.], [np.nan, np.nan, np.nan, 4., 5.]]) - D1 = np.array([[6.32455532, 6.95221787, 4.74341649], - [5., 5., 6.70820393], - [2.23606798, 13.41640786, 8.94427191]]) + D1 = masked_euclidean_distances(X, Y, missing_values="NaN") + D2 = masked_euclidean_distances(X, Y, squared=True, missing_values="NaN") - D2 = masked_euclidean_distances(X, Y, missing_values="NaN") - - assert_array_almost_equal(D1, D2) + assert_array_almost_equal(D1**2, D2) # check when squared = True D3 = np.array( @@ -444,7 +441,7 @@ def test_masked_euclidean_distances(): assert_array_almost_equal(D3, D4) - # Check with explicit formula and square=True + # Check with explicit formula and squared=True assert_array_almost_equal( masked_euclidean_distances(X[:1], Y[:1], squared=True), [[5.0/2.0 * ((7-3)**2 + (2-2)**2)]]) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index e173868f19040..a6ffbff630c2a 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -159,10 +159,8 @@ def _init_params(self, n_neighbors=None, radius=None, self._fit_method = None def _fit(self, X): - if self.metric in _MASKED_SUPPORTED_METRICS: - kill_missing = False - else: - kill_missing = True + allow_nans = True if self.\ + metric in _MASKED_SUPPORTED_METRICS else False if self.metric_params is None: self.effective_metric_params_ = {} @@ -208,29 +206,28 @@ def _fit(self, X): return self X = check_array(X, accept_sparse='csr', - force_all_finite=kill_missing) + force_all_finite=not allow_nans) n_samples = X.shape[0] if n_samples == 0: raise ValueError("n_samples must be greater than 0") if issparse(X): - if not kill_missing: + if allow_nans: raise ValueError( "Nearest neighbor algorithm does not currently support" - "the use of sparse matrices." + "the use of sparse matrices for missing values." ) - else: - if self.algorithm not in ('auto', 'brute'): - warnings.warn("cannot use tree with sparse input: " - "using brute force") - if self.effective_metric_ not in VALID_METRICS_SPARSE['brute']: - raise ValueError("metric '%s' not valid for sparse input" - % self.effective_metric_) - self._fit_X = X.copy() - self._tree = None - self._fit_method = 'brute' - return self + if self.algorithm not in ('auto', 'brute'): + warnings.warn("cannot use tree with sparse input: " + "using brute force") + if self.effective_metric_ not in VALID_METRICS_SPARSE['brute']: + raise ValueError("metric '%s' not valid for sparse input" + % self.effective_metric_) + self._fit_X = X.copy() + self._tree = None + self._fit_method = 'brute' + return self self._fit_method = self.algorithm self._fit_X = X @@ -369,12 +366,10 @@ class from an array representing our data set and ask who's n_jobs = _get_n_jobs(self.n_jobs) if self._fit_method == 'brute': # for efficiency, use squared euclidean distances - if self.effective_metric_ == 'euclidean': - dist = pairwise_distances(X, self._fit_X, 'euclidean', - n_jobs=n_jobs, squared=True) - elif self.effective_metric_ == 'masked_euclidean': + if self.effective_metric_ == 'euclidean' or self.\ + effective_metric_ == 'masked_euclidean': dist = pairwise_distances(X, self._fit_X, - 'masked_euclidean', + self.effective_metric_, n_jobs=n_jobs, squared=True) else: dist = pairwise_distances( @@ -813,7 +808,6 @@ def fit(self, X, y): class UnsupervisedMixin(object): - # def fit(self, X, y=None, kill_missing=True): def fit(self, X, y=None): """Fit the model using X as training data @@ -823,5 +817,4 @@ def fit(self, X, y=None): Training data. If array or matrix, shape [n_samples, n_features], or [n_samples, n_samples] if metric='precomputed'. """ - # return self._fit(X, kill_missing) return self._fit(X) From e8ccdee096add356c55e6d20986a2f625bfbc57f Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 25 Jul 2017 02:37:42 -0500 Subject: [PATCH 14/19] replaced or with in --- sklearn/neighbors/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index a6ffbff630c2a..00f3537f81354 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -366,8 +366,7 @@ class from an array representing our data set and ask who's n_jobs = _get_n_jobs(self.n_jobs) if self._fit_method == 'brute': # for efficiency, use squared euclidean distances - if self.effective_metric_ == 'euclidean' or self.\ - effective_metric_ == 'masked_euclidean': + if self.effective_metric_ in ['euclidean', 'masked_euclidean']: dist = pairwise_distances(X, self._fit_X, self.effective_metric_, n_jobs=n_jobs, squared=True) From 4a8309b6605fdf19bc72303785f15e6d04699146 Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 25 Jul 2017 02:46:05 -0500 Subject: [PATCH 15/19] Changed allow_nans assignment --- sklearn/neighbors/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 00f3537f81354..3e77b51aad740 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -159,8 +159,7 @@ def _init_params(self, n_neighbors=None, radius=None, self._fit_method = None def _fit(self, X): - allow_nans = True if self.\ - metric in _MASKED_SUPPORTED_METRICS else False + allow_nans = self.metric in _MASKED_SUPPORTED_METRICS if self.metric_params is None: self.effective_metric_params_ = {} From 5cbc156a4663621f9555f150a0f1d1c50bf2b988 Mon Sep 17 00:00:00 2001 From: harke Date: Tue, 25 Jul 2017 02:50:11 -0500 Subject: [PATCH 16/19] One more or to in --- sklearn/neighbors/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/neighbors/base.py b/sklearn/neighbors/base.py index 3e77b51aad740..66dbc87fe39f9 100644 --- a/sklearn/neighbors/base.py +++ b/sklearn/neighbors/base.py @@ -381,8 +381,7 @@ class from an array representing our data set and ask who's sample_range, np.argsort(dist[sample_range, neigh_ind])] if return_distance: - if self.effective_metric_ == 'euclidean' or self.\ - effective_metric_ == 'masked_euclidean': + if self.effective_metric_ in ['euclidean', 'masked_euclidean']: result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind else: result = dist[sample_range, neigh_ind], neigh_ind From a31c43a595c9a594f072930cc0b0b672983e689e Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 31 Jul 2017 00:54:13 -0500 Subject: [PATCH 17/19] Addressed review comments #5 --- sklearn/metrics/pairwise.py | 61 +++++++++----------------- sklearn/metrics/tests/test_pairwise.py | 31 +++++++++++++ 2 files changed, 51 insertions(+), 41 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 58dd5cde301e0..c00b6847e379d 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -102,11 +102,6 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None, to be any format. False means that a sparse matrix input will raise an error. - .. deprecated:: 0.19 - Passing 'None' to parameter ``accept_sparse`` in methods is - deprecated in version 0.19 "and will be removed in 0.21. Use - ``accept_sparse=False`` instead. - force_all_finite : bool Whether to raise an error on np.inf and np.nan in X (or Y if it exists) @@ -293,25 +288,9 @@ def masked_euclidean_distances(X, Y=None, squared=False, missing_values="NaN", copy=True): """Calculates euclidean distances in the presence of missing values - Considering the rows of X (and Y=X) as vectors, compute the - distance matrix between each pair of vectors. Similarly, if - Y is not X, then compute the distance matrix between each - pair of vectors (i.e., each row pair) in X and Y. - - This function computes pairwise euclidean distance for vectors - in dense matrices X and Y with missing values in arbitrary - coordinates. - - The following formula is used for this: - - dist(X, Y) = (X.shape[1] / ((dot(NX, NYT)))) * - (dot((X * X), NYT) - 2 * (dot(X, Y.T)) + - dot(NX, (Y.T * Y.T))) - - where NX and NYT represent the logical-not of the missing masks of - X and Y.T, respectively. - Formula in matrix form derived by: - Shreya Bhattarai + Considering the rows of X (and Y=X) as samples, compute the distance matrix + between each pair of samples. Similarly, if Y is not X, then compute the + distance matrix between each sample pair (i.e., each row pair) in X and Y. When calculating the distance between a pair of samples, this formulation essentially zero-weights feature coordinates with a missing value in either @@ -321,10 +300,8 @@ def masked_euclidean_distances(X, Y=None, squared=False, where, weight = Total # of coordinates / # of non-missing coordinates - For instance, the distance between sample points (x1, y1) and (x2, NaN) - would result in sqrt(2*((x2-x1)**2). Note that if all the coordinates are - missing or if there are no common non-missing coordinates then NaN is - returned for that pair. + Note that if all the coordinates are missing or if there are no common + non-missing coordinates then NaN is returned for that pair. Read more in the :ref:`User Guide `. @@ -377,31 +354,31 @@ def masked_euclidean_distances(X, Y=None, squared=False, # NOTE: force_all_finite=False allows not only NaN but also +/- inf X, Y = check_pairwise_arrays(X, Y, accept_sparse=False, force_all_finite=False, copy=copy) - if (np.any(np.isinf(X.data)) or - (Y is not None and np.any(np.isinf(Y.data)))): + if (np.any(np.isinf(X)) or + (Y is not X and np.any(np.isinf(Y)))): raise ValueError( "+/- Infinite values are not allowed.") + # Get missing mask for X and Y.T + mask_X = _get_mask(X, missing_values) + + YT = Y.T + mask_YT = _get_mask(YT, missing_values) + # Check if any rows have only missing value - if np.any(_get_mask(X, missing_values).sum(axis=1) == X.shape[1])\ - or (Y is not None and np.any(_get_mask(Y, missing_values).sum( - axis=1) == Y.shape[1])): + if np.any(mask_X.sum(axis=1) == X.shape[1])\ + or (Y is not X and np.any(mask_YT.sum(axis=0) == Y.shape[1])): raise ValueError("One or more rows only contain missing values.") # else: if missing_values != "NaN" and \ - (np.any(_get_mask(X.data, "NaN")) or - np.any(_get_mask(Y.data, "NaN"))): + (np.any(np.isnan(X)) or + (Y is not X and np.any(np.isnan(Y)))): raise ValueError( "NaN values present but missing_value = {0}".format( missing_values)) - # Get missing mask for X - mask_X = _get_mask(X, missing_values) - - # Get Y.T mask and anti-mask and set Y.T's missing to zero - YT = Y.T - mask_YT = _get_mask(YT, missing_values) + # Get anti-mask and set Y.T's missing to zero NYT = (~mask_YT).astype(np.int32) YT[mask_YT] = 0 @@ -410,6 +387,8 @@ def masked_euclidean_distances(X, Y=None, squared=False, X[mask_X] = 0 # Calculate distances + # The following formula was derived in matrix form by: + # Shreya Bhattarai distances = (X.shape[1] / (np.dot(NX, NYT))) * \ (np.dot(X * X, NYT) - 2 * (np.dot(X, YT)) + diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 812d35eace9cd..6312aa7a2590b 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -451,6 +451,37 @@ def test_masked_euclidean_distances(): D6 = masked_euclidean_distances(X, X, missing_values="NaN") assert_array_almost_equal(D5, D6) + # Check with missing_value = 1 while NaN is present + assert_raises(ValueError, masked_euclidean_distances, X, Y, + missing_values=1) + # Check with inf present + X_inf = np.array([ + [np.inf, np.nan, 3., 4., 2.], + [np.nan, 4., 6., 1., np.nan], + [3., np.nan, np.nan, np.nan, 1.]]) + + assert_raises(ValueError, masked_euclidean_distances, X_inf, Y) + + # Check with a row containing all NaNs + X_nan_row = np.array([ + [1., np.nan, 3., 4., 2.], + [np.nan, 4., 6., 1., np.nan], + [np.nan, np.nan, np.nan, np.nan, np.nan]]) + + Y_nan_row = np.array([ + [np.nan, 7., 7., np.nan, 2.], + [np.nan, np.nan, 5., 4., 7.], + [np.nan, np.nan, np.nan, np.nan, np.nan]]) + + assert_raises(ValueError, masked_euclidean_distances, X_nan_row, Y) + assert_raises(ValueError, masked_euclidean_distances, X, Y_nan_row) + + # Check copy = True against copy = False + # Note: This test will alter X and Y + D7 = masked_euclidean_distances(X, Y, copy=True) + D8 = masked_euclidean_distances(X, Y, copy=False) + assert_array_almost_equal(D7, D8) + def test_cosine_distances(): # Check the pairwise Cosine distances computation From eacb19d5740ea435b22182f14a67f6b91383808a Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 31 Jul 2017 01:30:10 -0500 Subject: [PATCH 18/19] Edited comments --- sklearn/metrics/tests/test_pairwise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 6312aa7a2590b..30668d01bd418 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -418,7 +418,7 @@ def test_euclidean_distances(): def test_masked_euclidean_distances(): - # check with pairs of matrices with missing values + # Check with pairs of matrices with missing values X = np.array([[1., np.nan, 3., 4., 2.], [np.nan, 4., 6., 1., np.nan], [3., np.nan, np.nan, np.nan, 1.]]) @@ -432,7 +432,7 @@ def test_masked_euclidean_distances(): assert_array_almost_equal(D1**2, D2) - # check when squared = True + # Check when squared = True D3 = np.array( [[40., 48.33333331, 22.5], [25., 25., 45.], From 351e3b94c020fb4301c7c8ec2884ddc394acb9ff Mon Sep 17 00:00:00 2001 From: harke Date: Mon, 4 Sep 2017 01:00:24 -0500 Subject: [PATCH 19/19] Addressed review comments - 6 --- doc/modules/classes.rst | 1 + sklearn/metrics/pairwise.py | 5 ++--- sklearn/neighbors/tests/test_neighbors.py | 5 +++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 5399e27ef4d08..d0ab5f9ee3e8b 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1193,6 +1193,7 @@ See the :ref:`metrics` section of the user guide for further details. preprocessing.FunctionTransformer preprocessing.Imputer preprocessing.KernelCenterer + preprocessing.KNNImputer preprocessing.LabelBinarizer preprocessing.LabelEncoder preprocessing.MultiLabelBinarizer diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index c00b6847e379d..a8e31401b7d8d 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -1299,8 +1299,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): Valid values for metric are: - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', - 'manhattan']. These metrics support sparse matrix - inputs. + 'manhattan']. These metrics support sparse matrix inputs. Also, ['masked_euclidean'] but it does not yet support sparse matrices. - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', @@ -1381,7 +1380,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): X, _ = check_pairwise_arrays(X, Y, precomputed=True) return X elif metric in PAIRWISE_DISTANCE_FUNCTIONS: - func = PAIRWISE_DISTANCE_FUNCTIONS[metric] + func = PAIRWISE_DISTANCE_FUNCTIONS[metric] elif callable(metric): func = partial(_pairwise_callable, metric=metric, **kwds) else: diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index b9390883f9903..51190f7814ec4 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -145,6 +145,11 @@ def test_masked_unsupervised_kneighbors(): assert_array_equal(X2_neigh, N3) assert_array_equal(XY2_neigh, N4) + # Test 3: Sparse matrix with NaN + neigh_sparse = neighbors.NearestNeighbors(n_neighbors=2, + metric="masked_euclidean") + assert_raises(ValueError, neigh_sparse.fit, csr_matrix(X)) + def test_unsupervised_inputs(): # test the types of valid input into NearestNeighbors