Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

[MRG] Modified sklearn.metrics to enable euclidean distance calculation with NaN #9348

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/modules/classes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1193,6 +1193,7 @@ See the :ref:`metrics` section of the user guide for further details.
preprocessing.FunctionTransformer
preprocessing.Imputer
preprocessing.KernelCenterer
preprocessing.KNNImputer
preprocessing.LabelBinarizer
preprocessing.LabelEncoder
preprocessing.MultiLabelBinarizer
Expand Down
194 changes: 177 additions & 17 deletions sklearn/metrics/pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# Lars Buitinck
# Joel Nothman <[email protected]>
# License: BSD 3 clause

from __future__ import division
import itertools
from functools import partial

Expand All @@ -29,6 +29,15 @@
from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan


# Get mask for missing values
def _get_mask(X, value_to_mask):
"""Compute the boolean mask X == missing_values."""
if value_to_mask == "NaN" or np.isnan(value_to_mask):
return np.isnan(X)
else:
return X == value_to_mask


# Utility Functions
def _return_float_dtype(X, Y):
"""
Expand All @@ -54,7 +63,9 @@ def _return_float_dtype(X, Y):
return X, Y, dtype


def check_pairwise_arrays(X, Y, precomputed=False, dtype=None):
def check_pairwise_arrays(X, Y, precomputed=False, dtype=None,
accept_sparse='csr', force_all_finite=True,
copy=False):
""" Set X and Y appropriately and checks inputs

If Y is None, it is set as a pointer to X (i.e. not a copy).
Expand Down Expand Up @@ -84,6 +95,20 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None):

.. versionadded:: 0.18

accept_sparse : string, boolean or list/tuple of strings
String[s] representing allowed sparse matrix formats, such as 'csc',
'csr', etc. If the input is sparse but not in the allowed format,
it will be converted to the first listed format. True allows the input
to be any format. False means that a sparse matrix input will
raise an error.

force_all_finite : bool
Whether to raise an error on np.inf and np.nan in X (or Y if it exists)

copy : bool
Whether a forced copy will be triggered. If copy=False, a copy might
be triggered by a conversion.

Returns
-------
safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features)
Expand All @@ -102,12 +127,15 @@ def check_pairwise_arrays(X, Y, precomputed=False, dtype=None):
dtype = dtype_float

if Y is X or Y is None:
X = Y = check_array(X, accept_sparse='csr', dtype=dtype,
X = Y = check_array(X, accept_sparse=accept_sparse, dtype=dtype,
copy=copy, force_all_finite=force_all_finite,
warn_on_dtype=warn_on_dtype, estimator=estimator)
else:
X = check_array(X, accept_sparse='csr', dtype=dtype,
X = check_array(X, accept_sparse=accept_sparse, dtype=dtype,
copy=copy, force_all_finite=force_all_finite,
warn_on_dtype=warn_on_dtype, estimator=estimator)
Y = check_array(Y, accept_sparse='csr', dtype=dtype,
Y = check_array(Y, accept_sparse=accept_sparse, dtype=dtype,
copy=copy, force_all_finite=force_all_finite,
warn_on_dtype=warn_on_dtype, estimator=estimator)

if precomputed:
Expand Down Expand Up @@ -217,7 +245,7 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,

See also
--------
paired_distances : distances betweens pairs of elements of X and Y.
paired_distances : distances between pairs of elements of X and Y.
"""
X, Y = check_pairwise_arrays(X, Y)

Expand Down Expand Up @@ -256,6 +284,124 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
return distances if squared else np.sqrt(distances, out=distances)


def masked_euclidean_distances(X, Y=None, squared=False,
                               missing_values="NaN", copy=True):
    """Calculates euclidean distances in the presence of missing values

    Considering the rows of X (and Y=X) as samples, compute the distance matrix
    between each pair of samples. Similarly, if Y is not X, then compute the
    distance matrix between each sample pair (i.e., each row pair) in X and Y.

    When calculating the distance between a pair of samples, this formulation
    essentially zero-weights feature coordinates with a missing value in either
    sample and scales up the weight of the remaining coordinates:

        dist(x,y) = sqrt(weight * sq. distance from non-missing coordinates)
        where,
        weight = Total # of coordinates / # of non-missing coordinates

    Note that if a pair of samples has no common non-missing coordinates then
    NaN is returned for that pair. A sample whose coordinates are all missing
    is rejected with a ValueError.

    Read more in the :ref:`User Guide <metrics>`.

    Parameters
    ----------
    X : array-like, shape (n_samples_1, n_features)

    Y : array-like, shape (n_samples_2, n_features), optional
        If None, distances are computed between the rows of X.

    squared : boolean, optional
        Return squared Euclidean distances.

    missing_values : "NaN", NaN or integer, optional
        Representation of missing value

    copy : boolean, optional
        Make and use a deep copy of X and Y (if Y exists). With copy=False
        the missing entries of the validated inputs may be overwritten with
        zeros in place.

    Returns
    -------
    distances : array, shape (n_samples_1, n_samples_2)

    Raises
    ------
    ValueError
        If X or Y contains +/- infinity, if any row of X or Y consists only
        of missing values, or if NaN is present while ``missing_values`` is
        not NaN.

    Examples
    --------
    >>> from sklearn.metrics.pairwise import masked_euclidean_distances
    >>> nan = float("NaN")
    >>> X = [[0, 1], [1, nan]]
    >>> # distance between rows of X
    >>> masked_euclidean_distances(X, X)
    array([[ 0.        ,  1.41421356],
           [ 1.41421356,  0.        ]])

    >>> # get distance to origin
    >>> masked_euclidean_distances(X, [[0, 0]])
    array([[ 1.        ],
           [ 1.41421356]])

    References
    ----------
    * John K. Dixon, "Pattern Recognition with Partly Missing Data",
      IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue:
      10, pp. 617 - 621, Oct. 1979.
      http://ieeexplore.ieee.org/abstract/document/4310090/

    See also
    --------
    paired_distances : distances between pairs of elements of X and Y.
    """

    # NOTE: force_all_finite=False allows not only NaN but also +/- inf
    X, Y = check_pairwise_arrays(X, Y, accept_sparse=False,
                                 force_all_finite=False, copy=copy)
    if (np.any(np.isinf(X)) or
            (Y is not X and np.any(np.isinf(Y)))):
        raise ValueError(
            "+/- Infinite values are not allowed.")

    # Get missing mask for X and Y.T
    mask_X = _get_mask(X, missing_values)

    YT = Y.T
    mask_YT = _get_mask(YT, missing_values)

    # Check if any rows have only missing value
    if np.any(mask_X.sum(axis=1) == X.shape[1])\
            or (Y is not X and np.any(mask_YT.sum(axis=0) == Y.shape[1])):
        raise ValueError("One or more rows only contain missing values.")

    # Both the string "NaN" and a floating point NaN designate NaN as the
    # missing marker (same rule as _get_mask); only complain about stray NaNs
    # when a different marker was requested.
    missing_is_nan = missing_values == "NaN" or np.isnan(missing_values)
    if not missing_is_nan and \
            (np.any(np.isnan(X)) or
             (Y is not X and np.any(np.isnan(Y)))):
        raise ValueError(
            "NaN values present but missing_value = {0}".format(
                missing_values))

    # Get anti-mask and set Y.T's missing to zero
    # (YT is a view, so this writes through to Y)
    NYT = (~mask_YT).astype(np.int32)
    YT[mask_YT] = 0

    # Get X anti-mask and set X's missing to zero
    NX = (~mask_X).astype(np.int32)
    X[mask_X] = 0

    # Calculate distances
    # The following formula was derived in matrix form by:
    # Shreya Bhattarai <[email protected]>

    # Pairs without any common observed coordinate give 0 / 0 here; NaN is
    # the documented result for them, so the floating point warnings are
    # silenced rather than leaked to the caller.
    with np.errstate(divide="ignore", invalid="ignore"):
        distances = (X.shape[1] / (np.dot(NX, NYT))) * \
            (np.dot(X * X, NYT) - 2 * (np.dot(X, YT)) +
             np.dot(NX, YT * YT))
        # The expanded quadratic form can come out marginally negative due to
        # rounding, which would turn into NaN under sqrt. Clip at zero; NaN
        # entries are propagated unchanged by np.maximum.
        np.maximum(distances, 0, out=distances)

    if X is Y:
        # Ensure that distances between vectors and themselves are set to 0.0.
        # This may not be the case due to floating point rounding errors.
        distances.flat[::distances.shape[0] + 1] = 0.0

    return distances if squared else np.sqrt(distances, out=distances)


def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean",
batch_size=500, metric_kwargs=None):
"""Compute minimum distances between one point and a set of points.
Expand Down Expand Up @@ -1040,6 +1186,7 @@ def chi2_kernel(X, Y=None, gamma=1.):
'l1': manhattan_distances,
'manhattan': manhattan_distances,
'precomputed': None, # HACK: precomputed is always allowed, never called
'masked_euclidean': masked_euclidean_distances,
}


Expand All @@ -1052,16 +1199,17 @@ def distance_metrics():

The valid distance metrics, and the function they map to, are:

============ ====================================
metric Function
============ ====================================
'cityblock' metrics.pairwise.manhattan_distances
'cosine' metrics.pairwise.cosine_distances
'euclidean' metrics.pairwise.euclidean_distances
'l1' metrics.pairwise.manhattan_distances
'l2' metrics.pairwise.euclidean_distances
'manhattan' metrics.pairwise.manhattan_distances
============ ====================================
=================== ============================================
metric Function
=================== ============================================
'cityblock' metrics.pairwise.manhattan_distances
'cosine' metrics.pairwise.cosine_distances
'euclidean' metrics.pairwise.euclidean_distances
'l1' metrics.pairwise.manhattan_distances
'l2' metrics.pairwise.euclidean_distances
'manhattan' metrics.pairwise.manhattan_distances
'masked_euclidean' metrics.pairwise.masked_euclidean_distances
=================== ============================================

Read more in the :ref:`User Guide <metrics>`.

Expand Down Expand Up @@ -1128,7 +1276,10 @@ def _pairwise_callable(X, Y, metric, **kwds):
'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski',
'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
'russellrao', 'seuclidean', 'sokalmichener',
'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"]
'sokalsneath', 'sqeuclidean', 'yule', "wminkowski",
'masked_euclidean']

# Metrics for which pairwise_distances checks the input for samples that
# consist solely of missing values before computing distances.
_MASKED_SUPPORTED_METRICS = ['masked_euclidean']


def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
Expand All @@ -1149,6 +1300,7 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):

- From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
'manhattan']. These metrics support sparse matrix inputs.
Also, ['masked_euclidean'] but it does not yet support sparse matrices.

- From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
Expand Down Expand Up @@ -1216,6 +1368,14 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
"Valid metrics are %s, or 'precomputed', or a "
"callable" % (metric, _VALID_METRICS))

if metric in _MASKED_SUPPORTED_METRICS:
missing_values = kwds.get("missing_values") if kwds.get(
"missing_values") is not None else np.nan

if(np.any(_get_mask(X, missing_values).sum(axis=1) == X.shape[1])):
raise ValueError(
"One or more samples(s) only have missing values.")

if metric == "precomputed":
X, _ = check_pairwise_arrays(X, Y, precomputed=True)
return X
Expand Down
76 changes: 76 additions & 0 deletions sklearn/metrics/tests/test_pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from sklearn.externals.six import iteritems

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import masked_euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import chi2_kernel, additive_chi2_kernel
Expand Down Expand Up @@ -56,6 +57,15 @@ def test_pairwise_distances():
S = pairwise_distances(X, Y, metric="euclidean")
S2 = euclidean_distances(X, Y)
assert_array_almost_equal(S, S2)
# Check to ensure NaNs work with pairwise_distances.
X_masked = rng.random_sample((5, 4))
Y_masked = rng.random_sample((2, 4))
X_masked[0, 0] = np.nan
Y_masked[0, 0] = np.nan
S_masked = pairwise_distances(X_masked, Y_masked,
metric="masked_euclidean")
S2_masked = masked_euclidean_distances(X_masked, Y_masked)
assert_array_almost_equal(S_masked, S2_masked)
# Test with tuples as X and Y
X_tuples = tuple([tuple([v for v in row]) for row in X])
Y_tuples = tuple([tuple([v for v in row]) for row in Y])
Expand Down Expand Up @@ -407,6 +417,72 @@ def test_euclidean_distances():
assert_greater(np.max(np.abs(wrong_D - D1)), .01)


def test_masked_euclidean_distances():
    """Exercise masked_euclidean_distances on inputs with missing values."""
    nan = np.nan

    # Reference matrices with missing entries scattered over rows
    X = np.array([[1., nan, 3., 4., 2.],
                  [nan, 4., 6., 1., nan],
                  [3., nan, nan, nan, 1.]])
    Y = np.array([[nan, 7., 7., nan, 2.],
                  [nan, nan, 5., 4., 7.],
                  [nan, nan, nan, 4., 5.]])

    # Plain and squared results must agree with each other
    plain = masked_euclidean_distances(X, Y, missing_values="NaN")
    squared_result = masked_euclidean_distances(X, Y, squared=True,
                                                missing_values="NaN")
    assert_array_almost_equal(plain**2, squared_result)

    # Squared distances against precomputed values
    expected_squared = np.array(
        [[40., 48.33333331, 22.5],
         [25., 25., 45.],
         [5., 180., 80.]])
    recomputed_squared = masked_euclidean_distances(X, Y, squared=True,
                                                    missing_values="NaN")
    assert_array_almost_equal(expected_squared, recomputed_squared)

    # Single pair checked against the explicit weighted formula
    first_pair = masked_euclidean_distances(X[:1], Y[:1], squared=True)
    assert_array_almost_equal(
        first_pair,
        [[5.0/2.0 * ((7-3)**2 + (2-2)**2)]])

    # Implicit Y (None) must match passing X explicitly as Y
    implicit_self = masked_euclidean_distances(X, missing_values="NaN")
    explicit_self = masked_euclidean_distances(X, X, missing_values="NaN")
    assert_array_almost_equal(implicit_self, explicit_self)

    # NaN present while a different missing marker is requested
    assert_raises(ValueError, masked_euclidean_distances, X, Y,
                  missing_values=1)

    # Infinite entries are rejected
    X_inf = X.copy()
    X_inf[0, 0] = np.inf
    assert_raises(ValueError, masked_euclidean_distances, X_inf, Y)

    # A row made up entirely of NaN is rejected, in X as well as in Y
    X_nan_row = X.copy()
    X_nan_row[2] = nan
    Y_nan_row = Y.copy()
    Y_nan_row[2] = nan
    assert_raises(ValueError, masked_euclidean_distances, X_nan_row, Y)
    assert_raises(ValueError, masked_euclidean_distances, X, Y_nan_row)

    # copy=True and copy=False must give the same distances.
    # Note: the copy=False call alters X and Y, so it has to come last.
    with_copy = masked_euclidean_distances(X, Y, copy=True)
    without_copy = masked_euclidean_distances(X, Y, copy=False)
    assert_array_almost_equal(with_copy, without_copy)


def test_cosine_distances():
# Check the pairwise Cosine distances computation
rng = np.random.RandomState(1337)
Expand Down
Loading