Thanks for visiting codestin.com
Credit goes to github.com

Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 60 additions & 7 deletions sklearn/metrics/pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,56 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean",
metric_kwargs)[0]


def haversine_distances(X, Y=None):
    r"""Compute the haversine distance between samples in X and Y.

    The haversine (spherical) distance is the angular distance between two
    points on the surface of a sphere. The first coordinate of each point is
    assumed to be the latitude, the second is the longitude, both given in
    radians. The dimension of the points must be 2.

    .. math::
       D(x, y) = 2\arcsin[\sqrt{\sin^2((x_1 - y_1) / 2)
                                + \cos(x_1)\cos(y_1)\sin^2((x_2 - y_2) / 2)}]

    Parameters
    ----------
    X : {array-like}, shape (n_samples_1, 2)
        Points given as (latitude, longitude) in radians.

    Y : {array-like}, shape (n_samples_2, 2), optional
        Points given as (latitude, longitude) in radians. Handling of
        ``Y=None`` is delegated to ``check_pairwise_arrays``.

    Returns
    -------
    distances : {array}, shape (n_samples_1, n_samples_2)
        Angular distances in radians, in the range [0, pi].

    Raises
    ------
    ValueError
        If X or Y does not have exactly two columns.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics.pairwise import haversine_distances
    >>> X = np.array([[0, np.pi], [np.pi/2, 0]])
    >>> Y = np.array([[-np.pi/2, 0], [-np.pi/2, -np.pi]])
    >>> haversine_distances(X, Y)  # doctest: +ELLIPSIS
    array([[ 1.57...,  1.57...],
           [ 3.14...,  3.14...]])

    References
    ----------
    * Haversine formula http://en.wikipedia.org/wiki/Haversine_formula
    """
    # Local import keeps this block self-contained regardless of which
    # scipy names the module imports at file level.
    from scipy.sparse import issparse

    X, Y = check_pairwise_arrays(X, Y)

    if X.shape[1] != 2 or Y.shape[1] != 2:
        raise ValueError("Haversine distance only valid in 2 dimensions")

    # The broadcasting below needs dense arrays; the inputs only have two
    # columns, so densifying sparse input is cheap.
    if issparse(X):
        X = X.toarray()
    if issparse(Y):
        Y = Y.toarray()

    # sin^2 of half the pairwise coordinate differences,
    # shape (n_samples_1, n_samples_2, 2).
    sin_sq_half = np.square(
        np.sin((X[:, np.newaxis, :] - Y[np.newaxis, :, :]) * 0.5))
    cos_lat = np.outer(np.cos(X[:, 0]), np.cos(Y[:, 0]))
    hav = sin_sq_half[:, :, 0] + cos_lat * sin_sq_half[:, :, 1]

    # Rounding can push the haversine term marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    np.clip(hav, 0.0, 1.0, out=hav)
    return 2 * np.arcsin(np.sqrt(hav))


def manhattan_distances(X, Y=None, sum_over_features=True,
size_threshold=5e8):
""" Compute the L1 distances between the vectors in X and Y.
Expand Down Expand Up @@ -920,6 +970,7 @@ def chi2_kernel(X, Y=None, gamma=1.):
'cityblock': manhattan_distances,
'cosine': cosine_distances,
'euclidean': euclidean_distances,
'haversine': haversine_distances,
'l2': euclidean_distances,
'l1': manhattan_distances,
'manhattan': manhattan_distances, }
Expand All @@ -940,6 +991,7 @@ def distance_metrics():
'cityblock' metrics.pairwise.manhattan_distances
'cosine' metrics.pairwise.cosine_distances
'euclidean' metrics.pairwise.euclidean_distances
'haversine' metrics.pairwise.haversine_distances
'l1' metrics.pairwise.manhattan_distances
'l2' metrics.pairwise.euclidean_distances
'manhattan' metrics.pairwise.manhattan_distances
Expand Down Expand Up @@ -1004,11 +1056,12 @@ def _pairwise_callable(X, Y, metric, **kwds):


_VALID_METRICS = ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock',
'braycurtis', 'canberra', 'chebyshev', 'correlation',
'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski',
'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
'russellrao', 'seuclidean', 'sokalmichener',
'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"]
'haversine', 'braycurtis', 'canberra', 'chebyshev',
'correlation', 'cosine', 'dice', 'hamming', 'jaccard',
'kulsinski', 'mahalanobis', 'matching', 'minkowski',
'rogerstanimoto', 'russellrao', 'seuclidean',
'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule',
'wminkowski']


def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
Expand All @@ -1027,8 +1080,8 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):

Valid values for metric are:

- From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
'manhattan']. These metrics support sparse matrix inputs.
- From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'haversine',
'l1', 'l2', 'manhattan']. These metrics support sparse matrix inputs.

- From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
Expand Down
58 changes: 58 additions & 0 deletions sklearn/metrics/tests/test_pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_raise_message

from sklearn.externals.six import iteritems

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import haversine_distances
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import chi2_kernel, additive_chi2_kernel
from sklearn.metrics.pairwise import polynomial_kernel
Expand All @@ -42,73 +44,105 @@
def test_pairwise_distances():
# Test the pairwise_distance helper function.
rng = np.random.RandomState(0)

# Euclidean distance should be equivalent to calling the function.
X = rng.random_sample((5, 4))
S = pairwise_distances(X, metric="euclidean")
S2 = euclidean_distances(X)
assert_array_almost_equal(S, S2)

# Euclidean distance, with Y != X.
Y = rng.random_sample((2, 4))
S = pairwise_distances(X, Y, metric="euclidean")
S2 = euclidean_distances(X, Y)
assert_array_almost_equal(S, S2)

# Test with tuples as X and Y
X_tuples = tuple([tuple([v for v in row]) for row in X])
Y_tuples = tuple([tuple([v for v in row]) for row in Y])
S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
assert_array_almost_equal(S, S2)

# Test haversine distance
# The data should be valid latitude and longitude
X = rng.random_sample((5, 2))
X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi/2
X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi
S = pairwise_distances(X, metric="haversine")
S2 = haversine_distances(X)
assert_array_almost_equal(S, S2)

# Test haversine distance, with Y != X
Y = rng.random_sample((2, 2))
Y[:, 0] = (Y[:, 0] - 0.5)*2*np.pi/2
Y[:, 1] = (Y[:, 1] - 0.5)*2*np.pi
S = pairwise_distances(X, Y, metric="haversine")
S2 = haversine_distances(X, Y)
assert_array_almost_equal(S, S2)

# "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial.
S = pairwise_distances(X, metric="cityblock")
S2 = pairwise_distances(X, metric=cityblock)
assert_equal(S.shape[0], S.shape[1])
assert_equal(S.shape[0], X.shape[0])
assert_array_almost_equal(S, S2)

# The manhattan metric should be equivalent to cityblock.
S = pairwise_distances(X, Y, metric="manhattan")
S2 = pairwise_distances(X, Y, metric=cityblock)
assert_equal(S.shape[0], X.shape[0])
assert_equal(S.shape[1], Y.shape[0])
assert_array_almost_equal(S, S2)

# Low-level function for manhattan can divide in blocks to avoid
# using too much memory during the broadcasting
S3 = manhattan_distances(X, Y, size_threshold=10)
assert_array_almost_equal(S, S3)

# Test cosine as a string metric versus cosine callable
# "cosine" uses sklearn metric, cosine (function) is scipy.spatial
S = pairwise_distances(X, Y, metric="cosine")
S2 = pairwise_distances(X, Y, metric=cosine)
assert_equal(S.shape[0], X.shape[0])
assert_equal(S.shape[1], Y.shape[0])
assert_array_almost_equal(S, S2)

# Tests that precomputed metric returns pointer to, and not copy of, X.
S = np.dot(X, X.T)
S2 = pairwise_distances(S, metric="precomputed")
assert_true(S is S2)

# Test with sparse X and Y,
# currently only supported for Euclidean, L1 and cosine.
X_sparse = csr_matrix(X)
Y_sparse = csr_matrix(Y)
S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
S2 = euclidean_distances(X_sparse, Y_sparse)
assert_array_almost_equal(S, S2)

S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
S2 = cosine_distances(X_sparse, Y_sparse)
assert_array_almost_equal(S, S2)

S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
assert_array_almost_equal(S, S2)

S2 = manhattan_distances(X, Y)
assert_array_almost_equal(S, S2)

# Test with scipy.spatial.distance metric, with a kwd
kwds = {"p": 2.0}
S = pairwise_distances(X, Y, metric="minkowski", **kwds)
S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
assert_array_almost_equal(S, S2)

# same with Y = None
kwds = {"p": 2.0}
S = pairwise_distances(X, metric="minkowski", **kwds)
S2 = pairwise_distances(X, metric=minkowski, **kwds)
assert_array_almost_equal(S, S2)

# Test that scipy distance metrics throw an error if sparse matrix given
assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
assert_raises(TypeError, pairwise_distances, X, Y_sparse,
Expand Down Expand Up @@ -335,8 +369,32 @@ def test_euclidean_distances():
assert_array_almost_equal(D, [[1., 2.]])


def test_haversine_distances():
    # Compare the vectorised haversine distances against a scalar,
    # pair-by-pair reference implementation.
    def slow_haversine_distances(x, y):
        # Reference formula for a single pair of (lat, lon) points.
        half_dlat = (y[0] - x[0]) / 2
        half_dlon = (y[1] - x[1]) / 2
        lat_term = np.sin(half_dlat) ** 2
        lon_term = np.cos(x[0]) * np.cos(y[0]) * np.sin(half_dlon) ** 2
        return 2 * np.arcsin(np.sqrt(lat_term + lon_term))

    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 2))
    Y = rng.random_sample((10, 2))

    expected = np.empty((X.shape[0], Y.shape[0]))
    for i, x in enumerate(X):
        for j, y in enumerate(Y):
            expected[i, j] = slow_haversine_distances(x, y)
    assert_array_almost_equal(expected, haversine_distances(X, Y))

    # Inputs whose number of features is not 2 must be rejected
    X_bad = rng.random_sample((10, 3))
    assert_raise_message(ValueError,
                         "Haversine distance only valid in 2 dimensions",
                         haversine_distances, X_bad)


# Paired distances


def test_paired_euclidean_distances():
# Check the paired Euclidean distances computation
X = [[0], [0]]
Expand Down
29 changes: 18 additions & 11 deletions sklearn/neighbors/tests/test_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_warns
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.validation import check_random_state

from sklearn.metrics.pairwise import pairwise_distances
from sklearn import neighbors, datasets

Expand Down Expand Up @@ -797,7 +799,7 @@ def test_neighbors_badargs():
neighbors.NearestNeighbors,
algorithm='blah')

X = rng.random_sample((10, 2))
X = rng.random_sample((10, 3))
Xsparse = csr_matrix(X)
y = np.ones(10)

Expand All @@ -812,13 +814,6 @@ def test_neighbors_badargs():
cls, p=-1)
assert_raises(ValueError,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This still crashes, right? Why did you remove the test?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So you got "error not raised" here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry this was my suggestion.

So you got "error not raised" here?

Yes. Travis failed owing to this. I assumed since haversine was not previously implemented, this test was introduced.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but I thought it is still not implemented for the ball_tree. Is it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is implemented: see L285 in test_ball_tree.py and L22 in ball_tree.pyx. The test is there to make sure haversine is not implemented for the four neighbors classifiers and regressors.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, ok. The test was slightly confusing to me.

cls, algorithm='blah')
nbrs = cls(algorithm='ball_tree', metric='haversine')
assert_raises(ValueError,
nbrs.predict,
X)
assert_raises(ValueError,
ignore_warnings(nbrs.fit),
Xsparse, y)
nbrs = cls()
assert_raises(ValueError,
nbrs.fit,
Expand All @@ -830,6 +825,12 @@ def test_neighbors_badargs():
assert_raises(ValueError,
nbrs.predict,
[])
nbrs = cls(metric='haversine', algorithm='brute')
nbrs.fit(X, y)
assert_raise_message(ValueError,
"Haversine distance only valid in 2 dimensions",
nbrs.predict,
X)

nbrs = neighbors.NearestNeighbors().fit(X)

Expand Down Expand Up @@ -857,7 +858,8 @@ def test_neighbors_metrics(n_samples=20, n_features=3,
('chebyshev', {}),
('seuclidean', dict(V=rng.rand(n_features))),
('wminkowski', dict(p=3, w=rng.rand(n_features))),
('mahalanobis', dict(VI=VI))]
('mahalanobis', dict(VI=VI)),
('haversine', {})]
algorithms = ['brute', 'ball_tree', 'kd_tree']
X = rng.rand(n_samples, n_features)

Expand All @@ -880,8 +882,13 @@ def test_neighbors_metrics(n_samples=20, n_features=3,
algorithm=algorithm,
metric=metric, p=p,
metric_params=metric_params)
neigh.fit(X)
results.append(neigh.kneighbors(test, return_distance=True))
if metric == 'haversine':
neigh.fit(X[:, :2])
results.append(neigh.kneighbors(test[:, :2],
return_distance=True))
else:
neigh.fit(X)
results.append(neigh.kneighbors(test, return_distance=True))

assert_array_almost_equal(results[0][0], results[1][0])
assert_array_almost_equal(results[0][1], results[1][1])
Expand Down