def haversine_distances(X, Y=None):
    r"""Compute the haversine distance between samples in X and Y.

    The haversine distance is the angular (great-circle) distance between
    two points on the surface of a sphere. The first coordinate of each
    point is assumed to be the latitude, the second is the longitude, both
    given in radians. The dimension of the points must be 2.

    .. math::
       D(x, y) = 2\arcsin[\sqrt{\sin^2((x_1 - y_1) / 2)
                                + \cos(x_1)\cos(y_1)\sin^2((x_2 - y_2) / 2)}]

    The result is an angle in radians, i.e. the distance on a unit sphere;
    multiply it by the sphere radius to obtain a length.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples_1, 2)
        Points given as (latitude, longitude) in radians.

    Y : {array-like, sparse matrix}, shape (n_samples_2, 2), optional
        Points given as (latitude, longitude) in radians. The value is
        forwarded unchanged to ``check_pairwise_arrays``, which is expected
        to substitute ``X`` when ``Y`` is None.

    Returns
    -------
    distances : array, shape (n_samples_1, n_samples_2)
        ``distances[i, j]`` is the haversine distance between ``X[i]`` and
        ``Y[j]``.

    Raises
    ------
    ValueError
        If X or Y does not have exactly two columns.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics.pairwise import haversine_distances
    >>> X = np.array([[0, np.pi], [np.pi/2, 0]])
    >>> Y = np.array([[-np.pi/2, 0], [-np.pi/2, -np.pi]])
    >>> haversine_distances(X, Y)  # doctest: +ELLIPSIS
    array([[ 1.57..., 1.57...],
           [ 3.14..., 3.14...]])

    References
    ----------
    * Haversine formula http://en.wikipedia.org/wiki/Haversine_formula
    """
    # Imported locally so this function does not depend on the module's
    # import block already providing the name.
    from scipy.sparse import issparse

    X, Y = check_pairwise_arrays(X, Y)

    if X.shape[1] != 2 or Y.shape[1] != 2:
        raise ValueError("Haversine distance only valid in 2 dimensions")

    # The broadcasting below needs dense ndarrays. Inputs have exactly two
    # columns at this point, so densifying a sparse matrix is cheap.
    if issparse(X):
        X = X.toarray()
    if issparse(Y):
        Y = Y.toarray()

    # sin^2 of half the coordinate differences for every (x, y) pair:
    # index 0 on the last axis is latitude, index 1 is longitude.
    half_delta = (X[:, np.newaxis, :] - Y[np.newaxis, :, :]) * 0.5
    sin_sq = np.square(np.sin(half_delta))

    cos_lat = np.outer(np.cos(X[:, 0]), np.cos(Y[:, 0]))
    hav = sin_sq[:, :, 0] + cos_lat * sin_sq[:, :, 1]

    # Mathematically hav lies in [0, 1]; floating-point rounding can push it
    # marginally outside (e.g. for antipodal points), which would make
    # sqrt/arcsin return NaN. NaNs already present in the input propagate.
    np.clip(hav, 0, 1, out=hav)

    return 2 * np.arcsin(np.sqrt(hav))
@@ -920,6 +970,7 @@ def chi2_kernel(X, Y=None, gamma=1.): 'cityblock': manhattan_distances, 'cosine': cosine_distances, 'euclidean': euclidean_distances, + 'haversine': haversine_distances, 'l2': euclidean_distances, 'l1': manhattan_distances, 'manhattan': manhattan_distances, } @@ -940,6 +991,7 @@ def distance_metrics(): 'cityblock' metrics.pairwise.manhattan_distances 'cosine' metrics.pairwise.cosine_distances 'euclidean' metrics.pairwise.euclidean_distances + 'haversine' metrics.pairwise.haversine_distances 'l1' metrics.pairwise.manhattan_distances 'l2' metrics.pairwise.euclidean_distances 'manhattan' metrics.pairwise.manhattan_distances @@ -1004,11 +1056,12 @@ def _pairwise_callable(X, Y, metric, **kwds): _VALID_METRICS = ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock', - 'braycurtis', 'canberra', 'chebyshev', 'correlation', - 'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski', - 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', - 'russellrao', 'seuclidean', 'sokalmichener', - 'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"] + 'haversine', 'braycurtis', 'canberra', 'chebyshev', + 'correlation', 'cosine', 'dice', 'hamming', 'jaccard', + 'kulsinski', 'mahalanobis', 'matching', 'minkowski', + 'rogerstanimoto', 'russellrao', 'seuclidean', + 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule', + 'wminkowski'] def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): @@ -1027,8 +1080,8 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds): Valid values for metric are: - - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', - 'manhattan']. These metrics support sparse matrix inputs. + - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'haversine', + 'l1', 'l2', 'manhattan']. These metrics support sparse matrix inputs. 
- From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 43a8eaddb84b4..727e0d078f976 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -11,11 +11,13 @@ from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_true +from sklearn.utils.testing import assert_raise_message from sklearn.externals.six import iteritems from sklearn.metrics.pairwise import euclidean_distances from sklearn.metrics.pairwise import manhattan_distances +from sklearn.metrics.pairwise import haversine_distances from sklearn.metrics.pairwise import linear_kernel from sklearn.metrics.pairwise import chi2_kernel, additive_chi2_kernel from sklearn.metrics.pairwise import polynomial_kernel @@ -42,37 +44,61 @@ def test_pairwise_distances(): # Test the pairwise_distance helper function. rng = np.random.RandomState(0) + # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) S = pairwise_distances(X, metric="euclidean") S2 = euclidean_distances(X) assert_array_almost_equal(S, S2) + # Euclidean distance, with Y != X. 
Y = rng.random_sample((2, 4)) S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) + # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") assert_array_almost_equal(S, S2) + + # Test haversine distance + # The data should be valid latitude and longitude + X = rng.random_sample((5, 2)) + X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi/2 + X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi + S = pairwise_distances(X, metric="haversine") + S2 = haversine_distances(X) + assert_array_almost_equal(S, S2) + + # Test haversine distance, with Y != X + Y = rng.random_sample((2, 2)) + Y[:, 0] = (Y[:, 0] - 0.5)*2*np.pi/2 + Y[:, 1] = (Y[:, 1] - 0.5)*2*np.pi + S = pairwise_distances(X, Y, metric="haversine") + S2 = haversine_distances(X, Y) + assert_array_almost_equal(S, S2) + # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial. S = pairwise_distances(X, metric="cityblock") S2 = pairwise_distances(X, metric=cityblock) assert_equal(S.shape[0], S.shape[1]) assert_equal(S.shape[0], X.shape[0]) assert_array_almost_equal(S, S2) + # The manhattan metric should be equivalent to cityblock. 
S = pairwise_distances(X, Y, metric="manhattan") S2 = pairwise_distances(X, Y, metric=cityblock) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) + # Low-level function for manhattan can divide in blocks to avoid # using too much memory during the broadcasting S3 = manhattan_distances(X, Y, size_threshold=10) assert_array_almost_equal(S, S3) + # Test cosine as a string metric versus cosine callable # "cosine" uses sklearn metric, cosine (function) is scipy.spatial S = pairwise_distances(X, Y, metric="cosine") @@ -80,10 +106,12 @@ def test_pairwise_distances(): assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) + # Tests that precomputed metric returns pointer to, and not copy of, X. S = np.dot(X, X.T) S2 = pairwise_distances(S, metric="precomputed") assert_true(S is S2) + # Test with sparse X and Y, # currently only supported for Euclidean, L1 and cosine. X_sparse = csr_matrix(X) @@ -91,24 +119,30 @@ def test_pairwise_distances(): S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") S2 = euclidean_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) + S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") S2 = cosine_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) + S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan") S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo()) assert_array_almost_equal(S, S2) + S2 = manhattan_distances(X, Y) assert_array_almost_equal(S, S2) + # Test with scipy.spatial.distance metric, with a kwd kwds = {"p": 2.0} S = pairwise_distances(X, Y, metric="minkowski", **kwds) S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) + # same with Y = None kwds = {"p": 2.0} S = pairwise_distances(X, metric="minkowski", **kwds) S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) + # Test that scipy distance 
def test_haversine_distances():
    """Check haversine_distances against a scalar reference and known values."""

    def slow_haversine_distances(x, y):
        # Per-pair haversine formula; x and y are (latitude, longitude)
        # pairs in radians.
        diff_lat = y[0] - x[0]
        diff_lon = y[1] - x[1]
        a = (np.sin(diff_lat / 2) ** 2 +
             np.cos(x[0]) * np.cos(y[0]) * np.sin(diff_lon / 2) ** 2)
        return 2 * np.arcsin(np.sqrt(a))

    def random_lat_lon(rng, n_samples):
        # Spread samples over the whole valid range so that negative
        # coordinates are exercised: latitude in [-pi/2, pi/2) and
        # longitude in [-pi, pi).
        points = rng.random_sample((n_samples, 2))
        points[:, 0] = (points[:, 0] - 0.5) * np.pi
        points[:, 1] = (points[:, 1] - 0.5) * 2 * np.pi
        return points

    rng = np.random.RandomState(0)
    X = random_lat_lon(rng, 5)
    Y = random_lat_lon(rng, 10)

    # Vectorised implementation must agree with the scalar reference.
    D1 = np.array([[slow_haversine_distances(x, y) for y in Y] for x in X])
    D2 = haversine_distances(X, Y)
    assert_array_almost_equal(D1, D2)

    # With Y omitted the result is the self-distance matrix of X:
    # zero diagonal and symmetric.
    D_self = haversine_distances(X)
    assert_array_almost_equal(D_self, haversine_distances(X, X))
    assert_array_almost_equal(np.diag(D_self), np.zeros(X.shape[0]))
    assert_array_almost_equal(D_self, D_self.T)

    # Analytically known values: a quarter and a half of a great circle.
    X_known = np.array([[0, np.pi], [np.pi / 2, 0]])
    Y_known = np.array([[-np.pi / 2, 0], [-np.pi / 2, -np.pi]])
    expected = np.array([[np.pi / 2, np.pi / 2],
                         [np.pi, np.pi]])
    assert_array_almost_equal(haversine_distances(X_known, Y_known),
                              expected)

    # Test haversine distance does not accept X where n_feature != 2
    X = rng.random_sample((10, 3))
    assert_raise_message(ValueError,
                         "Haversine distance only valid in 2 dimensions",
                         haversine_distances, X)
csr_matrix(X) y = np.ones(10) @@ -812,13 +814,6 @@ def test_neighbors_badargs(): cls, p=-1) assert_raises(ValueError, cls, algorithm='blah') - nbrs = cls(algorithm='ball_tree', metric='haversine') - assert_raises(ValueError, - nbrs.predict, - X) - assert_raises(ValueError, - ignore_warnings(nbrs.fit), - Xsparse, y) nbrs = cls() assert_raises(ValueError, nbrs.fit, @@ -830,6 +825,12 @@ def test_neighbors_badargs(): assert_raises(ValueError, nbrs.predict, []) + nbrs = cls(metric='haversine', algorithm='brute') + nbrs.fit(X, y) + assert_raise_message(ValueError, + "Haversine distance only valid in 2 dimensions", + nbrs.predict, + X) nbrs = neighbors.NearestNeighbors().fit(X) @@ -857,7 +858,8 @@ def test_neighbors_metrics(n_samples=20, n_features=3, ('chebyshev', {}), ('seuclidean', dict(V=rng.rand(n_features))), ('wminkowski', dict(p=3, w=rng.rand(n_features))), - ('mahalanobis', dict(VI=VI))] + ('mahalanobis', dict(VI=VI)), + ('haversine', {})] algorithms = ['brute', 'ball_tree', 'kd_tree'] X = rng.rand(n_samples, n_features) @@ -880,8 +882,13 @@ def test_neighbors_metrics(n_samples=20, n_features=3, algorithm=algorithm, metric=metric, p=p, metric_params=metric_params) - neigh.fit(X) - results.append(neigh.kneighbors(test, return_distance=True)) + if metric == 'haversine': + neigh.fit(X[:, :2]) + results.append(neigh.kneighbors(test[:, :2], + return_distance=True)) + else: + neigh.fit(X) + results.append(neigh.kneighbors(test, return_distance=True)) assert_array_almost_equal(results[0][0], results[1][0]) assert_array_almost_equal(results[0][1], results[1][1])