scikit-learn · xuewei4d · Mar 28, 2015 · amueller · Apr 1, 2015 · amueller
diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
@@ -426,6 +426,56 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean",
                                          metric_kwargs)[0]
 
 
+def haversine_distances(X, Y=None):
+    r"""Compute the haversine distance between samples in X and Y.
+
+    The haversine distance is the angular distance between two points on
+    the surface of a sphere. The first coordinate of each point is assumed
+    to be the latitude, the second is the longitude, given in radians.
+    A ValueError is raised unless X and Y both have exactly 2 columns.
+
+    .. math::
+       D(x, y) = 2\arcsin[\sqrt{\sin^2((x_1 - y_1) / 2)
+                 + \cos(x_1)\cos(y_1)\sin^2((x_2 - y_2) / 2)}]
+
+    Parameters
+    ----------
+    X : {array-like}, shape (n_samples_1, 2)
+        Points given as (latitude, longitude) in radians.
+    Y : {array-like}, shape (n_samples_2, 2), optional
+        Second set of points; check_pairwise_arrays handles Y=None.
+
+    Returns
+    -------
+    distances : {array}, shape (n_samples_1, n_samples_2)
+        Angular distances in radians.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn.metrics.pairwise import haversine_distances
+    >>> X = np.array([[0, np.pi], [np.pi/2, 0]])
+    >>> Y = np.array([[-np.pi/2, 0], [-np.pi/2, -np.pi]])
+    >>> haversine_distances(X, Y) # doctest: +ELLIPSIS
+    array([[ 1.57...,  1.57...],
+           [ 3.14...,  3.14...]])
+
+    References
+    ----------
+    * Haversine formula http://en.wikipedia.org/wiki/Haversine_formula
+    """
+    X, Y = check_pairwise_arrays(X, Y)
+    if X.shape[1] != 2 or Y.shape[1] != 2:
+        raise ValueError("Haversine distance only valid in 2 dimensions")
+
+    # Squared sines of the half-differences: (n_samples_1, n_samples_2, 2).
+    hav = np.square(np.sin((X[:, np.newaxis, :] - Y[np.newaxis, :, :]) * 0.5))
+    cos_lat = np.outer(np.cos(X[:, 0]), np.cos(Y[:, 0]))
+    # Rounding can push the sum just above 1, where arcsin would give NaN.
+    a = np.clip(hav[:, :, 0] + cos_lat * hav[:, :, 1], 0.0, 1.0)
+    return 2 * np.arcsin(np.sqrt(a))
+
+
 def manhattan_distances(X, Y=None, sum_over_features=True,
                         size_threshold=5e8):
     """ Compute the L1 distances between the vectors in X and Y.
@@ -920,6 +970,7 @@ def chi2_kernel(X, Y=None, gamma=1.):
     'cityblock': manhattan_distances,
     'cosine': cosine_distances,
     'euclidean': euclidean_distances,
+    'haversine': haversine_distances,
     'l2': euclidean_distances,
     'l1': manhattan_distances,
     'manhattan': manhattan_distances, }
@@ -940,6 +991,7 @@ def distance_metrics():
     'cityblock'      metrics.pairwise.manhattan_distances
     'cosine'         metrics.pairwise.cosine_distances
     'euclidean'      metrics.pairwise.euclidean_distances
+    'haversine'      metrics.pairwise.haversine_distances
     'l1'             metrics.pairwise.manhattan_distances
     'l2'             metrics.pairwise.euclidean_distances
     'manhattan'      metrics.pairwise.manhattan_distances
@@ -1004,11 +1056,12 @@ def _pairwise_callable(X, Y, metric, **kwds):
 
 
 _VALID_METRICS = ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock',
-                  'braycurtis', 'canberra', 'chebyshev', 'correlation',
-                  'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski',
-                  'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
-                  'russellrao', 'seuclidean', 'sokalmichener',
-                  'sokalsneath', 'sqeuclidean', 'yule', "wminkowski"]
+                  'haversine', 'braycurtis', 'canberra', 'chebyshev',
+                  'correlation', 'cosine', 'dice', 'hamming', 'jaccard',
+                  'kulsinski', 'mahalanobis', 'matching', 'minkowski',
+                  'rogerstanimoto', 'russellrao', 'seuclidean',
+                  'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule',
+                  'wminkowski']
 
 
 def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
@@ -1027,8 +1080,8 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
 
     Valid values for metric are:
 
-    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
-      'manhattan']. These metrics support sparse matrix inputs.
+    - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'haversine',
+      'l1', 'l2', 'manhattan']. All except 'haversine' accept sparse input.
 
     - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
       'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',

diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py
@@ -11,11 +11,13 @@
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import assert_true
+from sklearn.utils.testing import assert_raise_message
 
 from sklearn.externals.six import iteritems
 
 from sklearn.metrics.pairwise import euclidean_distances
 from sklearn.metrics.pairwise import manhattan_distances
+from sklearn.metrics.pairwise import haversine_distances
 from sklearn.metrics.pairwise import linear_kernel
 from sklearn.metrics.pairwise import chi2_kernel, additive_chi2_kernel
 from sklearn.metrics.pairwise import polynomial_kernel
@@ -42,73 +44,105 @@
 def test_pairwise_distances():
     # Test the pairwise_distance helper function.
     rng = np.random.RandomState(0)
+
     # Euclidean distance should be equivalent to calling the function.
     X = rng.random_sample((5, 4))
     S = pairwise_distances(X, metric="euclidean")
     S2 = euclidean_distances(X)
     assert_array_almost_equal(S, S2)
+
     # Euclidean distance, with Y != X.
     Y = rng.random_sample((2, 4))
     S = pairwise_distances(X, Y, metric="euclidean")
     S2 = euclidean_distances(X, Y)
     assert_array_almost_equal(S, S2)
+
     # Test with tuples as X and Y
     X_tuples = tuple([tuple([v for v in row]) for row in X])
     Y_tuples = tuple([tuple([v for v in row]) for row in Y])
     S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
     assert_array_almost_equal(S, S2)
+
+    # Test haversine distance
+    # The data should be valid latitude and longitude
+    X = rng.random_sample((5, 2))
+    X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi/2
+    X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi
+    S = pairwise_distances(X, metric="haversine")
+    S2 = haversine_distances(X)
+    assert_array_almost_equal(S, S2)
+
+    # Test haversine distance, with Y != X
+    Y = rng.random_sample((2, 2))
+    Y[:, 0] = (Y[:, 0] - 0.5)*2*np.pi/2
+    Y[:, 1] = (Y[:, 1] - 0.5)*2*np.pi
+    S = pairwise_distances(X, Y, metric="haversine")
+    S2 = haversine_distances(X, Y)
+    assert_array_almost_equal(S, S2)
+
     # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial.
     S = pairwise_distances(X, metric="cityblock")
     S2 = pairwise_distances(X, metric=cityblock)
     assert_equal(S.shape[0], S.shape[1])
     assert_equal(S.shape[0], X.shape[0])
     assert_array_almost_equal(S, S2)
+
     # The manhattan metric should be equivalent to cityblock.
     S = pairwise_distances(X, Y, metric="manhattan")
     S2 = pairwise_distances(X, Y, metric=cityblock)
     assert_equal(S.shape[0], X.shape[0])
     assert_equal(S.shape[1], Y.shape[0])
     assert_array_almost_equal(S, S2)
+
     # Low-level function for manhattan can divide in blocks to avoid
     # using too much memory during the broadcasting
     S3 = manhattan_distances(X, Y, size_threshold=10)
     assert_array_almost_equal(S, S3)
+
     # Test cosine as a string metric versus cosine callable
     # "cosine" uses sklearn metric, cosine (function) is scipy.spatial
     S = pairwise_distances(X, Y, metric="cosine")
     S2 = pairwise_distances(X, Y, metric=cosine)
     assert_equal(S.shape[0], X.shape[0])
     assert_equal(S.shape[1], Y.shape[0])
     assert_array_almost_equal(S, S2)
+
     # Tests that precomputed metric returns pointer to, and not copy of, X.
     S = np.dot(X, X.T)
     S2 = pairwise_distances(S, metric="precomputed")
     assert_true(S is S2)
+
     # Test with sparse X and Y,
     # currently only supported for Euclidean, L1 and cosine.
     X_sparse = csr_matrix(X)
     Y_sparse = csr_matrix(Y)
     S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
     S2 = euclidean_distances(X_sparse, Y_sparse)
     assert_array_almost_equal(S, S2)
+
     S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
     S2 = cosine_distances(X_sparse, Y_sparse)
     assert_array_almost_equal(S, S2)
+
     S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
     S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
     assert_array_almost_equal(S, S2)
+
     S2 = manhattan_distances(X, Y)
     assert_array_almost_equal(S, S2)
+
     # Test with scipy.spatial.distance metric, with a kwd
     kwds = {"p": 2.0}
     S = pairwise_distances(X, Y, metric="minkowski", **kwds)
     S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
     assert_array_almost_equal(S, S2)
+
     # same with Y = None
     kwds = {"p": 2.0}
     S = pairwise_distances(X, metric="minkowski", **kwds)
     S2 = pairwise_distances(X, metric=minkowski, **kwds)
     assert_array_almost_equal(S, S2)
+
     # Test that scipy distance metrics throw an error if sparse matrix given
     assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
     assert_raises(TypeError, pairwise_distances, X, Y_sparse,
@@ -335,8 +369,32 @@ def test_euclidean_distances():
     assert_array_almost_equal(D, [[1., 2.]])
 
 
+def test_haversine_distances():
+    # Compare the vectorized haversine distances with a scalar reference.
+    def slow_haversine_distances(x, y):
+        sin_lat = np.sin((y[0] - x[0]) / 2)
+        sin_lon = np.sin((y[1] - x[1]) / 2)
+        a = sin_lat ** 2 + np.cos(x[0]) * np.cos(y[0]) * sin_lon ** 2
+        return 2 * np.arcsin(np.sqrt(a))
+    rng = np.random.RandomState(0)
+    # Valid latitudes in [-pi/2, pi/2), longitudes in [-pi, pi).
+    scale = np.array([np.pi, 2 * np.pi])
+    X = (rng.random_sample((5, 2)) - 0.5) * scale
+    Y = (rng.random_sample((10, 2)) - 0.5) * scale
+    D1 = np.array([[slow_haversine_distances(x, y) for y in Y] for x in X])
+    D2 = haversine_distances(X, Y)
+    assert_array_almost_equal(D1, D2)
+
+    # Inputs whose number of features is not 2 must be rejected.
+    X = rng.random_sample((10, 3))
+    assert_raise_message(ValueError,
+                         "Haversine distance only valid in 2 dimensions",
+                         haversine_distances, X)
+
+
 # Paired distances
 
+
 def test_paired_euclidean_distances():
     # Check the paired Euclidean distances computation
     X = [[0], [0]]

diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
@@ -11,8 +11,10 @@
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_true
 from sklearn.utils.testing import assert_warns
+from sklearn.utils.testing import assert_raise_message
 from sklearn.utils.testing import ignore_warnings
 from sklearn.utils.validation import check_random_state
+
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn import neighbors, datasets
 
@@ -797,7 +799,7 @@ def test_neighbors_badargs():
                   neighbors.NearestNeighbors,
                   algorithm='blah')
 
-    X = rng.random_sample((10, 2))
+    X = rng.random_sample((10, 3))
     Xsparse = csr_matrix(X)
     y = np.ones(10)
 
@@ -812,13 +814,6 @@ def test_neighbors_badargs():
                       cls, p=-1)
         assert_raises(ValueError,
                       cls, algorithm='blah')
-        nbrs = cls(algorithm='ball_tree', metric='haversine')
-        assert_raises(ValueError,
-                      nbrs.predict,
-                      X)
-        assert_raises(ValueError,
-                      ignore_warnings(nbrs.fit),
-                      Xsparse, y)
         nbrs = cls()
         assert_raises(ValueError,
                       nbrs.fit,
@@ -830,6 +825,12 @@ def test_neighbors_badargs():
         assert_raises(ValueError,
                       nbrs.predict,
                       [])
+        nbrs = cls(metric='haversine', algorithm='brute')
+        nbrs.fit(X, y)
+        assert_raise_message(ValueError,
+                             "Haversine distance only valid in 2 dimensions",
+                             nbrs.predict,
+                             X)
 
     nbrs = neighbors.NearestNeighbors().fit(X)
 
@@ -857,7 +858,8 @@ def test_neighbors_metrics(n_samples=20, n_features=3,
                ('chebyshev', {}),
                ('seuclidean', dict(V=rng.rand(n_features))),
                ('wminkowski', dict(p=3, w=rng.rand(n_features))),
-               ('mahalanobis', dict(VI=VI))]
+               ('mahalanobis', dict(VI=VI)),
+               ('haversine', {})]
     algorithms = ['brute', 'ball_tree', 'kd_tree']
     X = rng.rand(n_samples, n_features)
 
@@ -880,8 +882,13 @@ def test_neighbors_metrics(n_samples=20, n_features=3,
                                                algorithm=algorithm,
                                                metric=metric, p=p,
                                                metric_params=metric_params)
-            neigh.fit(X)
-            results.append(neigh.kneighbors(test, return_distance=True))
+            if metric == 'haversine':
+                neigh.fit(X[:, :2])
+                results.append(neigh.kneighbors(test[:, :2],
+                                                return_distance=True))
+            else:
+                neigh.fit(X)
+                results.append(neigh.kneighbors(test, return_distance=True))
 
         assert_array_almost_equal(results[0][0], results[1][0])
         assert_array_almost_equal(results[0][1], results[1][1])