[WIP] Detect precision issues in euclidean distance calculations #12142

Closed · wants to merge 3 commits

11 changes: 11 additions & 0 deletions sklearn/exceptions.py
@@ -96,6 +96,17 @@ class EfficiencyWarning(UserWarning):
    """


class NumericalPrecisionWarning(UserWarning):
    """Warning used to notify the user of a possibly inaccurate computation.

    This warning notifies the user that the numerical accuracy may not be
    optimal, for a reason that may be detailed in the warning message.

    .. versionadded:: 0.21
    """


class FitFailedWarning(RuntimeWarning):
    """Warning class used if there is an error while fitting the estimator.

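Assuming this branch is installed, the sketch below shows how calling code could detect or silence the proposed warning; the shifted data is only an illustrative trigger and is not part of the PR.

import warnings

import numpy as np

from sklearn.exceptions import NumericalPrecisionWarning
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
# Two clouds whose spread (~1) is tiny compared to their distance from the
# origin (~1e8): the configuration the heuristic in this PR tries to flag.
X = 1e8 + rng.randn(50, 5)
Y = 1e8 + rng.randn(30, 5)

with warnings.catch_warnings(record=True) as records:
    warnings.simplefilter("always")
    euclidean_distances(X, Y)
assert any(issubclass(r.category, NumericalPrecisionWarning) for r in records)

# A caller that has decided the inaccuracy is acceptable could silence it:
warnings.filterwarnings("ignore", category=NumericalPrecisionWarning)
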
47 changes: 47 additions & 0 deletions sklearn/metrics/pairwise.py
@@ -18,6 +18,7 @@
from scipy.sparse import csr_matrix
from scipy.sparse import issparse

from ..exceptions import NumericalPrecisionWarning
from ..utils.validation import _num_samples
from ..utils import check_array
from ..utils import gen_even_slices
@@ -27,6 +28,7 @@
from ..utils import Parallel
from ..utils import delayed
from ..utils import effective_n_jobs
from ..utils.fixes import nanpercentile

from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan

@@ -244,6 +246,51 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
    else:
        YY = row_norms(Y, squared=True)[np.newaxis, :]

    if XX.size > 0 and YY.size > 0:
        # Heuristic to detect possible numerical precision issues with the
        # quadratic expansion used here, which occur when the distance
        # between the considered vectors is much smaller than their norms.
        #
        # Here, a warning is raised if all considered vectors lie within an
        # n-dimensional spherical shell whose thickness to diameter ratio is
        # lower than the threshold value of 1e-7 in 64 bit. While such a
        # configuration is generally uncommon, it occurs when a distribution
        # with an inertia (or mean radius) of I is shifted by I/threshold
        # away from the origin.

        # min and max of sample norms, ignoring outliers
        XX_min = nanpercentile(XX, 5)**0.5
        XX_max = nanpercentile(XX, 95)**0.5
        YY_min = nanpercentile(YY, 5)**0.5
        YY_max = nanpercentile(YY, 95)**0.5
        XY_min = min(XX_min, YY_min)
        XY_max = max(XX_max, YY_max)

        # only float64 and float32 dtypes are possible here
        if X.dtype == np.float64:
            threshold = 1e-7
        elif X.dtype == np.float32:
            threshold = 1e-3

        if (abs(XY_max - XY_min) < threshold*XY_max and
                # ignore null vectors
                XX_max > 0.0 and YY_max > 0.0
                # ignore comparison between a vector and itself
                and XX_max != YY_max):

            warning_message = (
                "with the provided data, computing "
                "Euclidean distances with the quadratic expansion may "
                "lead to numerically inaccurate results. ")
            if not issparse(X):
                warning_message += (
                    "Consider standardizing features by removing the mean, "
                    "or setting globally "
                    "euclidean_distances_algorithm='exact' for slower but "

Member:
should we not just back off to using the exact algorithm in this case?

Member Author:
Yes, that would probably be the best outcome. But I would like to explore more fine-grained detection of problematic points, instead of considering just the min/max of the distribution for this to actually be useful.

Contributor:
Standardizing, centering etc. will not help in all cases. Consider the 1d data set -10001,-10000,10000,10001 with FP32; essentially the symmetric version of the example I provided before. The mean is zero; and scaling it will not resolve the relative closeness of the pairs of values.
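
A quick standalone numpy check of this example (spelling out the quadratic expansion directly, outside scikit-learn) confirms the point: the data is already zero-mean, yet the float32 expansion collapses the distance between -10001 and -10000 to 0.

import numpy as np

# The 1-d symmetric example above: already zero-mean, so centering cannot
# help, yet neighbouring values are close relative to their magnitude.
x = np.array([[-10001.], [-10000.], [10000.], [10001.]], dtype=np.float32)

# Squared distances via the expansion ||a||^2 - 2 a.b + ||b||^2, computed
# entirely in float32.
sq_norms = (x ** 2).sum(axis=1)
d2 = sq_norms[:, None] - 2 * (x @ x.T) + sq_norms[None, :]

print(d2[0, 1])                  # 0.0, while the true squared distance is 1
print(np.float32(10001.0) ** 2)  # 100020000.0; the exact square 100020001
                                 # is not representable in float32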

Member Author (@rth, Oct 6, 2018):
I agree, sample statistics is not the solution here. This was merely an attempt at prototyping. Theoretically though since we already have the norms, we could use comparison between the norms as a proxy to determine potentially problematic points at the cost of O(n_samples_a*n_samples_b) as compared to the overall cost O(n_samples_a*n_samples_b*n_features) for the full calculation. Or less if we use some tree structure. Then recompute those points exactly, though it would have its own limitations...
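
As a rough sketch only (the helper name and the flagging rule below are illustrative assumptions, not part of this PR), the proxy idea could look like the following: flag pairs whose norms are nearly equal relative to their magnitude, then recompute only those pairs exactly.

import numpy as np


def _suspicious_pairs(XX, YY, threshold):
    # Hypothetical helper, not in this PR. XX and YY are the squared norms
    # euclidean_distances already has, with shapes (n_samples_a, 1) and
    # (1, n_samples_b).
    X_norms = np.sqrt(XX).ravel()
    Y_norms = np.sqrt(YY).ravel()
    diff = np.abs(X_norms[:, None] - Y_norms[None, :])
    scale = np.maximum(X_norms[:, None], Y_norms[None, :])
    # A pair (i, j) can suffer cancellation only if ||x_i|| is close to
    # ||y_j||, so this O(n_samples_a * n_samples_b) mask is a conservative
    # over-approximation (points with equal norms may still be far apart),
    # avoiding the n_features factor of a full exact recomputation.
    return (diff < threshold * scale) & (scale > 0)

Pairs flagged by such a mask could then be recomputed with the exact formula at a cost proportional to the number of flagged pairs, with the over-flagging being one of the limitations alluded to above.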

"more precise implementation.")

warnings.warn(warning_message, NumericalPrecisionWarning)

distances = safe_sparse_dot(X, Y.T, dense_output=True)
distances *= -2
distances += XX
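
For intuition, here is a standalone re-creation of the shell-ratio check above, using plain np.nanpercentile instead of the nanpercentile backport imported from sklearn.utils.fixes, on data similar to the tests below.

import numpy as np


def shell_ratio(X, Y):
    # Thickness-to-diameter ratio of the spherical shell containing the
    # 5th-95th percentile of sample norms, mirroring the heuristic above.
    XX = (X ** 2).sum(axis=1)
    YY = (Y ** 2).sum(axis=1)
    lo = min(np.nanpercentile(XX, 5), np.nanpercentile(YY, 5)) ** 0.5
    hi = max(np.nanpercentile(XX, 95), np.nanpercentile(YY, 95)) ** 0.5
    return (hi - lo) / hi


X = np.random.RandomState(42).randn(100, 10)
Y = np.random.RandomState(41).randn(200, 10)

print(shell_ratio(X, Y))              # ~0.5, far above the 1e-7 float64 threshold
print(shell_ratio(X + 1e8, Y + 1e8))  # ~1e-8, below it, so the warning would fire
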
59 changes: 57 additions & 2 deletions sklearn/metrics/tests/test_pairwise.py
@@ -49,8 +49,7 @@
from sklearn.metrics.pairwise import paired_manhattan_distances
from sklearn.preprocessing import normalize
from sklearn.exceptions import DataConversionWarning

import pytest
from sklearn.exceptions import NumericalPrecisionWarning


def test_pairwise_distances():
@@ -874,3 +873,59 @@ def test_check_preserve_type():
                                                   XB.astype(np.float))
    assert_equal(XA_checked.dtype, np.float)
    assert_equal(XB_checked.dtype, np.float)


@pytest.mark.parametrize('dtype', ('float32', 'float64'))
def test_euclidean_distance_precision(dtype):
"""Checks for the most problematic cases when computing the
the euclidean distance with dot(x,x) - 2 dot(x,y) + dot(y,y)"""
if dtype == 'float32':
offset = 10000
else:
offset = 1e8

XA = np.array([[offset]], dtype)
XB = np.array([[offset + 1]], dtype)

with pytest.warns(NumericalPrecisionWarning):
with pytest.raises(AssertionError):
assert euclidean_distances(XA, XB)[0, 0] == 1.

with pytest.warns(NumericalPrecisionWarning):
with pytest.raises(AssertionError):
assert pairwise_distances(XA, XB)[0, 0] == 1.

# Note: this needs https://github.com/scikit-learn/scikit-learn/pull/12136
# to be merged first.
#
# with pytest.warns(None) as record:
# assert euclidean_distances(XA, XB, algorithm='exact')[0, 0] == 1

# with sklearn.config_context(euclidean_distances_algorithm='exact'):
# assert euclidean_distances(XA, XB)[0, 0] == 1
# assert pairwise_distances(XA, XB)[0, 0] == 1.
# assert len(record) == 0


@pytest.mark.parametrize('dtype', ('float32', 'float64'))
def test_euclidean_distance_distribution_precision(dtype):
"""Computing euclidean_distances with the quadratic expansion
on data strongly shifted off the origin leads to numerical precision
issues"""
XA = np.random.RandomState(42).randn(100, 10).astype(dtype)
XB = np.random.RandomState(41).randn(200, 10).astype(dtype)

if dtype == 'float32':
offset = 10000
else:
offset = 1e8

with pytest.warns(None) as record:
euclidean_distances(XA, XB)
assert len(record) == 0

XA += offset
XB += offset

with pytest.warns(NumericalPrecisionWarning):
euclidean_distances(XA, XB)