Thanks to visit codestin.com
Credit goes to github.com

Skip to content

[MRG] Allow exact Euclidean distance calculations #12136

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions doc/whats_new/v0.21.rst
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,16 @@ Support for Python 3.4 and below has been officially dropped.
:pr:`12344` by :user:`Adrin Jalali <adrinjalali>`.


:mod:`sklearn.metrics`
......................

- |Feature| Added ``algorithm`` parameter to :func:`metrics.euclidean_distances`
to optionally compute Euclidean distances without the quadratic expansion
formula. The exact computation is slower but numerically more precise,
particularly in 32 bit. Also added a global ``euclidean_distance_algorithm``
config parameter with the same effect. :pr:`12136` by `Roman Yurchak`_.


Multiple modules
................

Expand Down
26 changes: 24 additions & 2 deletions sklearn/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

_global_config = {
'assume_finite': bool(os.environ.get('SKLEARN_ASSUME_FINITE', False)),
'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024))
'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024)),
'euclidean_distance_algorithm': 'quadratic-expansion'
}


Expand All @@ -20,7 +21,8 @@ def get_config():
return _global_config.copy()


def set_config(assume_finite=None, working_memory=None):
def set_config(assume_finite=None, working_memory=None,
euclidean_distance_algorithm=None):
"""Set global scikit-learn configuration

.. versionadded:: 0.19
Expand All @@ -43,11 +45,22 @@ def set_config(assume_finite=None, working_memory=None):

.. versionadded:: 0.20

euclidean_distance_algorithm : {str, None}
Method of computing the euclidean distances: "exact" uses
``scipy.spatial.distance.cdist`` while "quadratic-expansion" uses
a faster but less precise quadratic expansion. For sparse data, only
"quadratic-expansion" is supported.
Global default: "quadratic-expansion"

.. versionadded:: 0.21
"""
if assume_finite is not None:
_global_config['assume_finite'] = assume_finite
if working_memory is not None:
_global_config['working_memory'] = working_memory
if euclidean_distance_algorithm is not None:
_global_config['euclidean_distance_algorithm'] = (
euclidean_distance_algorithm)


@contextmanager
Expand All @@ -68,6 +81,15 @@ def config_context(**new_config):
computation time and memory on expensive operations that can be
performed in chunks. Global default: 1024.

euclidean_distance_algorithm : {str, None}
Method of computing the euclidean distances: "exact" uses
``scipy.spatial.distance.cdist`` while "quadratic-expansion" uses
a faster but less precise quadratic expansion. For sparse data, only
"quadratic-expansion" is supported.
Global default: "quadratic-expansion"

.. versionadded:: 0.21

Notes
-----
All settings, not just those presently modified, will be returned to
Expand Down
40 changes: 37 additions & 3 deletions sklearn/metrics/pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@
from ..utils import gen_batches, get_chunk_n_rows
from ..utils.extmath import row_norms, safe_sparse_dot
from ..preprocessing import normalize
from ..utils import get_config
from ..utils._joblib import Parallel
from ..utils._joblib import delayed
from ..utils._joblib import effective_n_jobs


from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan


Expand Down Expand Up @@ -163,13 +165,13 @@ def check_paired_arrays(X, Y):

# Pairwise distances
def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
X_norm_squared=None):
X_norm_squared=None, algorithm=None):
"""
Considering the rows of X (and Y=X) as vectors, compute the
distance matrix between each pair of vectors.

For efficiency reasons, the euclidean distance between a pair of row
vector x and y is computed as::
For efficiency reasons, by default, the euclidean distance between a
pair of row vector x and y is computed as::

dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y))

Expand All @@ -181,6 +183,12 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
However, this is not the most precise way of doing this computation, and
the distance matrix returned by this function may not be exactly
symmetric as required by, e.g., ``scipy.spatial.distance`` functions.
To use a slower but exact approach for dense data, either provide
`algorithm="exact"` or set the global ``euclidean_distance_algorithm``
parameter::

with sklearn.config_context(euclidean_distance_algorithm='exact'):
knn = KNeighborsClassifier(algorithm='brute', metric='euclidean')

Read more in the :ref:`User Guide <metrics>`.

Expand All @@ -201,6 +209,18 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
Pre-computed dot-products of vectors in X (e.g.,
``(X**2).sum(axis=1)``)

algorithm : {str, None}, default: None
Method of computing the euclidean distances: "exact" uses
``scipy.spatial.distance.cdist`` while "quadratic-expansion" uses
a faster but less precise quadratic expansion. For sparse data, only
"quadratic-expansion" is supported.

When None (default), the value of
``sklearn.get_config()['euclidean_distance_algorithm']`` is used
(default: "quadratic-expansion").

.. versionadded:: 0.21

Returns
-------
distances : {array, sparse matrix}, shape (n_samples_1, n_samples_2)
Expand All @@ -224,6 +244,20 @@ def euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False,
"""
X, Y = check_pairwise_arrays(X, Y)

if algorithm is None:
algorithm = get_config()['euclidean_distance_algorithm']

if algorithm not in ['exact', 'quadratic-expansion']:
raise ValueError('algorithm=%s invalid, must be one of '
'"exact", "quadratic-expansion"' % algorithm)

if algorithm == 'exact':
if issparse(X) or issparse(Y):
raise ValueError("algorithm='exact' does not support sparse data")

metric = 'sqeuclidean' if squared else 'euclidean'
return distance.cdist(X, Y, metric)

if X_norm_squared is not None:
XX = check_array(X_norm_squared)
if XX.shape == (1, X.shape[0]):
Expand Down
27 changes: 27 additions & 0 deletions sklearn/metrics/tests/test_pairwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@

from scipy.sparse import dok_matrix, csr_matrix, issparse
from scipy.spatial.distance import cosine, cityblock, minkowski, wminkowski
from scipy.spatial.distance import cdist

import pytest

import sklearn
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_allclose
Expand Down Expand Up @@ -881,3 +883,28 @@ def test_check_preserve_type():
XB.astype(np.float))
assert_equal(XA_checked.dtype, np.float)
assert_equal(XB_checked.dtype, np.float)


@pytest.mark.parametrize('dtype', ('float32', 'float64'))
def test_euclidean_distance_algorithm(dtype):
    """Exercise the ``algorithm`` option of ``euclidean_distances``.

    Covers the explicit "exact" and "quadratic-expansion" values, the
    ``squared`` flag combined with "exact", selection of the algorithm
    through ``sklearn.config_context``, and the error raised when "exact"
    is requested for sparse input.
    """
    left = np.random.RandomState(42).rand(100, 10).astype(dtype)
    right = np.random.RandomState(41).rand(200, 10).astype(dtype)

    # The exact path is compared against a direct call to scipy's cdist.
    exact = euclidean_distances(left, right, algorithm='exact')
    assert_allclose(exact, cdist(left, right, 'euclidean'))

    # squared=True with the exact path must match the squared distances.
    exact_sq = euclidean_distances(left, right, squared=True,
                                   algorithm='exact')
    assert_allclose(exact_sq, exact ** 2)

    # The quadratic expansion only has to agree up to a looser tolerance.
    approx = euclidean_distances(left, right,
                                 algorithm='quadratic-expansion')
    assert_allclose(exact, approx, rtol=1e-5)

    # With no explicit argument, the global configuration selects the
    # algorithm.
    with sklearn.config_context(euclidean_distance_algorithm='exact'):
        from_config = euclidean_distances(left, right)
    assert_allclose(exact, from_config)

    # Sparse input combined with the exact algorithm is rejected.
    sparse_msg = "algorithm='exact' does not support sparse data"
    with pytest.raises(ValueError, match=sparse_msg):
        euclidean_distances(csr_matrix(left), csr_matrix(right),
                            algorithm='exact')
12 changes: 9 additions & 3 deletions sklearn/tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,18 @@


def test_config_context():
assert get_config() == {'assume_finite': False, 'working_memory': 1024}
assert get_config() == {
'assume_finite': False, 'working_memory': 1024,
'euclidean_distance_algorithm': 'quadratic-expansion'}

# Not using as a context manager affects nothing
config_context(assume_finite=True)
assert get_config()['assume_finite'] is False

with config_context(assume_finite=True):
assert get_config() == {'assume_finite': True, 'working_memory': 1024}
assert get_config() == {
'assume_finite': True, 'working_memory': 1024,
'euclidean_distance_algorithm': 'quadratic-expansion'}
assert get_config()['assume_finite'] is False

with config_context(assume_finite=True):
Expand All @@ -34,7 +38,9 @@ def test_config_context():

assert get_config()['assume_finite'] is True

assert get_config() == {'assume_finite': False, 'working_memory': 1024}
assert get_config() == {
'assume_finite': False, 'working_memory': 1024,
'euclidean_distance_algorithm': 'quadratic-expansion'}

# No positional arguments
assert_raises(TypeError, config_context, True)
Expand Down