Thanks to visit codestin.com
Credit goes to github.com

Skip to content

MAINT Introduce Pairwise Distances Reductions private submodule #22064

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
de166e0
MAINT Introduce Pairwise Distances Reductions private submodule
jjerphan Dec 23, 2021
14106c4
Retrigger CI for failing Circle CI job
jjerphan Dec 23, 2021
3cdd3a5
TST Improve _get_dummy_metric_params_list
jjerphan Dec 23, 2021
31db785
Address review comments
jjerphan Dec 23, 2021
b60e897
Address review comments
jjerphan Dec 23, 2021
5d7ea09
DEBUG TST Try removing handling of unstable OpenBLAS configuration
jjerphan Dec 23, 2021
fb927e7
TST Remove useless mahalanobis case
jjerphan Dec 23, 2021
a2f7b6d
Factor the logic for computing last chunks indices
jjerphan Jan 3, 2022
e9acef7
Improve comments regarding strategies and parallel sections
jjerphan Jan 3, 2022
51dad2b
Address reviews' comments
jjerphan Jan 4, 2022
59b153c
Remove unused _sqeuclidean_row_norms
jjerphan Jan 4, 2022
395f92a
Swap argkmin_indices and argkmin_distances
jjerphan Jan 4, 2022
09a9527
Move initializations from __init__ to __cinit__
jjerphan Jan 4, 2022
f396a58
Improve docstring comment
jjerphan Jan 5, 2022
22f4f30
Improve comments
jjerphan Jan 5, 2022
234fb01
Add 'pairwise_dist_chunk_size' to scikit-learn config
jjerphan Jan 5, 2022
f89c65d
TST Adapt test for PairwiseDistancesArgKmin translation invariance
jjerphan Jan 5, 2022
fe17af1
test_pairwise_distances_argkmin
ogrisel Jan 5, 2022
38715d2
Simpler variable names
ogrisel Jan 5, 2022
a461355
Merge pull request #7 from ogrisel/test_pairwise_distances_argkmin
jjerphan Jan 5, 2022
ce986d5
fixup! TST Adapt test for PairwiseDistancesArgKmin translation invari…
jjerphan Jan 5, 2022
70a28b7
fixup! fixup! TST Adapt test for PairwiseDistancesArgKmin translation…
jjerphan Jan 5, 2022
06ca869
Use correct orthograph for 'Callback'
jjerphan Jan 5, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 28 additions & 2 deletions sklearn/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
"working_memory": int(os.environ.get("SKLEARN_WORKING_MEMORY", 1024)),
"print_changed_only": True,
"display": "text",
"pairwise_dist_chunk_size": int(
os.environ.get("SKLEARN_PAIRWISE_DIST_CHUNK_SIZE", 256)
),
}
_threadlocal = threading.local()

Expand Down Expand Up @@ -40,7 +43,11 @@ def get_config():


def set_config(
assume_finite=None, working_memory=None, print_changed_only=None, display=None
assume_finite=None,
working_memory=None,
print_changed_only=None,
display=None,
pairwise_dist_chunk_size=None,
):
"""Set global scikit-learn configuration

Expand Down Expand Up @@ -80,6 +87,12 @@ def set_config(

.. versionadded:: 0.23

pairwise_dist_chunk_size : int, default=None
The number of vectors per chunk for PairwiseDistancesReduction.
Default is 256 (suitable for the caches and architectures of most modern laptops).

.. versionadded:: 1.1

See Also
--------
config_context : Context manager for global scikit-learn configuration.
Expand All @@ -95,11 +108,18 @@ def set_config(
local_config["print_changed_only"] = print_changed_only
if display is not None:
local_config["display"] = display
if pairwise_dist_chunk_size is not None:
local_config["pairwise_dist_chunk_size"] = pairwise_dist_chunk_size


@contextmanager
def config_context(
*, assume_finite=None, working_memory=None, print_changed_only=None, display=None
*,
assume_finite=None,
working_memory=None,
print_changed_only=None,
display=None,
pairwise_dist_chunk_size=None,
):
"""Context manager for global scikit-learn configuration.

Expand Down Expand Up @@ -138,6 +158,12 @@ def config_context(

.. versionadded:: 0.23

pairwise_dist_chunk_size : int, default=None
The number of vectors per chunk for PairwiseDistancesReduction.
Default is 256 (suitable for the caches and architectures of most modern laptops).

.. versionadded:: 1.1

Yields
------
None.
Expand Down
21 changes: 21 additions & 0 deletions sklearn/metrics/_dist_metrics.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,24 @@ cdef class DistanceMetric:
cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1

cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1


######################################################################
# DatasetsPair base class
cdef class DatasetsPair:
    # Declaration of the base extension type wrapping a pair of datasets
    # (X, Y) and exposing distance computations between their rows.

    # Metric object associated with this pair of datasets.
    cdef DistanceMetric distance_metric

    # Number of samples (rows) in X.
    cdef ITYPE_t n_samples_X(self) nogil

    # Number of samples (rows) in Y.
    cdef ITYPE_t n_samples_Y(self) nogil

    # Distance between the i-th row of X and the j-th row of Y.
    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil

    # Surrogate distance between the i-th row of X and the j-th row of Y.
    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil


cdef class DenseDenseDatasetsPair(DatasetsPair):
    # Declaration of the DatasetsPair specialization holding two dense,
    # C-contiguous 2D buffers.
    cdef:
        # Read-only, C-contiguous 2D views on the two datasets.
        const DTYPE_t[:, ::1] X
        const DTYPE_t[:, ::1] Y
        # Length passed to the metric for each row
        # (presumably the number of features -- confirm against the .pyx).
        ITYPE_t d
194 changes: 185 additions & 9 deletions sklearn/metrics/_dist_metrics.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import numpy as np
cimport numpy as np
from cython cimport final

np.import_array() # required in order to use C-API


Expand All @@ -23,10 +25,10 @@ cdef inline np.ndarray _buffer_to_ndarray(const DTYPE_t* x, np.npy_intp n):
return PyArray_SimpleNewFromData(1, &n, DTYPECODE, <void*>x)


# some handy constants
from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin
cdef DTYPE_t INF = np.inf

from scipy.sparse import csr_matrix, issparse
from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE
from ..utils._typedefs import DTYPE, ITYPE
from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper
Expand Down Expand Up @@ -67,6 +69,16 @@ METRIC_MAPPING = {'euclidean': EuclideanDistance,
'haversine': HaversineDistance,
'pyfunc': PyFuncDistance}

# String identifiers of the metrics grouped under the "boolean" label
# (presumably the metrics meant for boolean-valued vectors -- confirm
# against the users of this list).
BOOL_METRICS = [
    "matching",
    "jaccard",
    "dice",
    "kulsinski",
    "rogerstanimoto",
    "russellrao",
    "sokalmichener",
    "sokalsneath",
]

def get_valid_metric_ids(L):
"""Given an iterable of metric class names or class identifiers,
Expand Down Expand Up @@ -195,8 +207,8 @@ cdef class DistanceMetric:
"""
def __cinit__(self):
self.p = 2
self.vec = np.zeros(1, dtype=DTYPE, order='c')
self.mat = np.zeros((1, 1), dtype=DTYPE, order='c')
self.vec = np.zeros(1, dtype=DTYPE, order='C')
self.mat = np.zeros((1, 1), dtype=DTYPE, order='C')
self.size = 1

def __reduce__(self):
Expand Down Expand Up @@ -306,8 +318,9 @@ cdef class DistanceMetric:
This can optionally be overridden in a subclass.

The rank-preserving surrogate distance is any measure that yields the same
rank as the distance, but is more efficient to compute. For example, for the
Euclidean metric, the surrogate distance is the squared-euclidean distance.
rank as the distance, but is more efficient to compute. For example, the
rank-preserving surrogate distance of the Euclidean metric is the
squared-euclidean distance.
"""
return self.dist(x1, x2, size)

Expand Down Expand Up @@ -343,8 +356,9 @@ cdef class DistanceMetric:
"""Convert the rank-preserving surrogate distance to the distance.

The surrogate distance is any measure that yields the same rank as the
distance, but is more efficient to compute. For example, for the
Euclidean metric, the surrogate distance is the squared-euclidean distance.
distance, but is more efficient to compute. For example, the
rank-preserving surrogate distance of the Euclidean metric is the
squared-euclidean distance.

Parameters
----------
Expand All @@ -362,8 +376,9 @@ cdef class DistanceMetric:
"""Convert the true distance to the rank-preserving surrogate distance.

The surrogate distance is any measure that yields the same rank as the
distance, but is more efficient to compute. For example, for the
Euclidean metric, the surrogate distance is the squared-euclidean distance.
distance, but is more efficient to compute. For example, the
rank-preserving surrogate distance of the Euclidean metric is the
squared-euclidean distance.

Parameters
----------
Expand Down Expand Up @@ -1150,3 +1165,164 @@ cdef class PyFuncDistance(DistanceMetric):

cdef inline double fmax(double a, double b) nogil:
    # Return the larger of the two values; `a` is kept unless `b` compares
    # strictly greater, which mirrors the evaluation of `max(a, b)`.
    return b if b > a else a


######################################################################
# Datasets Pair Classes
cdef class DatasetsPair:
    """Abstract class which wraps a pair of datasets (X, Y).

    This class allows computing distances between a single pair of rows
    of X and Y at a time given the pair of their indices (i, j). This class is
    specialized for each metric thanks to the :meth:`get_for` factory
    classmethod.

    The handling of parallelization over chunks to compute the distances
    and aggregation for several rows at a time is done in dedicated
    subclasses of PairwiseDistancesReduction that in-turn rely on
    subclasses of DatasetsPair for each pair of rows in the data. The goal
    is to make it possible to decouple the generic parallelization and
    aggregation logic from metric-specific computation as much as
    possible.

    X and Y can be stored as C-contiguous np.ndarrays or CSR matrices
    in subclasses.

    This class avoids the overhead of dispatching distance computations
    to :class:`sklearn.metrics.DistanceMetric` based on the physical
    representation of the vectors (sparse vs. dense). It makes use of
    cython.final to remove the overhead of dispatching method calls.

    Parameters
    ----------
    distance_metric: DistanceMetric
        The distance metric responsible for computing distances
        between two vectors of (X, Y).
    """

    @classmethod
    def get_for(
        cls,
        X,
        Y,
        str metric="euclidean",
        dict metric_kwargs=None,
    ) -> DatasetsPair:
        """Return the DatasetsPair implementation for the given arguments.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples_X, n_features)
            Input data.
            If provided as a ndarray, it must be C-contiguous.
            If provided as a sparse matrix, it must be in CSR format.

        Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features)
            Input data.
            If provided as a ndarray, it must be C-contiguous.
            If provided as a sparse matrix, it must be in CSR format.

        metric : str, default='euclidean'
            The distance metric to compute between rows of X and Y.
            The default metric is a fast implementation of the Euclidean
            metric. For a list of available metrics, see the documentation
            of :class:`~sklearn.metrics.DistanceMetric`.

        metric_kwargs : dict, default=None
            Keyword arguments to pass to specified metric function.

        Returns
        -------
        datasets_pair: DatasetsPair
            The suited DatasetsPair implementation.

        Raises
        ------
        ValueError
            If X or Y is not a 64bit float dataset, if X or Y is sparse,
            or if the rows of X and Y do not have the same dimension.
        """
        cdef:
            DistanceMetric distance_metric = DistanceMetric.get_metric(
                metric,
                **(metric_kwargs or {})
            )

        if not(X.dtype == Y.dtype == np.float64):
            raise ValueError(
                f"Only 64bit float datasets are supported at this time, "
                f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}"
            )

        # Metric-specific checks that do not replace nor duplicate `check_array`.
        distance_metric._validate_data(X)
        distance_metric._validate_data(Y)

        # TODO: dispatch to other dataset pairs for sparse support once available:
        if issparse(X) or issparse(Y):
            raise ValueError("Only dense datasets are supported for X and Y.")

        # The dense implementation takes the vector length from X only and
        # reads that many values from the rows of Y without bounds checks:
        # a mismatch would read out of bounds, so reject it explicitly.
        # Inputs that are not 2D are left to the buffer conversion done by
        # the returned implementation.
        if X.ndim == 2 and Y.ndim == 2 and X.shape[1] != Y.shape[1]:
            raise ValueError(
                "Vectors of X and Y must have the same dimension but currently are "
                f"respectively {X.shape[1]}-dimensional and {Y.shape[1]}-dimensional."
            )

        return DenseDenseDatasetsPair(X, Y, distance_metric)

    def __init__(self, DistanceMetric distance_metric):
        self.distance_metric = distance_metric

    cdef ITYPE_t n_samples_X(self) nogil:
        """Number of samples in X."""
        # This is an abstract method.
        # This _must_ always be overwritten in subclasses.
        # TODO: add "with gil: raise" here when supporting Cython 3.0
        return -999

    cdef ITYPE_t n_samples_Y(self) nogil:
        """Number of samples in Y."""
        # This is an abstract method.
        # This _must_ always be overwritten in subclasses.
        # TODO: add "with gil: raise" here when supporting Cython 3.0
        return -999

    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
        """Surrogate distance between X[i] and Y[j]; defaults to `dist`."""
        return self.dist(i, j)

    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
        """Distance between X[i] and Y[j]."""
        # This is an abstract method.
        # This _must_ always be overwritten in subclasses.
        # TODO: add "with gil: raise" here when supporting Cython 3.0
        return -1

@final
cdef class DenseDenseDatasetsPair(DatasetsPair):
    """Pair of dense arrays whose rows are compared with a DistanceMetric.

    Parameters
    ----------
    X: ndarray of shape (n_samples_X, n_features)
        Each row is a vector. Must be C-contiguous.

    Y: ndarray of shape (n_samples_Y, n_features)
        Each row is a vector. Must be C-contiguous.

    distance_metric: DistanceMetric
        Metric used to compute the distance between a row of X
        and a row of Y.
    """

    def __init__(self, X, Y, DistanceMetric distance_metric):
        super().__init__(distance_metric)
        # No validation happens here: the arrays are expected to have
        # been checked before reaching this constructor.
        self.X = X
        self.Y = Y
        # Length handed to the metric for every pair of rows.
        self.d = X.shape[1]

    @final
    cdef ITYPE_t n_samples_X(self) nogil:
        """Number of rows of X."""
        return self.X.shape[0]

    @final
    cdef ITYPE_t n_samples_Y(self) nogil:
        """Number of rows of Y."""
        return self.Y.shape[0]

    @final
    cdef DTYPE_t surrogate_dist(self, ITYPE_t i, ITYPE_t j) nogil:
        """Value of the metric's `rdist` for the rows X[i] and Y[j]."""
        return self.distance_metric.rdist(&self.X[i, 0], &self.Y[j, 0], self.d)

    @final
    cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil:
        """Value of the metric's `dist` for the rows X[i] and Y[j]."""
        return self.distance_metric.dist(&self.X[i, 0], &self.Y[j, 0], self.d)
Loading