diff --git a/.circleci/config.yml b/.circleci/config.yml index 2e8958a2ab879..ce7a170103502 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -116,6 +116,8 @@ jobs: environment: - OMP_NUM_THREADS: 2 - OPENBLAS_NUM_THREADS: 2 + - NUMPY_VERSION: 'latest' + - SCIPY_VERSION: 'latest' - CYTHON_VERSION: 'latest' - JOBLIB_VERSION: 'latest' - THREADPOOLCTL_VERSION: 'latest' diff --git a/build_tools/circle/build_test_arm.sh b/build_tools/circle/build_test_arm.sh index 3d555f66227c4..9ad7418e855ca 100755 --- a/build_tools/circle/build_test_arm.sh +++ b/build_tools/circle/build_test_arm.sh @@ -21,39 +21,51 @@ source build_tools/shared.sh sudo add-apt-repository --remove ppa:ubuntu-toolchain-r/test sudo apt-get update -sudo apt-get install python3-virtualenv ccache -python3 -m virtualenv --system-site-packages --python=python3 testenv -source testenv/bin/activate -pip install --upgrade pip + +# Setup conda environment +MINICONDA_URL="https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-aarch64.sh" + +# Install Mambaforge +wget $MINICONDA_URL -O mambaforge.sh +MINICONDA_PATH=$HOME/miniconda +chmod +x mambaforge.sh && ./mambaforge.sh -b -p $MINICONDA_PATH +export PATH=$MINICONDA_PATH/bin:$PATH +mamba update --yes conda + +# Create environment and install dependencies +mamba create -n testenv --yes python=3.7 +source activate testenv + +# Use the latest by default +mamba install --verbose -y ccache \ + pip \ + $(get_dep numpy $NUMPY_VERSION) \ + $(get_dep scipy $SCIPY_VERSION) \ + $(get_dep cython $CYTHON_VERSION) \ + $(get_dep joblib $JOBLIB_VERSION) \ + $(get_dep threadpoolctl $THREADPOOLCTL_VERSION) \ + $(get_dep pytest $PYTEST_VERSION) \ + $(get_dep pytest-xdist $PYTEST_XDIST_VERSION) setup_ccache -python -m pip install $(get_dep cython $CYTHON_VERSION) \ - $(get_dep joblib $JOBLIB_VERSION) -python -m pip install $(get_dep threadpoolctl $THREADPOOLCTL_VERSION) \ - $(get_dep pytest $PYTEST_VERSION) \ - $(get_dep pytest-xdist $PYTEST_XDIST_VERSION) if [[ "$COVERAGE" == "true" ]]; then - python -m pip install codecov pytest-cov -fi - -if [[ "$PYTEST_XDIST_VERSION" != "none" ]]; then - python -m pip install pytest-xdist + mamba install --verbose -y codecov pytest-cov fi if [[ "$TEST_DOCSTRINGS" == "true" ]]; then # numpydoc requires sphinx - python -m pip install sphinx - python -m pip install numpydoc + mamba install --verbose -y sphinx + mamba install --verbose -y numpydoc fi python --version +conda list # Set parallelism to 3 to overlap IO bound tasks with CPU bound tasks on CI # workers with 2 cores when building the compiled extensions of scikit-learn. export SKLEARN_BUILD_PARALLEL=3 -python -m pip list -pip install --verbose --editable . +pip install --verbose --editable . --no-build-isolation ccache -s python -c "import sklearn; sklearn.show_versions()" python -m threadpoolctl --import sklearn diff --git a/doc/glossary.rst b/doc/glossary.rst index 010f16a361531..2b4c6af0d1866 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -644,9 +644,8 @@ General Concepts Note that for most distance metrics, we rely on implementations from :mod:`scipy.spatial.distance`, but may reimplement for efficiency in - our context. The :mod:`neighbors` module also duplicates some metric - implementations for integration with efficient binary tree search data - structures. + our context. The :class:`metrics.DistanceMetric` interface is used to implement + distance metrics for integration with efficient neighbors search. 
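For reviewers tracking the relocation, here is an illustrative snippet (not part of this patch) showing the interface at its new import path; it mirrors the doctest updated in sklearn/metrics/_dist_metrics.pyx later in this diff, where the import previously read `from sklearn.neighbors import DistanceMetric`:

    from sklearn.metrics import DistanceMetric  # was: from sklearn.neighbors import DistanceMetric

    # Build a metric object and compute pairwise distances between rows.
    dist = DistanceMetric.get_metric("euclidean")
    X = [[0, 1, 2], [3, 4, 5]]
    print(dist.pairwise(X))  # 2x2 symmetric matrix; off-diagonal entries equal sqrt(27) ≈ 5.196
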
pd A shorthand for `Pandas `_ due to the @@ -1023,7 +1022,7 @@ such as: Further examples: -* :class:`neighbors.DistanceMetric` +* :class:`metrics.DistanceMetric` * :class:`gaussian_process.kernels.Kernel` * ``tree.Criterion`` diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 72b67b23e8dc3..b7000bcf7cbb2 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1058,6 +1058,16 @@ further details. metrics.consensus_score +Distance metrics +---------------- + +.. currentmodule:: sklearn + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + metrics.DistanceMetric Pairwise metrics ---------------- @@ -1317,7 +1327,6 @@ Model validation :template: class.rst neighbors.BallTree - neighbors.DistanceMetric neighbors.KDTree neighbors.KernelDensity neighbors.KNeighborsClassifier diff --git a/doc/modules/density.rst b/doc/modules/density.rst index 115d318183577..6440bf79ab729 100644 --- a/doc/modules/density.rst +++ b/doc/modules/density.rst @@ -136,7 +136,7 @@ The form of these kernels is as follows: :math:`K(x; h) \propto \cos(\frac{\pi x}{2h})` if :math:`x < h` The kernel density estimator can be used with any of the valid distance -metrics (see :class:`~sklearn.neighbors.DistanceMetric` for a list of available metrics), though +metrics (see :class:`~sklearn.metrics.DistanceMetric` for a list of available metrics), though the results are properly normalized only for the Euclidean metric. One particularly useful metric is the `Haversine distance `_ diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index c400f5ba57685..91322dba632d6 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -523,7 +523,9 @@ def predict(self, X): if self.cluster_centers_.shape[0] > 0: with config_context(assume_finite=True): - return pairwise_distances_argmin(X, self.cluster_centers_) + return pairwise_distances_argmin( + X, self.cluster_centers_, metric="fast_euclidean" + ) else: warnings.warn( "This model does not have any cluster centers " diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 6606f370b81eb..70b3a5028169b 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -16,8 +16,8 @@ from ..base import BaseEstimator, ClusterMixin from ..metrics.pairwise import paired_distances -from ..neighbors import DistanceMetric -from ..neighbors._dist_metrics import METRIC_MAPPING +from ..metrics import DistanceMetric +from ..metrics._dist_metrics import METRIC_MAPPING from ..utils import check_array from ..utils._fast_dict import IntFloatDict from ..utils.fixes import _astype_copy_false diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 78c15bb8e1a15..d2dd3f937a27d 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -12,7 +12,6 @@ from ..metrics import pairwise_distances_argmin from ..metrics.pairwise import euclidean_distances from ..base import TransformerMixin, ClusterMixin, BaseEstimator -from ..utils.extmath import row_norms from ..utils import deprecated from ..utils.validation import check_is_fitted from ..exceptions import ConvergenceWarning @@ -654,11 +653,10 @@ def predict(self, X): """ check_is_fitted(self) X = self._validate_data(X, accept_sparse="csr", reset=False) - kwargs = {"Y_norm_squared": self._subcluster_norms} with config_context(assume_finite=True): argmin = pairwise_distances_argmin( - X, self.subcluster_centers_, metric_kwargs=kwargs + X, 
self.subcluster_centers_, metric="fast_euclidean" ) return self.subcluster_labels_[argmin] @@ -704,9 +702,6 @@ def _global_clustering(self, X=None): "n_clusters should be an instance of ClusterMixin or an int" ) - # To use in predict to avoid recalculation. - self._subcluster_norms = row_norms(self.subcluster_centers_, squared=True) - if clusterer is None or not_enough_centroids: self.subcluster_labels_ = np.arange(len(centroids)) if not_enough_centroids: diff --git a/sklearn/cluster/_hierarchical_fast.pyx b/sklearn/cluster/_hierarchical_fast.pyx index 2a58757ce327d..11ea3294c086a 100644 --- a/sklearn/cluster/_hierarchical_fast.pyx +++ b/sklearn/cluster/_hierarchical_fast.pyx @@ -13,7 +13,7 @@ ctypedef np.int8_t INT8 np.import_array() -from ..neighbors._dist_metrics cimport DistanceMetric +from ..metrics._dist_metrics cimport DistanceMetric from ..utils._fast_dict cimport IntFloatDict # C++ @@ -236,8 +236,8 @@ def max_merge(IntFloatDict a, IntFloatDict b, def average_merge(IntFloatDict a, IntFloatDict b, np.ndarray[ITYPE_t, ndim=1] mask, ITYPE_t n_a, ITYPE_t n_b): - """Merge two IntFloatDicts with the average strategy: when the - same key is present in the two dicts, the weighted average of the two + """Merge two IntFloatDicts with the average strategy: when the + same key is present in the two dicts, the weighted average of the two values is used. Parameters @@ -290,13 +290,13 @@ def average_merge(IntFloatDict a, IntFloatDict b, ############################################################################### -# An edge object for fast comparisons +# An edge object for fast comparisons cdef class WeightedEdge: cdef public ITYPE_t a cdef public ITYPE_t b cdef public DTYPE_t weight - + def __init__(self, DTYPE_t weight, ITYPE_t a, ITYPE_t b): self.weight = weight self.a = a @@ -326,7 +326,7 @@ cdef class WeightedEdge: return self.weight > other.weight elif op == 5: return self.weight >= other.weight - + def __repr__(self): return "%s(weight=%f, a=%i, b=%i)" % (self.__class__.__name__, self.weight, @@ -475,7 +475,7 @@ def mst_linkage_core( dist_metric: DistanceMetric A DistanceMetric object conforming to the API from - ``sklearn.neighbors._dist_metrics.pxd`` that will be + ``sklearn.metrics._dist_metrics.pxd`` that will be used to compute distances. 
Returns @@ -534,4 +534,3 @@ def mst_linkage_core( current_node = new_node return np.array(result) - diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index cc3930891d880..542ed0dbc97aa 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -512,4 +512,6 @@ def predict(self, X): check_is_fitted(self) X = self._validate_data(X, reset=False) with config_context(assume_finite=True): - return pairwise_distances_argmin(X, self.cluster_centers_) + return pairwise_distances_argmin( + X, self.cluster_centers_, metric="fast_euclidean" + ) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 92f92dc3736e3..3525643383c26 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -17,7 +17,7 @@ from scipy.sparse.csgraph import connected_components from sklearn.metrics.cluster import adjusted_rand_score -from sklearn.neighbors.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS +from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS from sklearn.utils._testing import assert_almost_equal, create_memmap_backed_data from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import ignore_warnings @@ -31,6 +31,7 @@ _fix_connectivity, ) from sklearn.feature_extraction.image import grid_to_graph +from sklearn.metrics import DistanceMetric from sklearn.metrics.pairwise import ( PAIRED_DISTANCES, cosine_distances, @@ -38,7 +39,7 @@ pairwise_distances, ) from sklearn.metrics.cluster import normalized_mutual_info_score -from sklearn.neighbors import kneighbors_graph, DistanceMetric +from sklearn.neighbors import kneighbors_graph from sklearn.cluster._hierarchical_fast import ( average_merge, max_merge, diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 46958ea4ef7f8..e4339229c5b64 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -36,6 +36,8 @@ from ._classification import brier_score_loss from ._classification import multilabel_confusion_matrix +from ._dist_metrics import DistanceMetric + from . 
import cluster from .cluster import adjusted_mutual_info_score from .cluster import adjusted_rand_score @@ -115,6 +117,7 @@ "davies_bouldin_score", "DetCurveDisplay", "det_curve", + "DistanceMetric", "euclidean_distances", "explained_variance_score", "f1_score", diff --git a/sklearn/neighbors/_dist_metrics.pxd b/sklearn/metrics/_dist_metrics.pxd similarity index 64% rename from sklearn/neighbors/_dist_metrics.pxd rename to sklearn/metrics/_dist_metrics.pxd index 5b223f8c6d8a8..e87f442019a9d 100644 --- a/sklearn/neighbors/_dist_metrics.pxd +++ b/sklearn/metrics/_dist_metrics.pxd @@ -1,14 +1,13 @@ #!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: cdivision=True +# cython: boundscheck=False +# cython: cdivision=True +# cython: initializedcheck=False +# cython: wraparound=False -cimport cython cimport numpy as np -from libc.math cimport fabs, sqrt, exp, cos, pow +from libc.math cimport sqrt, exp -from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t -from ._typedefs import DTYPE, ITYPE +from ..utils._typedefs cimport DTYPE_t, ITYPE_t ###################################################################### # Inline distance functions @@ -60,9 +59,25 @@ cdef class DistanceMetric: cdef DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1 - cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, + cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1 + cdef DTYPE_t csr_dist( + self, + const DTYPE_t[:] x1_data, + const ITYPE_t[:] x1_indices, + const DTYPE_t[:] x2_data, + const ITYPE_t[:] x2_indices, + ) nogil except -1 + + cdef DTYPE_t csr_rdist( + self, + const DTYPE_t[:] x1_data, + const ITYPE_t[:] x1_indices, + const DTYPE_t[:] x2_data, + const ITYPE_t[:] x2_indices, + ) nogil except -1 + cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1 cdef int cdist(self, const DTYPE_t[:, ::1] X, const DTYPE_t[:, ::1] Y, @@ -71,3 +86,24 @@ cdef class DistanceMetric: cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1 cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1 + + +###################################################################### +# DatasetsPair base class +cdef class DatasetsPair: + cdef DistanceMetric distance_metric + + cdef ITYPE_t n_X(self) nogil + + cdef ITYPE_t n_Y(self) nogil + + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil + + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil + + +cdef class DenseDenseDatasetsPair(DatasetsPair): + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + ITYPE_t d diff --git a/sklearn/neighbors/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx similarity index 73% rename from sklearn/neighbors/_dist_metrics.pyx rename to sklearn/metrics/_dist_metrics.pyx index 240a7a3f7d14d..f75a3a2a75fcb 100755 --- a/sklearn/neighbors/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -1,8 +1,8 @@ -#!python -#cython: boundscheck=False -#cython: wraparound=False -#cython: initializedcheck=False -#cython: cdivision=True +# cython: boundscheck=False +# cython: cdivision=True +# cython: initializedcheck=False +# cython: wraparound=False + # By Jake Vanderplas (2013) # written for the scikit-learn project @@ -10,6 +10,8 @@ import numpy as np cimport numpy as np +from cython cimport final + np.import_array() # required in order to use C-API @@ -19,7 +21,7 @@ cdef extern from "arrayobject.h": int typenum, void* data) -cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): +cdef inline 
np.ndarray _buffer_to_ndarray(const DTYPE_t* x, np.npy_intp n): # Wrap a memory buffer with an ndarray. Warning: this is not robust. # In particular, if x is deallocated before the returned array goes # out of scope, this could cause memory errors. Since there is not @@ -29,13 +31,14 @@ cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): return PyArray_SimpleNewFromData(1, &n, DTYPECODE, x) -# some handy constants from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin cdef DTYPE_t INF = np.inf -from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE -from ._typedefs import DTYPE, ITYPE +from scipy.sparse import csr_matrix, issparse +from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE +from ..utils._typedefs import DTYPE, ITYPE +from ..utils import check_array ###################################################################### # newObj function @@ -73,6 +76,16 @@ METRIC_MAPPING = {'euclidean': EuclideanDistance, 'haversine': HaversineDistance, 'pyfunc': PyFuncDistance} +BOOL_METRICS = [ + "matching", + "jaccard", + "dice", + "kulsinski", + "rogerstanimoto", + "russellrao", + "sokalmichener", + "sokalsneath", +] def get_valid_metric_ids(L): """Given an iterable of metric class names or class identifiers, @@ -98,7 +111,7 @@ cdef class DistanceMetric: Examples -------- - >>> from sklearn.neighbors import DistanceMetric + >>> from sklearn.metrics import DistanceMetric >>> dist = DistanceMetric.get_metric('euclidean') >>> X = [[0, 1, 2], [3, 4, 5]] @@ -197,8 +210,8 @@ cdef class DistanceMetric: """ def __cinit__(self): self.p = 2 - self.vec = np.zeros(1, dtype=DTYPE, order='c') - self.mat = np.zeros((1, 1), dtype=DTYPE, order='c') + self.vec = np.zeros(1, dtype=DTYPE, order='C') + self.mat = np.zeros((1, 1), dtype=DTYPE, order='C') self.size = 1 def __reduce__(self): @@ -291,17 +304,50 @@ cdef class DistanceMetric: cdef DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: - """Compute the reduced distance between vectors x1 and x2. + """Compute the rank-preserving surrogate distance between vectors x1 and x2. This can optionally be overridden in a base class. - The reduced distance is any measure that yields the same rank as the - distance, but is more efficient to compute. For example, for the - Euclidean metric, the reduced distance is the squared-euclidean - distance. + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. """ return self.dist(x1, x2, size) + cdef DTYPE_t csr_dist( + self, + const DTYPE_t[:] x1_data, + const ITYPE_t[:] x1_indices, + const DTYPE_t[:] x2_data, + const ITYPE_t[:] x2_indices, + ) nogil except -1: + """Compute the rank-preserving surrogate distance between vectors x1 and x2 + given non null coordinates and their corresponding indices. + + This should be overridden in a base class. + """ + return -999 + + cdef DTYPE_t csr_rdist( + self, + const DTYPE_t[:] x1_data, + const ITYPE_t[:] x1_indices, + const DTYPE_t[:] x2_data, + const ITYPE_t[:] x2_indices, + ) nogil except -1: + """Compute the rank-preserving surrogate distance between vectors x1 and x2 + given non null coordinates and their corresponding indices. + + This can optionally be overridden in a base class. 
+ + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. + """ + return self.csr_dist(x1_data, x1_indices, x2_data, x2_indices) + cdef int pdist(self, const DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1: """compute the pairwise distances between points in X""" cdef ITYPE_t i1, i2 @@ -323,25 +369,25 @@ cdef class DistanceMetric: return 0 cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1: - """Convert the reduced distance to the distance""" + """Convert the rank-preserving surrogate distance to the distance""" return rdist cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: - """Convert the distance to the reduced distance""" + """Convert the distance to the rank-preserving surrogate distance""" return dist def rdist_to_dist(self, rdist): - """Convert the Reduced distance to the true distance. + """Convert the rank-preserving surrogate distance to the true distance. - The reduced distance, defined for some metrics, is a computationally - more efficient measure which preserves the rank of the true distance. - For example, in the Euclidean distance metric, the reduced distance - is the squared-euclidean distance. + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. Parameters ---------- rdist : double - Reduced distance. + Rank-preserving surrogate distance. Returns ------- @@ -351,12 +397,12 @@ cdef class DistanceMetric: return rdist def dist_to_rdist(self, dist): - """Convert the true distance to the reduced distance. + """Convert the true distance to the rank-preserving surrogate distance. - The reduced distance, defined for some metrics, is a computationally - more efficient measure which preserves the rank of the true distance. - For example, in the Euclidean distance metric, the reduced distance - is the squared-euclidean distance. + The rank-preserving surrogate distance is any measure that yields the same + rank as the distance, but is more efficient to compute. For example, for the + Euclidean metric, the rank-preserving surrogate distance is the + squared-euclidean distance. Parameters ---------- @@ -366,7 +412,7 @@ cdef class DistanceMetric: Returns ------- double - Reduced distance. + Rank-preserving surrogate distance. """ return dist @@ -519,7 +565,7 @@ cdef class ChebyshevDistance(DistanceMetric): Examples -------- - >>> from sklearn.neighbors.dist_metrics import DistanceMetric + >>> from sklearn.metrics import DistanceMetric >>> dist = DistanceMetric.get_metric('chebyshev') >>> X = [[0, 1, 2], ... [3, 4, 5]] @@ -1145,3 +1191,340 @@ cdef class PyFuncDistance(DistanceMetric): cdef inline double fmax(double a, double b) nogil: return max(a, b) + + +###################################################################### +# Datasets Pair Classes +cdef class DatasetsPair: + """Abstract class which wraps a pair of datasets (X, Y). + + This class allows computing distances between two vectors (X_i, Y_j) + (rows of X and Y) at a time given the pair of their indices (i, j). + + X and Y can be stored as np.ndarrays or CSR matrices in subclasses. 
+ + This class avoids the overhead of dispatching distance computations + to :class:`sklearn.metrics.DistanceMetric` based on the physical + representation of the vectors (sparse vs. dense). It makes use of + cython.final to remove the overhead of method calls' dispatch. + + Parameters + ---------- + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + + @classmethod + def get_for( + cls, + X, + Y, + str metric="euclidean", + dict metric_kwargs=None, + ) -> DatasetsPair: + """Return the DatasetsPair implementation for the given arguments. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_X, d) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + Y : {ndarray, sparse matrix} of shape (n_Y, d) + Input data. + If provided as a ndarray, it must be C-contiguous. + If provided as a sparse matrix, it must be in CSR format. + + metric : str, default='euclidean' + The distance metric to use for argkmin. The default metric is + a fast implementation of the standard Euclidean metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + Returns + ------- + datasets_pair: DatasetsPair + The suited DatasetsPair implementation. + """ + cdef: + DistanceMetric distance_metric = DistanceMetric.get_metric( + metric, + **(metric_kwargs or {}) + ) + + if X.dtype != np.float64 or Y.dtype != np.float64: + raise ValueError("Only 64bit float datasets are supported for X and Y.") + + # Metric-specific checks that do not replace nor duplicate `check_array`. + distance_metric._validate_data(X) + distance_metric._validate_data(Y) + + if not issparse(X) and not issparse(Y): + return DenseDenseDatasetsPair(X, Y, distance_metric) + if issparse(X) and not issparse(Y): + return SparseDenseDatasetsPair(X, Y, distance_metric) + if not issparse(X) and issparse(Y): + return DenseSparseDatasetsPair(X, Y, distance_metric) + return SparseSparseDatasetsPair(X, Y, distance_metric) + + @classmethod + def unpack_csr_matrix(cls, X: csr_matrix): + """Ensure getting ITYPE instead of int internally used for CSR matrices.""" + X_data = np.asarray(X.data, dtype=DTYPE) + X_indices = np.asarray(X.indices, dtype=ITYPE) + X_indptr = np.asarray(X.indptr, dtype=ITYPE) + return X_data, X_indptr, X_indptr + + def __init__(self, DistanceMetric distance_metric): + self.distance_metric = distance_metric + + cdef ITYPE_t n_X(self) nogil: + """Number of samples in X.""" + return -999 + + cdef ITYPE_t n_Y(self) nogil: + """Number of samples in Y.""" + return -999 + + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.dist(i, j) + + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + return -1 + +@final +cdef class DenseDenseDatasetsPair(DatasetsPair): + """Compute distances between vectors of two arrays. + + Parameters + ---------- + X: ndarray of shape (n_X, d) + Rows represent vectors. Must be C-contiguous. + + Y: ndarray of shape (n_Y, d) + Rows represent vectors. Must be C-contiguous. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). 
+ """ + + def __init__(self, X, Y, DistanceMetric distance_metric): + super().__init__(distance_metric) + # Arrays have already been checked + self.X = X + self.Y = Y + self.d = X.shape[1] + + @final + cdef ITYPE_t n_X(self) nogil: + return self.X.shape[0] + + @final + cdef ITYPE_t n_Y(self) nogil: + return self.Y.shape[0] + + @final + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.rdist(&self.X[i, 0], + &self.Y[j, 0], + self.d) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + return self.distance_metric.dist(&self.X[i, 0], + &self.Y[j, 0], + self.d) + +@final +cdef class SparseSparseDatasetsPair(DatasetsPair): + """Compute distances between vectors of two CSR matrices. + + Parameters + ---------- + X: sparse matrix of shape (n_X, d) + Rows represent vectors. Must be in CSR format. + + Y: sparse matrix of shape (n_X, d) + Rows represent vectors. Must be in CSR format. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + cdef: + const DTYPE_t[:] X_data + const ITYPE_t[:] X_indices, + const ITYPE_t[:] X_indptr, + + const DTYPE_t[:] Y_data + const ITYPE_t[:] Y_indices + const ITYPE_t[:] Y_indptr + + + def __init__(self, X, Y, DistanceMetric distance_metric): + DatasetsPair.__init__(self, distance_metric) + + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + self.Y_data, self.Y_indices, self.Y_indptr = self.unpack_csr_matrix(Y) + + @final + cdef ITYPE_t n_X(self) nogil: + return self.X_indptr.shape[0] - 1 + + @final + cdef ITYPE_t n_Y(self) nogil: + return self.Y_indptr.shape[0] -1 + + @final + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef: + ITYPE_t xi_start = self.X_indptr[i] + ITYPE_t xi_end = self.X_indptr[i + 1] + ITYPE_t yj_start = self.Y_indptr[j] + ITYPE_t yj_end = self.Y_indptr[j + 1] + + return self.distance_metric.csr_rdist( + self.X_data[xi_start:xi_end], + self.X_indices[xi_start:xi_end], + self.Y_data[yj_start:yj_end], + self.Y_indices[yj_start:yj_end], + ) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef: + ITYPE_t xi_start = self.X_indptr[i] + ITYPE_t xi_end = self.X_indptr[i + 1] + ITYPE_t yj_start = self.Y_indptr[j] + ITYPE_t yj_end = self.Y_indptr[j + 1] + + return self.distance_metric.csr_dist( + self.X_data[xi_start:xi_end], + self.X_indices[xi_start:xi_end], + self.Y_data[yj_start:yj_end], + self.Y_indices[yj_start:yj_end] + ) + +@final +cdef class SparseDenseDatasetsPair(DatasetsPair): + """Compute distances between vectors of a CSR matrix and a dense array. + + Parameters + ---------- + X: sparse matrix of shape (n_X, d) + Rows represent vectors. Must be in CSR format. + + Y: ndarray of shape (n_Y, d) + Rows represent vectors. Must be C-contiguous. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). 
+ """ + cdef: + const DTYPE_t[:] X_data + const ITYPE_t[:] X_indices, + const ITYPE_t[:] X_indptr, + + const DTYPE_t[:, ::1] Y # shape: (n_Y, d) + const ITYPE_t[:] Y_indices + + def __init__(self, X, Y, DistanceMetric distance_metric): + super().__init__(distance_metric) + + self.X_data, self.X_indices, self.X_indptr = self.unpack_csr_matrix(X) + + # This array already has been checked here + self.Y = Y + self.Y_indices = np.arange(self.Y.shape[1], dtype=ITYPE) + + @final + cdef ITYPE_t n_X(self) nogil: + return self.X_indptr.shape[0] - 1 + + @final + cdef ITYPE_t n_Y(self) nogil: + return self.Y.shape[0] + + @final + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef: + ITYPE_t xi_start = self.X_indptr[i] + ITYPE_t xi_end = self.X_indptr[i + 1] + + # TODO: the 2D to 1D memory-view conversion might make computation slower, see: + # https://github.com/scikit-learn/scikit-learn/issues/17299 + # Ideally, we could pass pointers and indices and access elements + # then in distance_metric.dist + return self.distance_metric.csr_rdist( + self.X_data[xi_start:xi_end], + self.X_indices[xi_start:xi_end], + self.Y[j, :], + self.Y_indices + ) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + cdef: + ITYPE_t xi_start = self.X_indptr[i] + ITYPE_t xi_end = self.X_indptr[i + 1] + + # TODO: same as previous comment + return self.distance_metric.csr_dist( + self.X_data[xi_start:xi_end], + self.X_indices[xi_start:xi_end], + self.Y[j, :], + self.Y_indices + ) + +@final +cdef class DenseSparseDatasetsPair(DatasetsPair): + """Compute distances between vectors of a dense array and a CSR matrix. + + Parameters + ---------- + X: ndarray of shape (n_X, d) + Rows represent vectors. Must be C-contiguous. + + Y: sparse matrix of shape (n_Y, d) + Rows represent vectors. Must be in CSR format. + + distance_metric: DistanceMetric + The distance metric responsible for computing distances + between two vectors of (X, Y). + """ + cdef: + # As distance metrics are symmetric functions, we can + # simply rely on the other DatasetsPair and swap arguments. + DatasetsPair datasets_pair + + def __init__(self, X, Y, DistanceMetric distance_metric): + super().__init__(distance_metric) + # Swapping arguments on the constructor + self.datasets_pair = SparseDenseDatasetsPair(Y, X, distance_metric) + + @final + cdef ITYPE_t n_X(self) nogil: + # Swapping interface + return self.datasets_pair.n_Y() + + @final + cdef ITYPE_t n_Y(self) nogil: + # Swapping interface + return self.datasets_pair.n_X() + + @final + cdef DTYPE_t ranking_preserving_dist(self, ITYPE_t i, ITYPE_t j) nogil: + # Swapping arguments on the same interface + return self.datasets_pair.ranking_preserving_dist(j, i) + + @final + cdef DTYPE_t dist(self, ITYPE_t i, ITYPE_t j) nogil: + # Swapping arguments on the same interface + return self.datasets_pair.dist(j, i) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx new file mode 100644 index 0000000000000..19f29681c311f --- /dev/null +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -0,0 +1,1852 @@ +# cython: boundscheck=False +# cython: cdivision=True +# cython: initializedcheck=False +# cython: wraparound=False +# distutils: language=c++ + +# Pairwise Distances Reductions +# ============================= +# +# Author: Julien Jerphanion +# +# +# The routines defined here are used in various algorithms performing +# the same structure of operations on distances between vectors +# of a datasets pair (X, Y). 
+ +import numpy as np +cimport numpy as np + +from .. import get_config + +np.import_array() + +from libc.stdlib cimport free, malloc +from libc.float cimport DBL_MAX +from libc.math cimport exp +from libcpp.vector cimport vector +from cython cimport final +from cpython.object cimport PyObject +from cython.operator cimport dereference as deref +from cython.parallel cimport parallel, prange +from cpython.ref cimport Py_INCREF + +from ._dist_metrics cimport DatasetsPair, DenseDenseDatasetsPair +from ..utils._cython_blas cimport ( + BLAS_Order, + BLAS_Trans, + ColMajor, + NoTrans, + RowMajor, + Trans, + _dot, + _gemm, +) +from ..utils._heap cimport simultaneous_sort, heap_push +from ..utils._openmp_helpers cimport _openmp_thread_num +from ..utils._typedefs cimport ITYPE_t, DTYPE_t, DITYPE_t +from ..utils._typedefs cimport ITYPECODE, DTYPECODE + +from numbers import Integral, Real +from typing import List +from scipy.sparse import issparse +from threadpoolctl import threadpool_limits +from ._dist_metrics import BOOL_METRICS, METRIC_MAPPING +from ..utils import check_scalar, _in_unstable_openblas_configuration +from ..utils._openmp_helpers import _openmp_effective_n_threads +from ..utils._typedefs import ITYPE, DTYPE + +# Those constants have been chosen for modern laptops' caches and architecture. +DEF CHUNK_SIZE = 256 # number of vectors +DEF MIN_CHUNK_SAMPLES = 20 + + +# TODO: change for `libcpp.algorithm.move` once Cython 3 is used +# Introduction in Cython: +# https://github.com/cython/cython/blob/05059e2a9b89bf6738a7750b905057e5b1e3fe2e/Cython/Includes/libcpp/algorithm.pxd#L47 #noqa +cdef extern from "" namespace "std" nogil: + OutputIt move[InputIt, OutputIt](InputIt first, InputIt last, OutputIt d_first) except + #noqa + +###################### +## std::vector to np.ndarray coercion +# As type covariance is not supported for C++ containers via Cython, +# we need to redefine fused types. +ctypedef fused vector_DITYPE_t: + vector[ITYPE_t] + vector[DTYPE_t] + + +ctypedef fused vector_vector_DITYPE_t: + vector[vector[ITYPE_t]] + vector[vector[DTYPE_t]] + + +cdef class StdVectorSentinel: + """Wraps a reference to a vector which will be deallocated with this object. + + When created, the StdVectorSentinel swaps the reference of its internal + vectors with the provided one (vec_ptr), thus making the StdVectorSentinel + manage the provided one's lifetime. + """ + pass + + +# We necessarily need to define two extension types extending StdVectorSentinel +# because we need to provide the dtype of the vector but can't use numeric fused types. +cdef class StdVectorSentinelDTYPE(StdVectorSentinel): + cdef vector[DTYPE_t] vec + + @staticmethod + cdef StdVectorSentinel create_for(vector[DTYPE_t] * vec_ptr): + # This initializes the object directly without calling __init__ + cdef StdVectorSentinelDTYPE sentinel = StdVectorSentinelDTYPE.__new__(StdVectorSentinelDTYPE) + sentinel.vec.swap(deref(vec_ptr)) + return sentinel + + +cdef class StdVectorSentinelITYPE(StdVectorSentinel): + cdef vector[ITYPE_t] vec + + @staticmethod + cdef StdVectorSentinel create_for(vector[ITYPE_t] * vec_ptr): + # This initializes the object directly without calling __init__ + cdef StdVectorSentinelITYPE sentinel = StdVectorSentinelITYPE.__new__(StdVectorSentinelITYPE) + sentinel.vec.swap(deref(vec_ptr)) + return sentinel + + +cpdef DTYPE_t[::1] _sqeuclidean_row_norms( + const DTYPE_t[:, ::1] X, + ITYPE_t num_threads, +): + """Compute the squared euclidean norm of the rows of X in parallel. 
+ + This is faster than using np.einsum("ij, ij->i") even when using a single thread. + """ + cdef: + # Casting for X to remove the const qualifier is needed because APIs + # exposed via scipy.linalg.cython_blas aren't reflecting the arguments' + # const qualifier. + DTYPE_t * X_ptr = &X[0, 0] + ITYPE_t idx = 0 + ITYPE_t n = X.shape[0] + ITYPE_t d = X.shape[1] + DTYPE_t[::1] row_norms = np.empty(n, dtype=DTYPE) + + for idx in prange(n, schedule='static', nogil=True, num_threads=num_threads): + row_norms[idx] = _dot(d, X_ptr + idx * d, 1, X_ptr + idx * d, 1) + + return row_norms + +cdef np.ndarray vector_to_nd_array(vector_DITYPE_t * vect_ptr): + """Create a numpy ndarray given a C++ vector. + + The numpy array buffer is the one of the C++ vector. + A StdVectorSentinel is registered as the base object for the numpy array, + freeing the C++ vector it encapsulates when the numpy array is freed. + """ + typenum = DTYPECODE if vector_DITYPE_t is vector[DTYPE_t] else ITYPECODE + cdef: + np.npy_intp size = deref(vect_ptr).size() + np.ndarray arr = np.PyArray_SimpleNewFromData(1, &size, typenum, + deref(vect_ptr).data()) + StdVectorSentinel sentinel + + if vector_DITYPE_t is vector[DTYPE_t]: + sentinel = StdVectorSentinelDTYPE.create_for(vect_ptr) + else: + sentinel = StdVectorSentinelITYPE.create_for(vect_ptr) + + # Makes the numpy array responsible of the life-cycle of its buffer. + # A reference to the StdVectorSentinel will be stolen by the call bellow, + # so we increase its reference counter. + # See: https://docs.python.org/3/c-api/intro.html#reference-count-details + Py_INCREF(sentinel) + np.PyArray_SetBaseObject(arr, sentinel) + return arr + + +cdef np.ndarray[object, ndim=1] coerce_vectors_to_nd_arrays( + vector_vector_DITYPE_t* vecs +): + """Coerce a std::vector of std::vector to a ndarray of ndarray.""" + cdef: + ITYPE_t n = deref(vecs).size() + np.ndarray[object, ndim=1] nd_arrays_of_nd_arrays = np.empty(n, + dtype=np.ndarray) + + for i in range(n): + nd_arrays_of_nd_arrays[i] = vector_to_nd_array(&(deref(vecs)[i])) + + return nd_arrays_of_nd_arrays + +##################### + +cdef class PairwiseDistancesReduction: + """Abstract class which compute pairwise distances between + a set of vectors (rows) X and another set of vectors (rows) of Y + and apply a reduction on top. + + The computations of the distances and the reduction is parallelized + on chunks of vectors of X and Y. + + Parameters + ---------- + datasets_pair: DatasetsPair + The pair of dataset to use. + + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + n_threads: int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on :method:`~PairwiseDistancesReduction.compute`. + + None and -1 means using all processors. 
+ """ + + cdef: + DatasetsPair _datasets_pair + + ITYPE_t n_threads + ITYPE_t effective_omp_n_thread + ITYPE_t n_samples_chunk, chunk_size + + ITYPE_t n_X, X_n_samples_chunk, X_n_chunks, X_n_samples_remainder + ITYPE_t n_Y, Y_n_samples_chunk, Y_n_chunks, Y_n_samples_remainder + + @classmethod + def valid_metrics(cls) -> List[str]: + excluded = { + "pyfunc", # is relatively slow because we need to coerce data as np arrays + "mahalanobis", # is numerically unstable + # TODO: In order to support discrete distance metrics, we need to have a + # simultaneous sort which breaks ties on indices when distances are identical. + # The best might be using a std::sort and a Comparator which might need + # AoS instead of SoA (currently used). + "hamming", + *BOOL_METRICS, + } + return sorted({"fast_euclidean", "fast_sqeuclidean", + *METRIC_MAPPING.keys()}.difference(excluded)) + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + """Return True if the PairwiseDistancesReduction for the given parameters. + + Parameters + ---------- + X : {ndarray, sparse matrix} of shape (n_X, d) + Input data. + + Y : {ndarray, sparse matrix} of shape (n_Y, d) + Input data. + + metric : str, default='euclidean' + The distance metric to use. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + Returns + ------- + True if the PairwiseDistancesReduction can be used, else False. + """ + # Coercing to np.array to get the dtype + # TODO: what is the best way to get lists' dtype? + X = np.asarray(X) if isinstance(X, (tuple, list)) else X + Y = np.asarray(Y) if isinstance(Y, (tuple, list)) else Y + # TODO: support sparse arrays and 32 bits + return (not issparse(X) and X.dtype == np.float64 and X.ndim == 2 and + not issparse(Y) and Y.dtype == np.float64 and Y.ndim == 2 and + metric in cls.valid_metrics()) + + @property + def datasets_pair(self) -> DatasetsPair: + return self._datasets_pair + + def __init__( + self, + DatasetsPair datasets_pair, + chunk_size=None, + n_threads=None, + ): + cdef: + ITYPE_t X_n_full_chunks, Y_n_full_chunks + + if chunk_size is None: + chunk_size = get_config().get("pairwise_dist_chunk_size", CHUNK_SIZE) + + check_scalar(chunk_size, "chunk_size", Integral, min_val=1) + self.chunk_size = chunk_size + + self.effective_omp_n_thread = _openmp_effective_n_threads(n_threads) + + self.n_samples_chunk = max(MIN_CHUNK_SAMPLES, chunk_size) + + self._datasets_pair = datasets_pair + + self.n_Y = datasets_pair.n_Y() + self.Y_n_samples_chunk = min(self.n_Y, self.n_samples_chunk) + Y_n_full_chunks = self.n_Y // self.Y_n_samples_chunk + self.Y_n_samples_remainder = self.n_Y % self.Y_n_samples_chunk + + self.n_X = datasets_pair.n_X() + self.X_n_samples_chunk = min(self.n_X, self.n_samples_chunk) + X_n_full_chunks = self.n_X // self.X_n_samples_chunk + self.X_n_samples_remainder = self.n_X % self.X_n_samples_chunk + + # Counting remainder chunk in total number of chunks + self.Y_n_chunks = Y_n_full_chunks + ( + self.n_Y != (Y_n_full_chunks * self.Y_n_samples_chunk) + ) + + self.X_n_chunks = X_n_full_chunks + ( + self.n_X != (X_n_full_chunks * self.X_n_samples_chunk) + ) + + def compute( + self, + str strategy=None, + bint return_distance=False, + ): + """Computes the reduction of vectors (rows) of X on Y. + + Parameters + ---------- + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. 
+ + Strategies differs on the dispatching they use for chunks on threads: + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation + but is less used in practice (because X is smaller than Y generally). + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel but uses intermediate datastructures + synchronisation. However it is more useful in practice (because Y is + larger than X generally). + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y'. + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + return_distance : boolean, default=False + Return distances between each X vector and its + argkmin if set to True. + + Returns + ------- + Results for the PairwiseDistancesReduction, usually an array of indices + and optionally an array of associated distances if return_distance is True. + """ + + if strategy is None: + strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') + + if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. + if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_X: + strategy = 'parallel_on_X' + else: + strategy = 'parallel_on_Y' + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with threadpool_limits(limits=1, user_api="blas"): + if strategy == 'parallel_on_Y': + self._parallel_on_Y() + elif strategy == 'parallel_on_X': + self._parallel_on_X() + else: + raise RuntimeError(f"strategy '{strategy}' not supported.") + + return self._finalize_results(return_distance) + + @final + cdef void _parallel_on_X(self) nogil: + """Computes the reduction of each vector (row) of X on Y + by parallelizing computation on chunks of X. + + This strategy dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. 
+ """ + cdef: + ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + ITYPE_t num_threads = min(self.X_n_chunks, self.effective_omp_n_thread) + ITYPE_t thread_num + + with nogil, parallel(num_threads=num_threads): + thread_num = _openmp_thread_num() + + # Allocating thread datastructures + self._on_X_parallel_init(thread_num) + + for X_chunk_idx in prange(self.X_n_chunks, schedule='static'): + X_start = X_chunk_idx * self.X_n_samples_chunk + if (X_chunk_idx == self.X_n_chunks - 1 + and self.X_n_samples_remainder > 0): + X_end = X_start + self.X_n_samples_remainder + else: + X_end = X_start + self.X_n_samples_chunk + + # Reinitializing thread datastructures for the new X chunk + self._on_X_prange_iter_init(thread_num, X_start, X_end) + + for Y_chunk_idx in range(self.Y_n_chunks): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if (Y_chunk_idx == self.Y_n_chunks - 1 + and self.Y_n_samples_remainder > 0): + Y_end = Y_start + self.Y_n_samples_remainder + else: + Y_end = Y_start + self.Y_n_samples_chunk + + self._compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + + # Adjusting thread datastructures on the full pass on Y + self._on_X_prange_iter_finalize(thread_num, X_start, X_end) + + # end: for X_chunk_idx + + # Deallocating thread datastructures + self._on_X_parallel_finalize(thread_num) + + # end: with nogil, parallel + return + + @final + cdef void _parallel_on_Y(self) nogil: + """Computes the reduction of each vector (row) of X on Y + by parallelizing computation on chunks of Y. + + This strategy dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel but uses intermediate datastructures + synchronisation. + + Private datastructures are modified internally by threads. + + Private template methods can be implemented on subclasses to + interact with those datastructures at various stages. 
+ """ + cdef: + ITYPE_t Y_start, Y_end, X_start, X_end, X_chunk_idx, Y_chunk_idx + ITYPE_t num_threads = min(self.Y_n_chunks, self.effective_omp_n_thread) + ITYPE_t thread_num + + # Allocating datastructures + self._on_Y_init(num_threads) + + for X_chunk_idx in range(self.X_n_chunks): + X_start = X_chunk_idx * self.X_n_samples_chunk + if X_chunk_idx == self.X_n_chunks - 1 and self.X_n_samples_remainder > 0: + X_end = X_start + self.X_n_samples_remainder + else: + X_end = X_start + self.X_n_samples_chunk + + with nogil, parallel(num_threads=num_threads): + thread_num = _openmp_thread_num() + + # Initializing datastructures used in this thread + self._on_Y_parallel_init(thread_num) + + for Y_chunk_idx in prange(self.Y_n_chunks, schedule='static'): + Y_start = Y_chunk_idx * self.Y_n_samples_chunk + if Y_chunk_idx == self.Y_n_chunks - 1 \ + and self.Y_n_samples_remainder > 0: + Y_end = Y_start + self.Y_n_samples_remainder + else: + Y_end = Y_start + self.Y_n_samples_chunk + + self._compute_and_reduce_distances_on_chunks( + X_start, X_end, + Y_start, Y_end, + thread_num, + ) + # end: prange + # end: with nogil, parallel + + # Synchronizing the thread datastructures with the main ones + self._on_Y_after_parallel(num_threads, X_start, X_end) + + # end: for X_chunk_idx + # Deallocating temporary datastructures and adjusting main datastructures + self._on_Y_finalize(num_threads) + return + + # Placeholder methods which have to be implemented + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + """Compute the pairwise distances on two chunks of X and Y and reduce them. + + This is the core critical region of PairwiseDistanceReductions' computations + which must be implemented in subclasses. + """ + return + + def _finalize_results(self, bint return_distance): + """Call-back adapting datastructures before returning results. + + This must be implemented in subclasses. 
+ """ + return None + + # Placeholder methods which can be implemented + + cdef void compute_exact_distances(self) nogil: + """Convert ranking-preserving distances to exact distances or recompute them.""" + return + + cdef void _on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + """Allocate datastructures used in a thread given its number.""" + return + + cdef void _on_X_prange_iter_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Initialise datastructures used in a thread given its number.""" + return + + cdef void _on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Interact with datastructures after a reduction on chunks.""" + return + + cdef void _on_X_parallel_finalize( + self, + ITYPE_t thread_num + ) nogil: + """Interact with datastructures after executing all the reductions.""" + return + + cdef void _on_Y_init( + self, + ITYPE_t num_threads, + ) nogil: + """Allocate datastructures used in threads.""" + return + + cdef void _on_Y_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + """Initialise datastructures used in a thread given its number.""" + return + + cdef void _on_Y_after_parallel( + self, + ITYPE_t num_threads, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + """Interact with datastructures after a threads parallel region.""" + return + + cdef void _on_Y_finalize( + self, + ITYPE_t num_threads, + ) nogil: + """Interact with datastructures after executing all the reductions.""" + return + +cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): + """Computes the argkmin of vectors (rows) of a set of + vectors (rows) of X on another set of vectors (rows) of Y. + + Parameters + ---------- + datasets_pair: DatasetsPair + The dataset pairs (X, Y) for the reduction. + + k: int + The k for the argkmin reduction. + + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + n_threads: int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on :method:`~ArgKmin.compute`. + + None and -1 means using all processors. + """ + + cdef: + ITYPE_t k + + ITYPE_t[:, ::1] argkmin_indices + DTYPE_t[:, ::1] argkmin_distances + + # Used as array of pointers to private datastructures used in threads. + DTYPE_t ** heaps_r_distances_chunks + ITYPE_t ** heaps_indices_chunks + + @classmethod + def get_for( + cls, + X, + Y, + ITYPE_t k, + str metric="fast_euclidean", + chunk_size=None, + dict metric_kwargs=None, + n_threads=None, + ) -> PairwiseDistancesArgKmin: + """Return the PairwiseDistancesArgKmin implementation for the given arguments. + + Parameters + ---------- + X : array-like of shape (n_X, d) + Input data. + + Y : array-like of shape (n_Y, d) + Input data. + + k : int + The k for the argkmin reduction. + + metric : str, default='fast_euclidean' + The distance metric to use for argkmin. The default metric is + a fast implementation of the standard Euclidean metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. 
+ + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + n_threads : int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on + :method:`~PairwiseDistancesArgKmin.compute`. + + None and -1 means using all processors. + + Returns + ------- + argkmin: PairwiseDistancesArgKmin + The suited PairwiseDistancesArgKmin implementation. + """ + # This factory comes to handle specialisations. + if metric in ("fast_euclidean", "fast_sqeuclidean") and not issparse(X) and not issparse(Y): + use_squared_distances = metric == "fast_sqeuclidean" + return FastEuclideanPairwiseDistancesArgKmin( + X=X, Y=Y, k=k, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size + ) + + return PairwiseDistancesArgKmin( + datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), + k=k, + chunk_size=chunk_size, + ) + + def __init__( + self, + DatasetsPair datasets_pair, + ITYPE_t k, + chunk_size=None, + n_threads=None, + ): + super().__init__(datasets_pair, chunk_size, n_threads) + + check_scalar(k, "k", Integral, min_val=1) + self.k = k + + # Allocating pointers to datastructures but not the datastructures themselves. + # There as many pointers as available threads. + # When reducing on small datasets, there can be more pointers than actual + # threads used for the reduction but there won't be allocated but unused + # datastructures. + self.heaps_r_distances_chunks = malloc( + sizeof(DTYPE_t *) * self.effective_omp_n_thread + ) + self.heaps_indices_chunks = malloc( + sizeof(ITYPE_t *) * self.effective_omp_n_thread + ) + + # Main heaps used by PairwiseDistancesArgKmin.compute to return results. + self.argkmin_indices = np.full((self.n_X, self.k), 0, dtype=ITYPE) + self.argkmin_distances = np.full((self.n_X, self.k), DBL_MAX, dtype=DTYPE) + + def __dealloc__(self): + if self.heaps_indices_chunks is not NULL: + free(self.heaps_indices_chunks) + + if self.heaps_r_distances_chunks is not NULL: + free(self.heaps_r_distances_chunks) + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t n_X = X_end - X_start + ITYPE_t n_Y = Y_end - Y_start + ITYPE_t k = self.k + DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] + ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] + + # Pushing the distance and their associated indices on heaps + # which keep tracks of the argkmin. 
+ for i in range(n_X): + for j in range(n_Y): + heap_push( + heaps_r_distances + i * self.k, + heaps_indices + i * self.k, + k, + self._datasets_pair.ranking_preserving_dist(X_start + i, Y_start + j), + Y_start + j, + ) + + @final + cdef void _on_X_prange_iter_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + # As this strategy is embarrassingly parallel, we can set the + # thread heaps pointers to the proper position on the main heaps + self.heaps_r_distances_chunks[thread_num] = &self.argkmin_distances[X_start, 0] + self.heaps_indices_chunks[thread_num] = &self.argkmin_indices[X_start, 0] + + @final + cdef void _on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, jdx + + # Sorting indices of the argkmin for each query vector of X + for idx in range(X_end - X_start): + simultaneous_sort( + self.heaps_r_distances_chunks[thread_num] + idx * self.k, + self.heaps_indices_chunks[thread_num] + idx * self.k, + self.k + ) + + cdef void _on_Y_init( + self, + ITYPE_t num_threads, + ) nogil: + cdef: + # Maximum number of scalar elements (the last chunks can be smaller) + ITYPE_t heaps_size = self.X_n_samples_chunk * self.k + ITYPE_t thread_num + + for thread_num in prange(num_threads, schedule='static', nogil=True, + num_threads=num_threads): + # As chunks of X are shared across threads, so must their + # heaps. To solve this, each thread has its own heaps + # which are then synchronised back in the main ones. + self.heaps_r_distances_chunks[thread_num] = malloc( + heaps_size * sizeof(DTYPE_t) + ) + self.heaps_indices_chunks[thread_num] = malloc( + heaps_size * sizeof(ITYPE_t) + ) + + @final + cdef void _on_Y_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + # Initialising heaps (memset can't be used here) + for idx in range(self.X_n_samples_chunk * self.k): + self.heaps_r_distances_chunks[thread_num][idx] = DBL_MAX + self.heaps_indices_chunks[thread_num][idx] = -1 + + @final + cdef void _on_Y_after_parallel( + self, + ITYPE_t num_threads, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, jdx, thread_num + with nogil, parallel(num_threads=self.effective_omp_n_thread): + # Synchronising the thread heaps with the main heaps + # This is done in parallel samples-wise (no need for locks) + for idx in prange(X_end - X_start, schedule="static"): + for thread_num in range(num_threads): + for jdx in range(self.k): + heap_push( + &self.argkmin_distances[X_start + idx, 0], + &self.argkmin_indices[X_start + idx, 0], + self.k, + self.heaps_r_distances_chunks[thread_num][idx * self.k + jdx], + self.heaps_indices_chunks[thread_num][idx * self.k + jdx], + ) + + cdef void _on_Y_finalize( + self, + ITYPE_t num_threads, + ) nogil: + cdef: + ITYPE_t idx, thread_num + + with nogil, parallel(num_threads=self.effective_omp_n_thread): + # Deallocating temporary datastructures + for thread_num in prange(num_threads, schedule='static'): + free(self.heaps_r_distances_chunks[thread_num]) + free(self.heaps_indices_chunks[thread_num]) + + # Sort the main heaps into arrays in parallel + # in ascending order w.r.t the distances + for idx in prange(self.n_X, schedule='static'): + simultaneous_sort( + &self.argkmin_distances[idx, 0], + &self.argkmin_indices[idx, 0], + self.k, + ) + return + + cdef void compute_exact_distances(self) nogil: + cdef: + ITYPE_t i, j + ITYPE_t[:, ::1] Y_indices = self.argkmin_indices + DTYPE_t[:, ::1] distances = self.argkmin_distances + for i in prange(self.n_X, 
schedule='static', nogil=True, + num_threads=self.effective_omp_n_thread): + for j in range(self.k): + distances[i, j] = self._datasets_pair.distance_metric._rdist_to_dist( + # Guard against eventual -0., causing nan production. + distances[i, j] if distances[i, j] > 0. else 0. + ) + + def _finalize_results(self, bint return_distance=False): + if return_distance: + # We eventually need to recompute distances because we relied on proxies. + self.compute_exact_distances() + return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) + + return np.asarray(self.argkmin_indices) + + +cdef class FastEuclideanPairwiseDistancesArgKmin(PairwiseDistancesArgKmin): + """Fast specialized alternative for PairwiseDistancesArgKmin on EuclideanDistance. + + Notes + ----- + This implementation has a superior arithmetic intensity and hence + better running time when the alternative is IO bound, but it can suffer + from numerical instability. + + PairwiseDistancesArgKmin with EuclideanDistance must be used when higher + numerical precision is needed. + """ + + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + const DTYPE_t[::1] X_sq_norms + const DTYPE_t[::1] Y_sq_norms + + # Buffers for GEMM + DTYPE_t ** dist_middle_terms_chunks + bint use_squared_distances + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return (PairwiseDistancesArgKmin.is_usable_for(X, Y, metric) and + not _in_unstable_openblas_configuration()) + + def __init__( + self, + X, + Y, + ITYPE_t k, + bint use_squared_distances=False, + chunk_size=None, + ): + super().__init__( + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), + k=k, + chunk_size=chunk_size, + ) + # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair + cdef: + DenseDenseDatasetsPair datasets_pair = self.datasets_pair + self.X, self.Y = datasets_pair.X, datasets_pair.Y + self.X_sq_norms = _sqeuclidean_row_norms(self.X, self.effective_omp_n_thread) + self.Y_sq_norms = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) + self.use_squared_distances = use_squared_distances + + # Temporary datastructures used in threads + self.dist_middle_terms_chunks = malloc( + sizeof(DTYPE_t *) * self.effective_omp_n_thread + ) + + def __dealloc__(self): + if self.dist_middle_terms_chunks is not NULL: + free(self.dist_middle_terms_chunks) + + @final + cdef void compute_exact_distances(self) nogil: + if not self.use_squared_distances: + PairwiseDistancesArgKmin.compute_exact_distances(self) + + @final + cdef void _on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesArgKmin._on_X_parallel_init(self, thread_num) + + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) + + @final + cdef void _on_X_parallel_finalize( + self, + ITYPE_t thread_num + ) nogil: + PairwiseDistancesArgKmin._on_X_parallel_finalize(self, thread_num) + free(self.dist_middle_terms_chunks[thread_num]) + + @final + cdef void _on_Y_init( + self, + ITYPE_t num_threads, + ) nogil: + cdef ITYPE_t thread_num + PairwiseDistancesArgKmin._on_Y_init(self, num_threads) + + for thread_num in range(num_threads): + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) + + @final + cdef void _on_Y_finalize( + self, 
+ ITYPE_t num_threads, + ) nogil: + cdef ITYPE_t thread_num + PairwiseDistancesArgKmin._on_Y_finalize(self, num_threads) + + for thread_num in range(num_threads): + free(self.dist_middle_terms_chunks[thread_num]) + + @final + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + ITYPE_t k = self.k + + const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] + const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] + DTYPE_t *heaps_r_distances = self.heaps_r_distances_chunks[thread_num] + ITYPE_t *heaps_indices = self.heaps_indices_chunks[thread_num] + + # We compute the full pairwise squared distances matrix as follows + # + # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², + # + # The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. + # + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + ITYPE_t m = X_c.shape[0] + ITYPE_t n = Y_c.shape[0] + ITYPE_t K = X_c.shape[1] + DTYPE_t alpha = - 2. + # Casting for A and B to remove the const is needed because APIs exposed via + # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. + DTYPE_t * A = & X_c[0, 0] + ITYPE_t lda = X_c.shape[1] + DTYPE_t * B = & Y_c[0, 0] + ITYPE_t ldb = X_c.shape[1] + DTYPE_t beta = 0. + DTYPE_t * C = dist_middle_terms + ITYPE_t ldc = Y_c.shape[0] + + # dist_middle_terms = -2 * X_c.dot(Y_c.T) + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc) + + # Pushing the distance and their associated indices on heaps + # which keep tracks of the argkmin. + for i in range(X_c.shape[0]): + for j in range(Y_c.shape[0]): + heap_push( + heaps_r_distances + i * k, + heaps_indices + i * k, + k, + # Using the squared euclidean distance as the ranking-preserving distance: + # |X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + ( + self.X_sq_norms[i + X_start] + + dist_middle_terms[i * Y_c.shape[0] + j] + + self.Y_sq_norms[j + Y_start] + ), + j + Y_start, + ) + + +cdef class PairwiseDistancesRadiusNeighborhood(PairwiseDistancesReduction): + """Returns radius-based neighbors vectors' indices in a dataset Y of + of vectors in a dataset X. + + Parameters + ---------- + datasets_pair: DatasetsPair + The dataset pairs (X, Y) for the reduction. + + radius: float + The radius defining the neighborhood. + + chunk_size: int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + n_threads: int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on + :method:`~PairwiseDistancesRadiusNeighborhood.compute`. + + None and -1 means using all processors. + """ + + cdef: + DTYPE_t radius + + # DistanceMetric compute ranking-preserving surrogate distance via rdist + # which are proxies necessitating less computations. + # We get the equivalent for the radius to be able to compare it against + # vectors' ranking-preserving surrogate distances. 
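+        # Editor's note (illustrative): for the Euclidean case the surrogate is
+        # the squared distance, so `_dist_to_rdist(radius)` amounts to
+        # `radius ** 2` and each candidate pair is tested as
+        # `sq_dist <= radius ** 2`, avoiding one sqrt per pair; only pairs that
+        # are kept get converted back with `_rdist_to_dist` when exact distances
+        # are requested.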
+ DTYPE_t r_radius + + # Neighbors indices and distances are returned as np.ndarray of np.ndarray. + # + # We want resizable buffers which we will to wrapped within numpy + # arrays at the end. std::vector comes as a handy interface for + # interacting efficiently with resizable buffers. + # + # Though it is possible to access their buffer address with + # std::vector::data, they can't be stolen: buffers lifetime + # is tight to their std::vector and are deallocated when + # std::vectors are. + # + # To solve this, we dynamically allocate std::vectors and then + # encapsulate them in a StdVectorSentinel responsible for + # freeing them when the associated np.ndarray is freed. + vector[vector[ITYPE_t]] * neigh_indices + vector[vector[DTYPE_t]] * neigh_distances + + # Used as array of pointers to private datastructures used in threads. + vector[vector[ITYPE_t]] ** neigh_indices_chunks + vector[vector[DTYPE_t]] ** neigh_distances_chunks + + bint sort_results + + @classmethod + def get_for( + cls, + X, + Y, + DTYPE_t radius, + str metric="fast_euclidean", + chunk_size=None, + dict metric_kwargs=None, + n_threads=None, + bint sort_results=False, + ) -> PairwiseDistancesRadiusNeighborhood: + """Return the PairwiseDistancesRadiusNeighborhood implementation for the given arguments. + + Parameters + ---------- + X : array-like of shape (n_X, d) + Input data. + + Y : array-like of shape (n_Y, d) + Input data. + + radius : float + The radius defining the neighborhood. + + metric : str, default='fast_euclidean' + The distance metric to use for argkmin. The default metric is + a fast implementation of the standard Euclidean metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + metric_kwargs : dict, default=None + Keyword arguments to pass to specified metric function. + + n_threads: int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on + :method:`~PairwiseDistancesRadiusNeighborhood.compute`. + + None and -1 means using all processors. + + sort_results : boolean, default=False + Sort results with respect to distances between each X vector and its + neighbors if set to True. + + Returns + ------- + radius_neighborhood: PairwiseDistancesRadiusNeighborhood + The suited PairwiseDistancesRadiusNeighborhood implementation. + """ + # This factory comes to handle specialisations. 
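+        # Editor's sketch of the intended call pattern (hypothetical variable
+        # names, mirroring the tests added in this patch):
+        #
+        #     dist, indices = PairwiseDistancesRadiusNeighborhood.get_for(
+        #         X, Y, radius=10.0, metric="fast_euclidean"
+        #     ).compute(strategy="auto", return_distance=True)
+        #
+        # Both returned values are arrays of ragged per-query arrays, since the
+        # number of neighbors within `radius` differs from one query vector to
+        # another.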
+ if metric in ("fast_euclidean", "fast_sqeuclidean") and not issparse(X) and not issparse(Y): + use_squared_distances = metric == "fast_sqeuclidean" + return FastEuclideanPairwiseDistancesRadiusNeighborhood( + X=X, Y=Y, radius=radius, + use_squared_distances=use_squared_distances, + chunk_size=chunk_size, + sort_results=sort_results, + ) + + return PairwiseDistancesRadiusNeighborhood( + datasets_pair=DatasetsPair.get_for(X, Y, metric, metric_kwargs), + radius=radius, + chunk_size=chunk_size, + sort_results=sort_results, + ) + + def __init__( + self, + DatasetsPair datasets_pair, + DTYPE_t radius, + chunk_size=None, + n_threads=None, + sort_results=False + ): + super().__init__(datasets_pair, chunk_size, n_threads) + + check_scalar(radius, "radius", Real, min_val=0) + self.radius = radius + self.r_radius = self._datasets_pair.distance_metric._dist_to_rdist(radius) + self.sort_results = sort_results + + # Allocating pointers to datastructures but not the datastructures themselves. + # There as many pointers as available threads. + # When reducing on small datasets, there can be more pointers than actual + # threads used for the reduction but there won't be allocated but unused + # datastructures. + self.neigh_distances_chunks = malloc( + sizeof(self.neigh_distances) * self.effective_omp_n_thread + ) + self.neigh_indices_chunks = malloc( + sizeof(self.neigh_indices) * self.effective_omp_n_thread + ) + + # Temporary datastructures which will be coerced to numpy arrays on before + # PairwiseDistancesRadiusNeighborhood.compute "return" and will be then freed. + self.neigh_indices = new vector[vector[ITYPE_t]](self.n_X) + self.neigh_distances = new vector[vector[DTYPE_t]](self.n_X) + + def __dealloc__(self): + if self.neigh_distances_chunks is not NULL: + free(self.neigh_distances_chunks) + + if self.neigh_indices_chunks is not NULL: + free(self.neigh_indices_chunks) + + if self.neigh_indices is not NULL: + del self.neigh_indices + + if self.neigh_distances is not NULL: + del self.neigh_distances + + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + DTYPE_t r_dist_i_j + + for i in range(X_start, X_end): + for j in range(Y_start, Y_end): + r_dist_i_j = self._datasets_pair.ranking_preserving_dist(i, j) + if r_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i].push_back(r_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i].push_back(j) + + def _finalize_results(self, bint return_distance=False): + if return_distance: + self.compute_exact_distances() + return ( + coerce_vectors_to_nd_arrays(self.neigh_distances), + coerce_vectors_to_nd_arrays(self.neigh_indices), + ) + + return coerce_vectors_to_nd_arrays(self.neigh_indices) + + @final + cdef void _on_X_prange_iter_init( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + + # As this strategy is embarrassingly parallel, we can set the + # thread vectors' pointers to the main vectors'. 
+ self.neigh_distances_chunks[thread_num] = self.neigh_distances + self.neigh_indices_chunks[thread_num] = self.neigh_indices + + @final + cdef void _on_X_prange_iter_finalize( + self, + ITYPE_t thread_num, + ITYPE_t X_start, + ITYPE_t X_end, + ) nogil: + cdef: + ITYPE_t idx, jdx + + # Sorting neighbors for each query vector of X + if self.sort_results: + for idx in range(X_start, X_end): + simultaneous_sort( + deref(self.neigh_distances)[idx].data(), + deref(self.neigh_indices)[idx].data(), + deref(self.neigh_indices)[idx].size() + ) + + cdef void _on_Y_init( + self, + ITYPE_t num_threads, + ) nogil: + cdef: + ITYPE_t thread_num + # As chunks of X are shared across threads, so must datastructures + # to avoid race conditions. + # Each thread has its own vectors of n_X vectors which are then merged + # back in the main n_X vectors. + for thread_num in range(num_threads): + self.neigh_distances_chunks[thread_num] = new vector[vector[DTYPE_t]](self.n_X) + self.neigh_indices_chunks[thread_num] = new vector[vector[ITYPE_t]](self.n_X) + + @final + cdef void _merge_vectors( + self, + ITYPE_t idx, + ITYPE_t num_threads, + ) nogil: + cdef: + ITYPE_t thread_num + ITYPE_t idx_n_elements = 0 + ITYPE_t last_element_idx = deref(self.neigh_indices)[idx].size() + + # Resizing buffers only once for the given + for thread_num in range(num_threads): + idx_n_elements += deref(self.neigh_distances_chunks[thread_num])[idx].size() + + deref(self.neigh_distances)[idx].resize(last_element_idx + idx_n_elements) + deref(self.neigh_indices)[idx].resize(last_element_idx + idx_n_elements) + + # Moving the elements by range using the range first element + # as the reference for the insertion + for thread_num in range(num_threads): + move( + deref(self.neigh_distances_chunks[thread_num])[idx].begin(), + deref(self.neigh_distances_chunks[thread_num])[idx].end(), + deref(self.neigh_distances)[idx].begin() + last_element_idx + ) + move( + deref(self.neigh_indices_chunks[thread_num])[idx].begin(), + deref(self.neigh_indices_chunks[thread_num])[idx].end(), + deref(self.neigh_indices)[idx].begin() + last_element_idx + ) + last_element_idx += deref(self.neigh_distances_chunks[thread_num])[idx].size() + + + cdef void _on_Y_finalize( + self, + ITYPE_t num_threads, + ) nogil: + cdef: + ITYPE_t idx, jdx, thread_num, idx_n_element, idx_current + + with nogil, parallel(num_threads=self.effective_omp_n_thread): + # Merge vectors used in threads into the main ones. + # This is done in parallel sample-wise (no need for locks) + # using dynamic scheduling because we generally do not have + # the same number of neighbors for each query vectors. + # TODO: compare 'dynamic' vs 'static' vs 'guided' + for idx in prange(self.n_X, schedule='dynamic'): + self._merge_vectors(idx, num_threads) + + # The content of the vector have been std::moved, + # Hence they can't be used anymore and can only be deleted. 
+ for thread_num in prange(num_threads, schedule='static'): + del self.neigh_distances_chunks[thread_num] + del self.neigh_indices_chunks[thread_num] + + # Sort in parallel in ascending order w.r.t the distances if needed + if self.sort_results: + for idx in prange(self.n_X, schedule='static'): + simultaneous_sort( + deref(self.neigh_distances)[idx].data(), + deref(self.neigh_indices)[idx].data(), + deref(self.neigh_indices)[idx].size() + ) + + return + + cdef void compute_exact_distances(self) nogil: + """Convert ranking-preserving distances to pairwise distances in parallel.""" + cdef: + ITYPE_t i, j + + for i in prange(self.n_X, nogil=True, schedule='static', + num_threads=self.effective_omp_n_thread): + for j in range(deref(self.neigh_indices)[i].size()): + deref(self.neigh_distances)[i][j] = ( + self._datasets_pair.distance_metric._rdist_to_dist( + # Guard against eventual -0., causing nan production. + deref(self.neigh_distances)[i][j] + if deref(self.neigh_distances)[i][j] > 0. + else 0 + ) + ) + + @final + def compute( + self, + str strategy=None, + bint return_distance=False, + ): + if self.sort_results and not return_distance: + raise ValueError("return_distance must be True if sort_results is True.") + + return super().compute(strategy=strategy, return_distance=return_distance) + + +cdef class FastEuclideanPairwiseDistancesRadiusNeighborhood(PairwiseDistancesRadiusNeighborhood): + """Fast specialized alternative for PairwiseDistancesRadiusNeighborhood on EuclideanDistance. + + Notes + ----- + This implementation has a superior arithmetic intensity and hence + better running time when the alternative is IO bound, but it can suffer + from numerical instability. + + RadiusNeighborhood with EuclideanDistance must be used when higher + numerical precision is needed. + """ + + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + const DTYPE_t[::1] X_sq_norms + const DTYPE_t[::1] Y_sq_norms + + # Buffers for GEMM + DTYPE_t ** dist_middle_terms_chunks + bint use_squared_distances + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return (PairwiseDistancesRadiusNeighborhood.is_usable_for(X, Y, metric) + and not _in_unstable_openblas_configuration()) + + def __init__( + self, + X, + Y, + DTYPE_t radius, + bint use_squared_distances=False, + chunk_size=None, + sort_results=False, + ): + super().__init__( + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), + radius=radius, + chunk_size=chunk_size, + sort_results=sort_results, + ) + # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair + cdef: + DenseDenseDatasetsPair datasets_pair = self.datasets_pair + self.X, self.Y = datasets_pair.X, datasets_pair.Y + self.X_sq_norms = _sqeuclidean_row_norms(self.X, self.effective_omp_n_thread) + self.Y_sq_norms = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) + self.use_squared_distances = use_squared_distances + + if use_squared_distances: + # In this specialisation and this setup, the value passed to the radius is + # already considered to be the adapted radius, so we overwrite it. 
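+            # Editor's note: the parent __init__ above already set
+            # r_radius = radius ** 2 via `_dist_to_rdist`; with `fast_sqeuclidean`
+            # the caller is expected to pass an already-squared radius, which is
+            # why it is taken as-is here (my reading of this specialisation).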
+ self.r_radius = radius + + # Temporary datastructures used in threads + self.dist_middle_terms_chunks = malloc( + sizeof(DTYPE_t *) * self.effective_omp_n_thread + ) + + def __dealloc__(self): + if self.dist_middle_terms_chunks is not NULL: + free(self.dist_middle_terms_chunks) + + @final + cdef void compute_exact_distances(self) nogil: + if not self.use_squared_distances: + PairwiseDistancesRadiusNeighborhood.compute_exact_distances(self) + + @final + cdef void _on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + PairwiseDistancesRadiusNeighborhood._on_X_parallel_init(self, thread_num) + + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) + + @final + cdef void _on_X_parallel_finalize( + self, + ITYPE_t thread_num + ) nogil: + PairwiseDistancesRadiusNeighborhood._on_X_parallel_finalize(self, thread_num) + free(self.dist_middle_terms_chunks[thread_num]) + + @final + cdef void _on_Y_init( + self, + ITYPE_t num_threads, + ) nogil: + cdef ITYPE_t thread_num + PairwiseDistancesRadiusNeighborhood._on_Y_init(self, num_threads) + + for thread_num in range(num_threads): + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) + + @final + cdef void _on_Y_finalize( + self, + ITYPE_t num_threads, + ) nogil: + cdef ITYPE_t thread_num + PairwiseDistancesRadiusNeighborhood._on_Y_finalize(self, num_threads) + + for thread_num in range(num_threads): + free(self.dist_middle_terms_chunks[thread_num]) + + @final + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + DTYPE_t squared_dist_i_j + + const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] + const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] + + # We compute the full pairwise squared distances matrix as follows + # + # ||X_c - Y_c||² = ||X_c||² - 2 X_c.Y_c^T + ||Y_c||², + # + # The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. + # + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + ITYPE_t m = X_c.shape[0] + ITYPE_t n = Y_c.shape[0] + ITYPE_t K = X_c.shape[1] + DTYPE_t alpha = - 2. + # Casting for A and B to remove the const is needed because APIs exposed via + # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. + DTYPE_t * A = & X_c[0, 0] + ITYPE_t lda = X_c.shape[1] + DTYPE_t * B = & Y_c[0, 0] + ITYPE_t ldb = X_c.shape[1] + DTYPE_t beta = 0. + DTYPE_t * C = dist_middle_terms + ITYPE_t ldc = Y_c.shape[0] + + # dist_middle_terms = -2 * X_c.dot(Y_c.T) + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc) + + # Pushing the distance and their associated indices in vectors. 
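+        # Editor's note: a minimal NumPy sketch of the expansion used above on
+        # small toy chunks (illustrative only, not part of this file):
+        #
+        #     import numpy as np
+        #     rng = np.random.RandomState(0)
+        #     X_c, Y_c = rng.rand(4, 3), rng.rand(5, 3)
+        #     sq_dists = (
+        #         (X_c ** 2).sum(axis=1)[:, None]      # ||X_c||² row norms
+        #         - 2.0 * X_c @ Y_c.T                  # the GEMM middle term
+        #         + (Y_c ** 2).sum(axis=1)[None, :]    # ||Y_c||² row norms
+        #     )
+        #     ref = ((X_c[:, None, :] - Y_c[None, :, :]) ** 2).sum(axis=-1)
+        #     assert np.allclose(sq_dists, ref)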
+ for i in range(X_c.shape[0]): + for j in range(Y_c.shape[0]): + # Using the squared euclidean distance as the ranking-preserving distance: + # |X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + squared_dist_i_j = ( + self.X_sq_norms[i + X_start] + + dist_middle_terms[i * Y_c.shape[0] + j] + + self.Y_sq_norms[j + Y_start] + ) + if squared_dist_i_j <= self.r_radius: + deref(self.neigh_distances_chunks[thread_num])[i + X_start].push_back(squared_dist_i_j) + deref(self.neigh_indices_chunks[thread_num])[i + X_start].push_back(j + Y_start) + + +cdef class Kernel(PairwiseDistancesReduction): + + cdef: + DTYPE_t[:, ::1] K + + @classmethod + def get_for( + cls, + X, + Y, + str kernel="rbf", + chunk_size=None, + dict kernel_kwargs=None, + n_threads=None, + ) -> PairwiseDistancesArgKmin: + """Return the Kernel implementation for the given arguments. + + Parameters + ---------- + kernel : str, default='rbf' + The kernel to use. + + chunk_size : int, default=None, + The number of vectors per chunk. If None (default) looks-up in + scikit-learn configuration for `pairwise_dist_chunk_size`, + and use 256 if it is not set. + + kernel_kwargs : dict, default=None + Keyword arguments to pass to specified kernel. + + n_threads : int, default=None + The number of OpenMP threads to use for the reduction. + Parallelism is done on chunks and the sharding of chunks + depends on the `strategy` set on + :method:`~Kernel.compute`. + + None and -1 means using all processors. + + Returns + ------- + argkmin: PairwiseDistancesArgKmin + The suited PairwiseDistancesArgKmin implementation. + """ + # This factory comes to handle specialisations. + if kernel == "rbf": + return RBFKernel(X, Y, chunk_size=chunk_size) + else: + raise ValueError(f"Unsupported kernel: {kernel}") + + def __init__( + self, + DatasetsPair datasets_pair, + chunk_size=None, + n_threads=None, + ): + super().__init__(datasets_pair, chunk_size, n_threads) + + # The Gram matrix: K[i,j] = K(x_i, y_j) + self.K = np.empty((self.n_X, self.n_Y), dtype=DTYPE) + + def compute( + self, + str strategy=None, + ): + """Computes the kernel between vectors of X and Y. + + Parameters + ---------- + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None + The chunking strategy defining which dataset parallelization are made on. + + Strategies differs on the dispatching they use for chunks on threads: + - 'parallel_on_X' dispatches chunks of X uniformly on threads. + Each thread then iterates on all the chunks of Y. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. + Each thread then iterates on all the chunks of X. This strategy is + embarrassingly parallel and comes with no datastructures synchronisation. + - 'auto' relies on a simple heuristic to choose between + 'parallel_on_X' and 'parallel_on_Y'. + - None (default) looks-up in scikit-learn configuration for + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. + + Returns + ------- + K : ndarray of shape (n_X, n_Y) + A kernel matrix K such that K_{i, j} is the kernel between the + ith and jth vectors of the given matrix X and Y. + """ + + if strategy is None: + strategy = get_config().get("pairwise_dist_parallel_strategy", 'auto') + + if strategy == 'auto': + # This is a simple heuristic whose constant for the + # comparison has been chosen based on experiments. 
+ if 4 * self.chunk_size * self.effective_omp_n_thread < self.n_X: + strategy = 'parallel_on_X' + else: + strategy = 'parallel_on_Y' + + # Limit the number of threads in second level of nested parallelism for BLAS + # to avoid threads over-subscription (in GEMM for instance). + with threadpool_limits(limits=1, user_api="blas"): + if strategy == 'parallel_on_Y': + self._parallel_on_Y() + elif strategy == 'parallel_on_X': + self._parallel_on_X() + else: + raise RuntimeError(f"strategy '{strategy}' not supported.") + + return self._finalize_results() + +cdef class RBFKernel(Kernel): + + cdef: + const DTYPE_t[:, ::1] X + const DTYPE_t[:, ::1] Y + const DTYPE_t[::1] X_sq_norms + const DTYPE_t[::1] Y_sq_norms + + # Buffers for GEMM + DTYPE_t ** dist_middle_terms_chunks + bint use_squared_distances + + DTYPE_t gamma + + def __init__( + self, + X, + Y, + gamma=None, + chunk_size=None, + n_threads=None, + ): + super().__init__( + # The datasets pair here is used for exact distances computations + datasets_pair=DatasetsPair.get_for(X, Y, metric="euclidean"), + chunk_size=chunk_size, + n_threads=n_threads + ) + # X and Y are checked by the DatasetsPair implemented as a DenseDenseDatasetsPair + cdef: + DenseDenseDatasetsPair datasets_pair = self.datasets_pair + self.X, self.Y = datasets_pair.X, datasets_pair.Y + self.X_sq_norms = _sqeuclidean_row_norms(self.X, self.effective_omp_n_thread) + self.Y_sq_norms = _sqeuclidean_row_norms(self.Y, self.effective_omp_n_thread) + + # Temporary datastructures used in threads + self.dist_middle_terms_chunks = malloc( + sizeof(DTYPE_t *) * self.effective_omp_n_thread + ) + + self.gamma = 1.0 / X.shape[1] if gamma is None else gamma + + + def __dealloc__(self): + if self.dist_middle_terms_chunks is not NULL: + free(self.dist_middle_terms_chunks) + + @classmethod + def is_usable_for(cls, X, Y, metric) -> bool: + return (super().is_usable_for(X, Y, metric) + and not _in_unstable_openblas_configuration()) + + @final + cdef void _on_X_parallel_init( + self, + ITYPE_t thread_num, + ) nogil: + Kernel._on_X_parallel_init(self, thread_num) + + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) + + @final + cdef void _on_X_parallel_finalize( + self, + ITYPE_t thread_num + ) nogil: + Kernel._on_X_parallel_finalize(self, thread_num) + free(self.dist_middle_terms_chunks[thread_num]) + + @final + cdef void _on_Y_init( + self, + ITYPE_t num_threads, + ) nogil: + cdef ITYPE_t thread_num + Kernel._on_Y_init(self, num_threads) + + for thread_num in range(num_threads): + # Temporary buffer for the -2 * X_c.dot(Y_c.T) term + self.dist_middle_terms_chunks[thread_num] = malloc( + self.Y_n_samples_chunk * self.X_n_samples_chunk * sizeof(DTYPE_t) + ) + + @final + cdef void _on_Y_finalize( + self, + ITYPE_t num_threads, + ) nogil: + cdef ITYPE_t thread_num + Kernel._on_Y_finalize(self, num_threads) + + for thread_num in range(num_threads): + free(self.dist_middle_terms_chunks[thread_num]) + + @final + cdef void _compute_and_reduce_distances_on_chunks( + self, + ITYPE_t X_start, + ITYPE_t X_end, + ITYPE_t Y_start, + ITYPE_t Y_end, + ITYPE_t thread_num, + ) nogil: + cdef: + ITYPE_t i, j + DTYPE_t squared_dist_i_j + + const DTYPE_t[:, ::1] X_c = self.X[X_start:X_end, :] + const DTYPE_t[:, ::1] Y_c = self.Y[Y_start:Y_end, :] + DTYPE_t *dist_middle_terms = self.dist_middle_terms_chunks[thread_num] + + # We compute the full pairwise squared distances matrix as follows + 
# + # exp(- gamma ||X_c - Y_c||²) = exp(- gamma( ||X_c||² - 2 X_c.Y_c^T + ||Y_c||²) ) + # + # The middle term gets computed efficiently bellow using BLAS Level 3 GEMM. + # + # Careful: LDA, LDB and LDC are given for F-ordered arrays + # in BLAS documentations, for instance: + # https://www.netlib.org/lapack/explore-html/db/dc9/group__single__blas__level3_gafe51bacb54592ff5de056acabd83c260.html #noqa + # + # Here, we use their counterpart values to work with C-ordered arrays. + BLAS_Order order = RowMajor + BLAS_Trans ta = NoTrans + BLAS_Trans tb = Trans + ITYPE_t m = X_c.shape[0] + ITYPE_t n = Y_c.shape[0] + ITYPE_t K = X_c.shape[1] + DTYPE_t alpha = - 2. + # Casting for A and B to remove the const is needed because APIs exposed via + # scipy.linalg.cython_blas aren't reflecting the arguments' const qualifier. + DTYPE_t * A = & X_c[0, 0] + ITYPE_t lda = X_c.shape[1] + DTYPE_t * B = & Y_c[0, 0] + ITYPE_t ldb = X_c.shape[1] + DTYPE_t beta = 0. + DTYPE_t * C = dist_middle_terms + ITYPE_t ldc = Y_c.shape[0] + + # dist_middle_terms = -2 * X_c.dot(Y_c.T) + _gemm(order, ta, tb, m, n, K, alpha, A, lda, B, ldb, beta, C, ldc) + + # Pushing the distance and their associated indices in vectors. + for i in range(X_c.shape[0]): + for j in range(Y_c.shape[0]): + # Using the squared euclidean distance as the ranking-preserving distance: + # |X_c_i||² - 2 X_c_i.Y_c_j^T + ||Y_c_j||² + squared_dist_i_j = ( + self.X_sq_norms[i + X_start] + + dist_middle_terms[i * Y_c.shape[0] + j] + + self.Y_sq_norms[j + Y_start] + ) + self.K[i + X_start, j + Y_start] = - self.gamma * squared_dist_i_j + + + def _finalize_results(self): + return np.exp(self.K) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index d493ad68603ea..7a60bbb0b4ef1 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -31,6 +31,7 @@ from ..utils.fixes import delayed from ..utils.fixes import sp_version, parse_version +from ._pairwise_distances_reduction import PairwiseDistancesArgKmin from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan from ..exceptions import DataConversionWarning @@ -574,6 +575,10 @@ def _argmin_min_reduce(dist, start): return indices, values +def _argmin_reduce(dist, start): + return dist.argmin(axis=1) + + def pairwise_distances_argmin_min( X, Y, *, axis=1, metric="euclidean", metric_kwargs=None ): @@ -646,19 +651,33 @@ def pairwise_distances_argmin_min( """ X, Y = check_pairwise_arrays(X, Y) - if metric_kwargs is None: - metric_kwargs = {} - if axis == 0: X, Y = Y, X - indices, values = zip( - *pairwise_distances_chunked( - X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs + if metric_kwargs is None: + metric_kwargs = {} + + if PairwiseDistancesArgKmin.is_usable_for(X, Y, metric): + values, indices = PairwiseDistancesArgKmin.get_for( + X=X, Y=Y, k=1, metric=metric, metric_kwargs=metric_kwargs + ).compute(strategy="auto", return_distance=True) + values = values.flatten() + indices = indices.flatten() + else: + # TODO: once ArgKmin supports sparse input matrices and 32 bit, + # we won't need to fallback to pairwise_distances_chunked anymore. + # When PairwiseDistancesArgKmin is not supported and when the user + # asked for a fast alternative, we need to revert to the standard one. 
+ if metric == "fast_euclidean": + metric = "euclidean" + + indices, values = zip( + *pairwise_distances_chunked( + X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs + ) ) - ) - indices = np.concatenate(indices) - values = np.concatenate(values) + indices = np.concatenate(indices) + values = np.concatenate(values) return indices, values @@ -730,9 +749,38 @@ def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs if metric_kwargs is None: metric_kwargs = {} - return pairwise_distances_argmin_min( - X, Y, axis=axis, metric=metric, metric_kwargs=metric_kwargs - )[0] + X, Y = check_pairwise_arrays(X, Y) + + if axis == 0: + X, Y = Y, X + + if metric_kwargs is None: + metric_kwargs = {} + + if PairwiseDistancesArgKmin.is_usable_for(X, Y, metric): + indices = PairwiseDistancesArgKmin.get_for( + X=X, Y=Y, k=1, metric=metric, metric_kwargs=metric_kwargs + ).compute(strategy="auto", return_distance=False) + indices = indices.flatten() + else: + # TODO: once ArgKmin supports sparse input matrices and 32 bit, + # we won't need to fallback to pairwise_distances_chunked anymore. + # When PairwiseDistancesArgKmin is not supported and when the user + # asked for a fast alternative, we need to revert to the standard one. + if metric == "fast_euclidean": + metric = "euclidean" + + indices = np.concatenate( + list( + # This returns a np.ndarray generator whose arrays we need + # to flatten into one. + pairwise_distances_chunked( + X, Y, reduce_func=_argmin_reduce, metric=metric, **metric_kwargs + ) + ) + ) + + return indices def haversine_distances(X, Y=None): @@ -780,7 +828,7 @@ def haversine_distances(X, Y=None): array([[ 0. , 11099.54035582], [11099.54035582, 0. ]]) """ - from ..neighbors import DistanceMetric + from ..metrics import DistanceMetric return DistanceMetric.get_metric("haversine").pairwise(X, Y) diff --git a/sklearn/metrics/setup.py b/sklearn/metrics/setup.py index df1a1caad17e0..cd32817574dd3 100644 --- a/sklearn/metrics/setup.py +++ b/sklearn/metrics/setup.py @@ -1,4 +1,5 @@ import os +import numpy as np from numpy.distutils.misc_util import Configuration @@ -18,6 +19,20 @@ def configuration(parent_package="", top_path=None): "_pairwise_fast", sources=["_pairwise_fast.pyx"], libraries=libraries ) + config.add_extension( + "_pairwise_distances_reduction", + sources=["_pairwise_distances_reduction.pyx"], + language="c++", + libraries=libraries, + ) + + config.add_extension( + "_dist_metrics", + sources=["_dist_metrics.pyx"], + include_dirs=[np.get_include(), os.path.join(np.get_include(), "numpy")], + libraries=libraries, + ) + config.add_subpackage("tests") return config diff --git a/sklearn/neighbors/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py similarity index 93% rename from sklearn/neighbors/tests/test_dist_metrics.py rename to sklearn/metrics/tests/test_dist_metrics.py index 08298f087c216..9f0750fd75669 100644 --- a/sklearn/neighbors/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -7,8 +7,8 @@ import pytest from scipy.spatial.distance import cdist -from sklearn.neighbors import DistanceMetric -from sklearn.neighbors import BallTree +from sklearn.metrics import DistanceMetric +from sklearn.metrics._dist_metrics import BOOL_METRICS from sklearn.utils import check_random_state from sklearn.utils._testing import create_memmap_backed_data from sklearn.utils.fixes import sp_version, parse_version @@ -37,16 +37,6 @@ def dist_func(x1, x2, p): V = rng.random_sample((d, d)) VI = np.dot(V, V.T) 
-BOOL_METRICS = [ - "matching", - "jaccard", - "dice", - "kulsinski", - "rogerstanimoto", - "russellrao", - "sokalmichener", - "sokalsneath", -] METRICS_DEFAULT_PARAMS = { "euclidean": {}, @@ -62,6 +52,16 @@ def dist_func(x1, x2, p): } +# TODO: remove this test in 1.2 +def test_neighbors_distance_metric_deprecation(): + from sklearn.neighbors import DistanceMetric as DeprecatedDistanceMetric + + with pytest.warns( + FutureWarning, match="sklearn.neighbors.DistanceMetric has been moved" + ): + DeprecatedDistanceMetric.get_metric("euclidean") + + @pytest.mark.parametrize("metric", METRICS_DEFAULT_PARAMS) @pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)]) def test_cdist(metric, X1, X2): @@ -230,16 +230,6 @@ def test_pyfunc_metric(): assert_array_almost_equal(D1_pkl, D2_pkl) -def test_bad_pyfunc_metric(): - def wrong_distance(x, y): - return "1" - - X = np.ones((5, 2)) - msg = "Custom distance function must accept two vectors" - with pytest.raises(TypeError, match=msg): - BallTree(X, metric=wrong_distance) - - def test_input_data_size(): # Regression test for #6288 # Previously, a metric requiring a particular input dimension would fail diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index b7e90e63f2af1..90b8db305b83b 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -440,7 +440,6 @@ def test_pairwise_distances_argmin_min(): expected_idx = [0, 1] expected_vals = [2, 2] - expected_vals_sq = [4, 4] # euclidean metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean") @@ -458,10 +457,12 @@ def test_pairwise_distances_argmin_min(): # euclidean metric squared idx, vals = pairwise_distances_argmin_min( - X, Y, metric="euclidean", metric_kwargs={"squared": True} + X, + Y, + metric="fast_euclidean", ) assert_array_almost_equal(idx, expected_idx) - assert_array_almost_equal(vals, expected_vals_sq) + assert_array_almost_equal(vals, expected_vals) # Non-euclidean scikit-learn metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan") @@ -1464,3 +1465,34 @@ def test_numeric_pairwise_distances_datatypes(metric, dtype, y_is_x): # and fails due to rounding errors rtol = 1e-5 if dtype is np.float32 else 1e-7 assert_allclose(dist, expected_dist, rtol=rtol) + + +@pytest.mark.parametrize("X_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) +@pytest.mark.parametrize("Y_translation", [10 ** i for i in [2, 3, 4, 5, 6, 7]]) +@pytest.mark.parametrize("sign", [1, -1]) +def test_fast_euclidean_correctness( + X_translation, Y_translation, sign, n_samples=10000, n_features=10 +): + # This is the only failing test case, so we prefer xfailing. 
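+    # Editor's note (rough reasoning): with translations around 1e6-1e7 and a
+    # spread of 100, the row norms ||x||² and ||y||² grow to roughly 1e13-1e15
+    # while the actual squared distances stay near spread² ~ 1e4, so the
+    # ||x||² - 2 x·y + ||y||² expansion subtracts nearly equal float64 terms and
+    # can lose enough precision to exceed the rtol=1e-5 tolerance below; hence
+    # the xfail for these two extreme cases.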
+ numerical_edge_cases = {(1e6, 1e6, 1), (1e7, 1e7, 1)} + if (X_translation, Y_translation, sign) in numerical_edge_cases: + pytest.xfail( + "Numerical edge-case: (X_translation, Y_translation," + f" sign)={(X_translation, Y_translation, sign)}" + ) + + # The fast squared euclidean strategy must return results + # that are close to the ones obtained with the euclidean distance + rng = np.random.RandomState(1) + + spread = 100 + X = X_translation + rng.rand(n_samples, n_features) * spread + Y = (Y_translation + rng.rand(n_samples, n_features) * spread) * sign + + argmins, distances = pairwise_distances_argmin_min(X, Y, metric="euclidean") + fsq_argmins, fsq_distances = pairwise_distances_argmin_min( + X, Y, metric="fast_euclidean" + ) + + np.testing.assert_array_equal(argmins, fsq_argmins) + np.testing.assert_allclose(distances, fsq_distances, rtol=1e-5) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py new file mode 100644 index 0000000000000..e68f66fe5a40c --- /dev/null +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -0,0 +1,469 @@ +import numpy as np +import pytest +from numpy.testing import assert_array_equal, assert_allclose +from scipy.sparse import csr_matrix + +from sklearn.metrics._dist_metrics import ( + DenseDenseDatasetsPair, + DenseSparseDatasetsPair, + SparseDenseDatasetsPair, + SparseSparseDatasetsPair, +) + +from sklearn.metrics._pairwise_distances_reduction import ( + PairwiseDistancesReduction, + PairwiseDistancesArgKmin, + PairwiseDistancesRadiusNeighborhood, + FastEuclideanPairwiseDistancesArgKmin, + FastEuclideanPairwiseDistancesRadiusNeighborhood, + _sqeuclidean_row_norms, +) + +from sklearn.utils import _in_unstable_openblas_configuration +from sklearn.utils._testing import ( + fails_if_unstable_openblas, + get_dummy_metric_kwargs, +) + + +def assert_radius_neighborhood_results_equality(ref_dist, dist, ref_indices, indices): + # We get arrays of arrays and we need to check for individual pairs + for i in range(ref_dist.shape[0]): + assert_array_equal( + ref_indices[i], + indices[i], + err_msg=f"Query vector #{i} has different neighbors' indices", + ) + assert_allclose( + ref_dist[i], + dist[i], + err_msg=f"Query vector #{i} has different neighbors' distances", + rtol=1e-7, + ) + + +def assert_argkmin_results_equality(ref_dist, dist, ref_indices, indices): + assert_array_equal( + ref_indices, + indices, + err_msg="Query vectors have different neighbors' indices", + ) + assert_allclose( + ref_dist, + dist, + err_msg="Query vectors have different neighbors' distances", + rtol=1e-7, + ) + + +ASSERT_RESULT = { + PairwiseDistancesArgKmin: assert_argkmin_results_equality, + PairwiseDistancesRadiusNeighborhood: assert_radius_neighborhood_results_equality, +} + + +def test_pairwise_distances_reduction_is_usable_for(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + metric = "euclidean" + assert PairwiseDistancesReduction.is_usable_for(X, Y, metric) + assert not PairwiseDistancesReduction.is_usable_for( + X.astype(np.int64), Y.astype(np.int64), metric + ) + + assert not PairwiseDistancesReduction.is_usable_for(X[0], Y, metric) + assert not PairwiseDistancesReduction.is_usable_for(X, Y[0], metric) + + assert not PairwiseDistancesReduction.is_usable_for(X, Y, metric="pyfunc") + # TODO: remove once 32 bits datasets are supported + assert not PairwiseDistancesReduction.is_usable_for(X.astype(np.float32), Y, metric) + assert not 
PairwiseDistancesReduction.is_usable_for(X, Y.astype(np.int32), metric) + + # TODO: remove once sparse matrices are supported + assert not PairwiseDistancesReduction.is_usable_for(csr_matrix(X), Y, metric) + assert not PairwiseDistancesReduction.is_usable_for(X, csr_matrix(Y), metric) + + +def test_argkmin_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + k = 5 + metric = "euclidean" + + with pytest.raises( + ValueError, match="Only 64bit float datasets are supported for X and Y." + ): + PairwiseDistancesArgKmin.get_for( + X=X.astype(np.float32), Y=Y, k=k, metric=metric + ) + + with pytest.raises( + ValueError, match="Only 64bit float datasets are supported for X and Y." + ): + PairwiseDistancesArgKmin.get_for(X=X, Y=Y.astype(np.int32), k=k, metric=metric) + + with pytest.raises(ValueError, match="k == -1, must be >= 1."): + PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=-1, metric=metric) + + with pytest.raises(ValueError, match="k == 0, must be >= 1."): + PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=0, metric=metric) + + with pytest.raises(ValueError, match="Unrecognized metric"): + PairwiseDistancesArgKmin.get_for(X=X, Y=Y, k=k, metric="wrong metric") + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + PairwiseDistancesArgKmin.get_for( + X=np.array([1.0, 2.0]), Y=Y, k=k, metric=metric + ) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + PairwiseDistancesArgKmin.get_for( + X=np.asfortranarray(X), Y=Y, k=k, metric=metric + ) + + +def test_radius_neighborhood_factory_method_wrong_usages(): + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + radius = 5 + metric = "euclidean" + + with pytest.raises( + ValueError, match="Only 64bit float datasets are supported for X and Y." + ): + PairwiseDistancesRadiusNeighborhood.get_for( + X=X.astype(np.float32), Y=Y, radius=radius, metric=metric + ) + + with pytest.raises( + ValueError, match="Only 64bit float datasets are supported for X and Y." 
+ ): + PairwiseDistancesRadiusNeighborhood.get_for( + X=X, Y=Y.astype(np.int32), radius=radius, metric=metric + ) + + with pytest.raises(ValueError, match="radius == -1.0, must be >= 0."): + PairwiseDistancesRadiusNeighborhood.get_for(X=X, Y=Y, radius=-1, metric=metric) + + with pytest.raises(ValueError, match="Unrecognized metric"): + PairwiseDistancesRadiusNeighborhood.get_for( + X=X, Y=Y, radius=radius, metric="wrong metric" + ) + + with pytest.raises( + ValueError, match=r"Buffer has wrong number of dimensions \(expected 2, got 1\)" + ): + PairwiseDistancesRadiusNeighborhood.get_for( + X=np.array([1.0, 2.0]), Y=Y, radius=radius, metric=metric + ) + + with pytest.raises(ValueError, match="ndarray is not C-contiguous"): + PairwiseDistancesRadiusNeighborhood.get_for( + X=np.asfortranarray(X), Y=Y, radius=radius, metric=metric + ) + + +@fails_if_unstable_openblas +@pytest.mark.filterwarnings("ignore:Constructing a DIA matrix") +@pytest.mark.parametrize( + "PairwiseDistancesReduction, FastPairwiseDistancesReduction", + [ + (PairwiseDistancesArgKmin, FastEuclideanPairwiseDistancesArgKmin), + ( + PairwiseDistancesRadiusNeighborhood, + FastEuclideanPairwiseDistancesRadiusNeighborhood, + ), + ], +) +def test_pairwise_distances_reduction_factory_method( + PairwiseDistancesReduction, FastPairwiseDistancesReduction +): + # Test all the combinations of DatasetsPair for creation + rng = np.random.RandomState(1) + X = rng.rand(100, 10) + Y = rng.rand(100, 10) + metric = "euclidean" + + # Dummy value for k or radius + dummy_arg = 5 + + dense_dense_instance = PairwiseDistancesReduction.get_for(X, Y, dummy_arg, metric) + assert isinstance(dense_dense_instance.datasets_pair, DenseDenseDatasetsPair) + + sparse_sparse_instance = PairwiseDistancesReduction.get_for( + csr_matrix(X), csr_matrix(Y), dummy_arg, metric + ) + assert isinstance(sparse_sparse_instance.datasets_pair, SparseSparseDatasetsPair) + + dense_sparse_instance = PairwiseDistancesReduction.get_for( + X, csr_matrix(Y), dummy_arg, metric=metric + ) + assert isinstance(dense_sparse_instance.datasets_pair, DenseSparseDatasetsPair) + + sparse_dense_instance = PairwiseDistancesReduction.get_for( + csr_matrix(X), Y, dummy_arg, metric=metric + ) + assert isinstance(sparse_dense_instance.datasets_pair, SparseDenseDatasetsPair) + + # Test specialisations creation + fast_euclidean_instance = PairwiseDistancesReduction.get_for( + X, Y, dummy_arg, metric="fast_euclidean" + ) + assert isinstance(fast_euclidean_instance, PairwiseDistancesReduction) + assert isinstance(fast_euclidean_instance, FastPairwiseDistancesReduction) + + +@fails_if_unstable_openblas +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("chunk_size", [50, 512, 1024]) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], +) +def test_chunk_size_agnosticism( + PairwiseDistancesReduction, + seed, + n_samples, + chunk_size, + metric="fast_euclidean", + n_features=100, + dtype=np.float64, +): + # Results should not depend on the chunk size + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius with the dimensions + else 10 ** np.log(n_features) + ) + + ref_dist, ref_indices = PairwiseDistancesReduction.get_for( + X, Y, 
parameter, metric="euclidean" + ).compute(return_distance=True) + + dist, indices = PairwiseDistancesReduction.get_for( + X, Y, parameter, metric=metric, chunk_size=chunk_size + ).compute(return_distance=True) + + ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) + + +@fails_if_unstable_openblas +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("chunk_size", [50, 512, 1024]) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], +) +def test_n_threads_agnosticism( + PairwiseDistancesReduction, + seed, + n_samples, + chunk_size, + metric="fast_euclidean", + n_features=100, + dtype=np.float64, +): + # Results should not depend on the number of threads + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius with the dimensions + else 10 ** np.log(n_features) + ) + + ref_dist, ref_indices = PairwiseDistancesReduction.get_for( + X, Y, parameter, metric="euclidean" + ).compute(return_distance=True) + + dist, indices = PairwiseDistancesReduction.get_for( + X, Y, parameter, metric=metric, n_threads=1 + ).compute(return_distance=True) + + ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) + + +@pytest.mark.parametrize("seed", range(5)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) +@pytest.mark.parametrize( + "PairwiseDistancesReduction", + [PairwiseDistancesArgKmin, PairwiseDistancesRadiusNeighborhood], +) +def test_strategies_consistency( + PairwiseDistancesReduction, + metric, + n_samples, + seed, + n_features=10, + dtype=np.float64, +): + # Results obtained using both parallelization strategies must be identical + if _in_unstable_openblas_configuration() and metric == { + "fast_sqeuclidean", + "fast_euclidean", + }: + pytest.xfail( + "OpenBLAS (used for 'fast_(sq)euclidean') is unstable in this configuration" + ) + + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + # Haversine distance only accepts 2D data + if metric == "haversine": + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) + + parameter = ( + 10 + if PairwiseDistancesReduction is PairwiseDistancesArgKmin + # Scaling the radius with the dimensions + else 10 ** np.log(n_features) + ) + + pairwise_distances_reduction = PairwiseDistancesReduction.get_for( + X, + Y, + parameter, + metric=metric, + metric_kwargs=get_dummy_metric_kwargs(metric, n_features), + # To be sure to use parallelization + chunk_size=n_samples // 4, + ) + + dist_par_X, indices_par_X = pairwise_distances_reduction.compute( + strategy="parallel_on_X", return_distance=True + ) + + dist_par_Y, indices_par_Y = pairwise_distances_reduction.compute( + strategy="parallel_on_Y", return_distance=True + ) + + ASSERT_RESULT[PairwiseDistancesReduction]( + dist_par_X, dist_par_Y, indices_par_X, indices_par_Y + ) + + +@fails_if_unstable_openblas +@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) 
+@pytest.mark.parametrize("k, radius", [(50, 100)]) +def test_fast_sqeuclidean_correctness( + seed, + n_samples, + n_features, + k, + radius, + dtype=np.float64, +): + # The fast squared euclidean strategy must return results + # that are close to the ones obtained with the euclidean distance + if n_samples < k: + pytest.skip( + f"Skipping as n_samples (={n_samples}) < k (={k})", + allow_module_level=True, + ) + + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + eucl_dist, eucl_indices = PairwiseDistancesArgKmin.get_for( + X, Y, k, metric="euclidean" + ).compute(return_distance=True) + fse_dist, fse_indices = PairwiseDistancesArgKmin.get_for( + X, Y, k, metric="fast_euclidean" + ).compute(return_distance=True) + + assert_argkmin_results_equality(eucl_dist, fse_dist, eucl_indices, fse_indices) + + eucl_dist, eucl_indices = PairwiseDistancesRadiusNeighborhood.get_for( + X, Y, radius, metric="euclidean" + ).compute(return_distance=True) + fse_dist, fse_indices = PairwiseDistancesRadiusNeighborhood.get_for( + X, Y, radius, metric="fast_euclidean" + ).compute(return_distance=True) + + assert_radius_neighborhood_results_equality( + eucl_dist, fse_dist, eucl_indices, fse_indices + ) + + +@fails_if_unstable_openblas +@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("k", [1, 10, 100]) +@pytest.mark.parametrize("translation", [10 ** i for i in [4]]) +def test_fast_sqeuclidean_translation_invariance( + seed, + n_samples, + n_features, + k, + translation, + dtype=np.float64, +): + # The fast squared euclidean strategy should be translation invariant. 
+ if n_samples < k: + pytest.skip( + f"Skipping as n_samples (={n_samples}) < n_neighbors (={k})", + allow_module_level=True, + ) + + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + Y = rng.rand(n_samples, n_features).astype(dtype) * spread + + reference_dist, reference_indices = PairwiseDistancesArgKmin.get_for( + X, Y, k, metric="fast_sqeuclidean" + ).compute(return_distance=True) + + dist, indices = PairwiseDistancesArgKmin.get_for( + X + translation, Y + translation, k, metric="fast_sqeuclidean" + ).compute(return_distance=True) + + assert_argkmin_results_equality(reference_dist, dist, reference_indices, indices) + + +@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("num_threads", [1, 2, 8]) +def test_sqeuclidean_row_norms( + seed, + n_samples, + n_features, + num_threads, + dtype=np.float64, +): + rng = np.random.RandomState(seed) + spread = 100 + X = rng.rand(n_samples, n_features).astype(dtype) * spread + + sq_row_norm_reference = np.linalg.norm(X, axis=1) ** 2 + sq_row_norm = np.asarray(_sqeuclidean_row_norms(X, num_threads=num_threads)) + + assert_allclose(sq_row_norm_reference, sq_row_norm) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index ceb1df3420e38..779ed9e39f34a 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -399,7 +399,7 @@ class MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator): >>> X, y = make_multilabel_classification(n_classes=3, random_state=0) >>> clf = MultiOutputClassifier(KNeighborsClassifier()).fit(X, y) >>> clf.predict(X[-2:]) - array([[1, 1, 0], [1, 1, 1]]) + array([[1, 1, 1], [1, 1, 1]]) """ def __init__(self, estimator, *, n_jobs=None): diff --git a/sklearn/neighbors/__init__.py b/sklearn/neighbors/__init__.py index 8a0934eecf142..ff5ad4875d77d 100644 --- a/sklearn/neighbors/__init__.py +++ b/sklearn/neighbors/__init__.py @@ -4,8 +4,8 @@ """ from ._ball_tree import BallTree +from ._distance_metric import DistanceMetric from ._kd_tree import KDTree -from ._dist_metrics import DistanceMetric from ._graph import kneighbors_graph, radius_neighbors_graph from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer from ._unsupervised import NearestNeighbors diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 4e01cf2cd1076..e21a0ffb36a28 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -23,6 +23,10 @@ from ..base import is_classifier from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS +from ..metrics._pairwise_distances_reduction import ( + PairwiseDistancesArgKmin, + PairwiseDistancesRadiusNeighborhood, +) from ..utils import ( check_array, gen_even_slices, @@ -50,6 +54,8 @@ "correlation", "cosine", "dice", + "fast_euclidean", + "fast_sqeuclidean", "hamming", "jaccard", "kulsinski", @@ -361,6 +367,20 @@ def _check_algorithm_metric(self): else: alg_check = self.algorithm + if alg_check != "brute" and self.metric in ( + "fast_sqeuclidean", + "fast_euclidean", + ): + alternative = self.metric.replace("fast_", "") + warnings.warn( + f"'{self.metric}' is only available for algorithm='brute' but" + f" algorithm='{self.algorithm}' is used. 
Falling back on" + f" metric='{alternative}'.", + UserWarning, + stacklevel=3, + ) + self.metric = alternative + if callable(self.metric): if self.algorithm == "kd_tree": # callable metric is only valid for brute force and ball_tree @@ -397,7 +417,9 @@ def _check_algorithm_metric(self): def _fit(self, X, y=None): if self._get_tags()["requires_y"]: if not isinstance(X, (KDTree, BallTree, NeighborsBase)): - X, y = self._validate_data(X, y, accept_sparse="csr", multi_output=True) + X, y = self._validate_data( + X, y, accept_sparse="csr", multi_output=True, order="C" + ) if is_classifier(self): # Classification targets require a specific format @@ -432,7 +454,7 @@ def _fit(self, X, y=None): else: if not isinstance(X, (KDTree, BallTree, NeighborsBase)): - X = self._validate_data(X, accept_sparse="csr") + X = self._validate_data(X, accept_sparse="csr", order="C") self._check_algorithm_metric() if self.metric_params is None: @@ -499,6 +521,11 @@ def _fit(self, X, y=None): if issparse(X): if self.algorithm not in ("auto", "brute"): warnings.warn("cannot use tree with sparse input: using brute force") + + if self.metric in ("fast_sqeuclidean", "fast_euclidean"): + # The fast alternatives are only available for dense datasets. + self.effective_metric_ = self.effective_metric_.replace("fast_", "") + if self.effective_metric_ not in VALID_METRICS_SPARSE[ "brute" ] and not callable(self.effective_metric_): @@ -542,6 +569,8 @@ def _fit(self, X, y=None): else: self._fit_method = "brute" + specialised_metrics = {"euclidean", "sqeuclidean"} + if self._fit_method == "ball_tree": self._tree = BallTree( X, @@ -557,6 +586,13 @@ def _fit(self, X, y=None): **self.effective_metric_params_, ) elif self._fit_method == "brute": + if ( + self.effective_metric_ in specialised_metrics + and self.metric not in specialised_metrics + ): + # In that case, the standard stabler metric has not been explicitly + # specified by the user, so we prefer its fast alternative. + self.effective_metric_ = f"fast_{self.effective_metric_}" self._tree = None else: raise ValueError("algorithm = '%s' not recognized" % self.algorithm) @@ -633,10 +669,7 @@ def _kneighbors_reduce_func(self, dist, start, n_neighbors, return_distance): # argpartition doesn't guarantee sorted order, so we sort again neigh_ind = neigh_ind[sample_range, np.argsort(dist[sample_range, neigh_ind])] if return_distance: - if self.effective_metric_ == "euclidean": - result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind - else: - result = dist[sample_range, neigh_ind], neigh_ind + result = dist[sample_range, neigh_ind], neigh_ind else: result = neigh_ind return result @@ -706,10 +739,21 @@ class from an array representing our data set and ask who's % type(n_neighbors) ) + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and PairwiseDistancesArgKmin.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + if X is not None: query_is_train = False if self.metric == "precomputed": X = _check_precomputed(X) + elif use_pairwise_distances_reductions: + # We force the C-contiguity even if it creates a copy for F-ordered + # arrays because this implementation is more efficient. 
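+                # Editor's note: the Cython reductions declare their inputs as
+                # `const DTYPE_t[:, ::1]` (C-contiguous float64 memoryviews), so
+                # an F-ordered query array would be rejected; requesting
+                # order="C" here pays for a single copy up front instead.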
+ X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") else: X = self._validate_data(X, accept_sparse="csr", reset=False) else: @@ -728,24 +772,40 @@ class from an array representing our data set and ask who's n_jobs = effective_n_jobs(self.n_jobs) chunked_results = None - if self._fit_method == "brute" and self.metric == "precomputed" and issparse(X): + if use_pairwise_distances_reductions: + results = PairwiseDistancesArgKmin.get_for( + X=X, + Y=self._fit_X, + k=n_neighbors, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + n_threads=self.n_jobs, + ).compute( + strategy="auto", + return_distance=return_distance, + ) + + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): results = _kneighbors_from_graph( X, n_neighbors=n_neighbors, return_distance=return_distance ) elif self._fit_method == "brute": + # TODO: support sparse matrices + # When ArgKmin is not supported and when the user ask for a + # fast alternative, we need to revert to the standard. + if self.effective_metric_ in ("fast_sqeuclidean", "fast_euclidean"): + # The fast alternatives are only available for dense datasets. + self.effective_metric_ = self.effective_metric_.replace("fast_", "") + reduce_func = partial( self._kneighbors_reduce_func, n_neighbors=n_neighbors, return_distance=return_distance, ) - # for efficiency, use squared euclidean distances - if self.effective_metric_ == "euclidean": - kwds = {"squared": True} - else: - kwds = self.effective_metric_params_ - chunked_results = list( pairwise_distances_chunked( X, @@ -753,7 +813,7 @@ class from an array representing our data set and ask who's reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=n_jobs, - **kwds, + **self.effective_metric_params_, ) ) @@ -943,10 +1003,7 @@ def _radius_neighbors_reduce_func(self, dist, start, radius, return_distance): neigh_ind = [np.where(d <= radius)[0] for d in dist] if return_distance: - if self.effective_metric_ == "euclidean": - dist = [np.sqrt(d[neigh_ind[i]]) for i, d in enumerate(dist)] - else: - dist = [d[neigh_ind[i]] for i, d in enumerate(dist)] + dist = [d[neigh_ind[i]] for i, d in enumerate(dist)] results = dist, neigh_ind else: results = neigh_ind @@ -1030,10 +1087,21 @@ class from an array representing our data set and ask who's """ check_is_fitted(self) + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and PairwiseDistancesRadiusNeighborhood.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + if X is not None: query_is_train = False if self.metric == "precomputed": X = _check_precomputed(X) + elif use_pairwise_distances_reductions: + # We force the C-contiguity even if it creates a copy for F-ordered + # arrays because this implementation is more efficient. 
+ X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") else: X = self._validate_data(X, accept_sparse="csr", reset=False) else: @@ -1043,18 +1111,33 @@ class from an array representing our data set and ask who's if radius is None: radius = self.radius - if self._fit_method == "brute" and self.metric == "precomputed" and issparse(X): + if use_pairwise_distances_reductions: + results = PairwiseDistancesRadiusNeighborhood.get_for( + X=X, + Y=self._fit_X, + radius=radius, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + n_threads=self.n_jobs, + sort_results=sort_results, + ).compute( + strategy="auto", + return_distance=return_distance, + ) + + elif ( + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) + ): results = _radius_neighbors_from_graph( X, radius=radius, return_distance=return_distance ) elif self._fit_method == "brute": - # for efficiency, use squared euclidean distances - if self.effective_metric_ == "euclidean": - radius *= radius - kwds = {"squared": True} - else: - kwds = self.effective_metric_params_ + # When RadiusNeighborhood is not supported and when the user ask for a + # fast alternative, we need to revert to the standard. + if self.effective_metric_ in ("fast_sqeuclidean", "fast_euclidean"): + # The fast alternatives are only available for dense datasets. + self.effective_metric_ = self.effective_metric_.replace("fast_", "") reduce_func = partial( self._radius_neighbors_reduce_func, @@ -1068,7 +1151,7 @@ class from an array representing our data set and ask who's reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=self.n_jobs, - **kwds, + **self.effective_metric_params_, ) if return_distance: neigh_dist_chunks, neigh_ind_chunks = zip(*chunked_results) diff --git a/sklearn/neighbors/_binary_tree.pxi b/sklearn/neighbors/_binary_tree.pxi index 9f90414994550..32a907d1c6dea 100755 --- a/sklearn/neighbors/_binary_tree.pxi +++ b/sklearn/neighbors/_binary_tree.pxi @@ -142,7 +142,6 @@ # BinaryTree tree2, ITYPE_t i_node2): # """Compute the maximum distance between two nodes""" -cimport cython cimport numpy as np from libc.math cimport fabs, sqrt, exp, cos, pow, log, lgamma from libc.math cimport fmin, fmax @@ -151,16 +150,16 @@ from libc.string cimport memcpy import numpy as np import warnings -from ..utils import check_array - -from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t -from ._typedefs import DTYPE, ITYPE -from ._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, - euclidean_dist_to_rdist, euclidean_rdist_to_dist) +from ..metrics._dist_metrics cimport (DistanceMetric, euclidean_dist, euclidean_rdist, euclidean_dist_to_rdist) from ._partition_nodes cimport partition_node_indices +from ..utils import check_array +from ..utils._typedefs cimport DTYPE_t, ITYPE_t +from ..utils._typedefs import DTYPE, ITYPE +from ..utils._heap cimport simultaneous_sort as _simultaneous_sort, heap_push + cdef extern from "numpy/arrayobject.h": void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) @@ -226,7 +225,7 @@ leaf_size : positive int, default=40 the case that ``n_samples < leaf_size``. metric : str or DistanceMetric object - the distance metric to use for the tree. Default='minkowski' + The distance metric to use for the tree. Default='minkowski' with p=2 (that is, a euclidean metric). See the documentation of the DistanceMetric class for a list of available metrics. 
{binary_tree}.valid_metrics gives a list of the metrics which @@ -489,27 +488,6 @@ def kernel_norm(h, d, kernel, return_log=False): return np.exp(result) -###################################################################### -# Tree Utility Routines -cdef inline void swap(DITYPE_t* arr, ITYPE_t i1, ITYPE_t i2): - """swap the values at index i1 and i2 of arr""" - cdef DITYPE_t tmp = arr[i1] - arr[i1] = arr[i2] - arr[i2] = tmp - - -cdef inline void dual_swap(DTYPE_t* darr, ITYPE_t* iarr, - ITYPE_t i1, ITYPE_t i2) nogil: - """swap the values at inex i1 and i2 of both darr and iarr""" - cdef DTYPE_t dtmp = darr[i1] - darr[i1] = darr[i2] - darr[i2] = dtmp - - cdef ITYPE_t itmp = iarr[i1] - iarr[i1] = iarr[i2] - iarr[i2] = itmp - - cdef class NeighborsHeap: """A max-heap structure to keep track of distances/indices of neighbors @@ -564,52 +542,11 @@ cdef class NeighborsHeap: cdef int _push(self, ITYPE_t row, DTYPE_t val, ITYPE_t i_val) nogil except -1: """push (val, i_val) into the given row""" - cdef ITYPE_t i, ic1, ic2, i_swap - cdef ITYPE_t size = self.distances.shape[1] - cdef DTYPE_t* dist_arr = &self.distances[row, 0] - cdef ITYPE_t* ind_arr = &self.indices[row, 0] - - # check if val should be in heap - if val >= dist_arr[0]: - return 0 - - # insert val at position zero - dist_arr[0] = val - ind_arr[0] = i_val - - # descend the heap, swapping values until the max heap criterion is met - i = 0 - while True: - ic1 = 2 * i + 1 - ic2 = ic1 + 1 - - if ic1 >= size: - break - elif ic2 >= size: - if dist_arr[ic1] > val: - i_swap = ic1 - else: - break - elif dist_arr[ic1] >= dist_arr[ic2]: - if val < dist_arr[ic1]: - i_swap = ic1 - else: - break - else: - if val < dist_arr[ic2]: - i_swap = ic2 - else: - break - - dist_arr[i] = dist_arr[i_swap] - ind_arr[i] = ind_arr[i_swap] - - i = i_swap - - dist_arr[i] = val - ind_arr[i] = i_val - - return 0 + cdef: + ITYPE_t size = self.distances.shape[1] + DTYPE_t* dist_arr = &self.distances[row, 0] + ITYPE_t* ind_arr = &self.indices[row, 0] + return heap_push(dist_arr, ind_arr, size, val, i_val) cdef int _sort(self) except -1: """simultaneously sort the distances and indices""" @@ -622,68 +559,6 @@ cdef class NeighborsHeap: distances.shape[1]) return 0 - -cdef int _simultaneous_sort(DTYPE_t* dist, ITYPE_t* idx, - ITYPE_t size) nogil except -1: - """ - Perform a recursive quicksort on the dist array, simultaneously - performing the same swaps on the idx array. The equivalent in - numpy (though quite a bit slower) is - - def simultaneous_sort(dist, idx): - i = np.argsort(dist) - return dist[i], idx[i] - """ - cdef ITYPE_t pivot_idx, i, store_idx - cdef DTYPE_t pivot_val - - # in the small-array case, do things efficiently - if size <= 1: - pass - elif size == 2: - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - elif size == 3: - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - if dist[1] > dist[2]: - dual_swap(dist, idx, 1, 2) - if dist[0] > dist[1]: - dual_swap(dist, idx, 0, 1) - else: - # Determine the pivot using the median-of-three rule. - # The smallest of the three is moved to the beginning of the array, - # the middle (the pivot value) is moved to the end, and the largest - # is moved to the pivot index. - pivot_idx = size / 2 - if dist[0] > dist[size - 1]: - dual_swap(dist, idx, 0, size - 1) - if dist[size - 1] > dist[pivot_idx]: - dual_swap(dist, idx, size - 1, pivot_idx) - if dist[0] > dist[size - 1]: - dual_swap(dist, idx, 0, size - 1) - pivot_val = dist[size - 1] - - # partition indices about pivot. 
At the end of this operation, - # pivot_idx will contain the pivot value, everything to the left - # will be smaller, and everything to the right will be larger. - store_idx = 0 - for i in range(size - 1): - if dist[i] < pivot_val: - dual_swap(dist, idx, i, store_idx) - store_idx += 1 - dual_swap(dist, idx, store_idx, size - 1) - pivot_idx = store_idx - - # recursively sort each side of the pivot - if pivot_idx > 1: - _simultaneous_sort(dist, idx, pivot_idx) - if pivot_idx + 2 < size: - _simultaneous_sort(dist + pivot_idx + 1, - idx + pivot_idx + 1, - size - pivot_idx - 1) - return 0 - #------------------------------------------------------------ # find_node_split_dim: # this computes the equivalent of @@ -878,7 +753,7 @@ def newObj(obj): ###################################################################### # define the reverse mapping of VALID_METRICS -from ._dist_metrics import get_valid_metric_ids +from ..metrics._dist_metrics import get_valid_metric_ids VALID_METRIC_IDS = get_valid_metric_ids(VALID_METRICS) diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index d616eaa2f32a8..e744e52c40c59 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -11,7 +11,7 @@ import numpy as np from scipy import stats from ..utils.extmath import weighted_mode -from ..utils.validation import _is_arraylike, _num_samples +from ..utils.validation import _is_arraylike, _num_samples, check_is_fitted import warnings from ._base import _check_weights, _get_weights @@ -65,10 +65,10 @@ class KNeighborsClassifier(KNeighborsMixin, ClassifierMixin, NeighborsBase): (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. metric : str or callable, default='minkowski' - The distance metric to use for the tree. The default metric is + The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. @@ -211,7 +211,21 @@ def predict(self, X): y : ndarray of shape (n_queries,) or (n_queries, n_outputs) Class labels for each data sample. """ - neigh_dist, neigh_ind = self.kneighbors(X) + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) + + X = self._validate_data(X, accept_sparse="csr", reset=False) + + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. + self.effective_metric_ = "fast_sqeuclidean" + neigh_dist, neigh_ind = self.kneighbors(X) + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.kneighbors(X) + classes_ = self.classes_ _y = self._y if not self.outputs_2d_: @@ -253,7 +267,20 @@ def predict_proba(self, X): The class probabilities of the input samples. Classes are ordered by lexicographic order. 
""" - neigh_dist, neigh_ind = self.kneighbors(X) + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) + + X = self._validate_data(X, accept_sparse="csr", reset=False) + + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. + self.effective_metric_ = "fast_sqeuclidean" + neigh_dist, neigh_ind = self.kneighbors(X) + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.kneighbors(X) classes_ = self.classes_ _y = self._y @@ -344,8 +371,8 @@ class RadiusNeighborsClassifier(RadiusNeighborsMixin, ClassifierMixin, Neighbors metric : str or callable, default='minkowski' Distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. @@ -609,10 +636,24 @@ def predict_proba(self, X): The class probabilities of the input samples. Classes are ordered by lexicographic order. """ + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) n_queries = _num_samples(X) - neigh_dist, neigh_ind = self.radius_neighbors(X) + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. + original_radius = self.radius + self.effective_metric_ = "fast_sqeuclidean" + self.radius = original_radius * original_radius + neigh_dist, neigh_ind = self.radius_neighbors(X) + self.radius = original_radius + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.radius_neighbors(X) + outlier_mask = np.zeros(n_queries, dtype=bool) outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind] outliers = np.flatnonzero(outlier_mask) diff --git a/sklearn/neighbors/_distance_metric.py b/sklearn/neighbors/_distance_metric.py new file mode 100644 index 0000000000000..10d6e24139068 --- /dev/null +++ b/sklearn/neighbors/_distance_metric.py @@ -0,0 +1,20 @@ +# TODO: Remove this file in 1.2 +import warnings + +from ..metrics import DistanceMetric as _DistanceMetric + + +class DistanceMetric(_DistanceMetric): + @classmethod + def _warn(cls): + warnings.warn( + "sklearn.neighbors.DistanceMetric has been moved " + "to sklearn.metrics.DistanceMetric in 1.0. " + "This import path will be removed in 1.2", + category=FutureWarning, + ) + + @classmethod + def get_metric(cls, metric, **kwargs): + DistanceMetric._warn() + return _DistanceMetric.get_metric(metric, **kwargs) diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index e6fdeffe3b291..9afa37b71a808 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -65,10 +65,11 @@ def kneighbors_graph( between neighbors according to the given metric. metric : str, default='minkowski' - The distance metric used to calculate the k-Neighbors for each sample - point. 
The DistanceMetric class gives a list of available metrics. - The default distance is 'euclidean' ('minkowski' metric with the p - param equal to 2.) + The distance metric to use for the tree. The default metric is + minkowski, and with p=2 is equivalent to the standard Euclidean + metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. p : int, default=2 Power parameter for the Minkowski metric. When p = 1, this is @@ -157,10 +158,11 @@ def radius_neighbors_graph( between neighbors according to the given metric. metric : str, default='minkowski' - The distance metric used to calculate the neighbors within a - given radius for each sample point. The DistanceMetric class - gives a list of available metrics. The default distance is - 'euclidean' ('minkowski' metric with the param equal to 2.) + The distance metric to use for the tree. The default metric is + minkowski, and with p=2 is equivalent to the standard Euclidean + metric. + For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. p : int, default=2 Power parameter for the Minkowski metric. When p = 1, this is diff --git a/sklearn/neighbors/_partition_nodes.pxd b/sklearn/neighbors/_partition_nodes.pxd index 522e826632824..94b02002d7a1e 100644 --- a/sklearn/neighbors/_partition_nodes.pxd +++ b/sklearn/neighbors/_partition_nodes.pxd @@ -1,4 +1,4 @@ -from ._typedefs cimport DTYPE_t, ITYPE_t +from ..utils._typedefs cimport DTYPE_t, ITYPE_t cdef int partition_node_indices( DTYPE_t *data, diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 75ef124ad1711..5ea2db7ce4d21 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -18,6 +18,7 @@ from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import RegressorMixin from ..utils.deprecation import deprecated +from ..utils.validation import check_is_fitted class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): @@ -75,8 +76,8 @@ class KNeighborsRegressor(KNeighborsMixin, RegressorMixin, NeighborsBase): metric : str or callable, default='minkowski' The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. @@ -226,7 +227,20 @@ def predict(self, X): y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int Target values. """ - neigh_dist, neigh_ind = self.kneighbors(X) + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) + + X = self._validate_data(X, accept_sparse="csr", reset=False) + + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. 
+ self.effective_metric_ = "fast_sqeuclidean" + neigh_dist, neigh_ind = self.kneighbors(X) + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.kneighbors(X) weights = _get_weights(neigh_dist, self.weights) @@ -306,8 +320,8 @@ class RadiusNeighborsRegressor(RadiusNeighborsMixin, RegressorMixin, NeighborsBa metric : str or callable, default='minkowski' The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. @@ -434,7 +448,23 @@ def predict(self, X): dtype=double Target values. """ - neigh_dist, neigh_ind = self.radius_neighbors(X) + # Duplicated because of the check on self.effective_metric_'s value + # TODO: remove check_is_fitted duplication + check_is_fitted(self) + + X = self._validate_data(X, accept_sparse="csr", reset=False) + + if self.weights == "uniform" and self.effective_metric_ == "fast_euclidean": + # In that case, it is safe to use the fast alternative which + # does not use sqrt on distances as this can be costly. + original_radius = self.radius + self.effective_metric_ = "fast_sqeuclidean" + self.radius = original_radius * original_radius + neigh_dist, neigh_ind = self.radius_neighbors(X) + self.radius = original_radius + self.effective_metric_ = "fast_euclidean" + else: + neigh_dist, neigh_ind = self.radius_neighbors(X) weights = _get_weights(neigh_dist, self.weights) diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 6b6eec1a3112b..440ac41eb71d5 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -41,8 +41,8 @@ class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin, NeighborsBase): metric : str or callable, default='minkowski' The distance metric to use for the tree. The default metric is minkowski, and with p=2 is equivalent to the standard Euclidean - metric. See the documentation of :class:`DistanceMetric` for a - list of available metrics. + metric. For a list of available metrics, see the documentation of + :class:`~sklearn.metrics.DistanceMetric`. If metric is "precomputed", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only "nonzero" elements may be considered neighbors. 
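Note on the import-path change running through the estimator docstrings above: `DistanceMetric` now lives in `sklearn.metrics`, and the `sklearn/neighbors/_distance_metric.py` shim added by this patch only re-exports it with a `FutureWarning`. The following is an illustrative sketch, not part of the patch; it assumes the shim is what `sklearn.neighbors.DistanceMetric` resolves to, as the `__init__.py` change earlier in this diff indicates:

    import warnings
    import numpy as np
    from sklearn.metrics import DistanceMetric  # new canonical location as of 1.0

    X = np.array([[0.0, 0.0], [3.0, 4.0]])
    dist = DistanceMetric.get_metric("minkowski", p=2)
    print(dist.pairwise(X))  # [[0., 5.], [5., 0.]]

    # The old import path keeps working until 1.2 but warns when used:
    from sklearn.neighbors import DistanceMetric as OldDistanceMetric
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        OldDistanceMetric.get_metric("euclidean")
    assert any(issubclass(w.category, FutureWarning) for w in caught)

This is why the docstrings touched above now point at :class:`~sklearn.metrics.DistanceMetric` rather than the neighbors-local class.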
diff --git a/sklearn/neighbors/setup.py b/sklearn/neighbors/setup.py index 85305efc29c78..aa19ba501b18d 100644 --- a/sklearn/neighbors/setup.py +++ b/sklearn/neighbors/setup.py @@ -32,19 +32,6 @@ def configuration(parent_package="", top_path=None): libraries=libraries, ) - config.add_extension( - "_dist_metrics", - sources=["_dist_metrics.pyx"], - include_dirs=[numpy.get_include(), os.path.join(numpy.get_include(), "numpy")], - libraries=libraries, - ) - - config.add_extension( - "_typedefs", - sources=["_typedefs.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - ) config.add_extension( "_quad_tree", sources=["_quad_tree.pyx"], diff --git a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index c751539f2a1ae..a823a03251a1b 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -4,7 +4,6 @@ import pytest from numpy.testing import assert_array_almost_equal from sklearn.neighbors._ball_tree import BallTree -from sklearn.neighbors import DistanceMetric from sklearn.utils import check_random_state from sklearn.utils.validation import check_array from sklearn.utils._testing import _convert_container @@ -40,6 +39,8 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): + from sklearn.metrics import DistanceMetric + X, Y = check_array(X), check_array(Y) D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X) ind = np.argsort(D, axis=1)[:, :k] @@ -84,3 +85,13 @@ def test_array_object_type(): X = np.array([(1, 2, 3), (2, 5), (5, 5, 1, 2)], dtype=object) with pytest.raises(ValueError, match="setting an array element with a sequence"): BallTree(X) + + +def test_bad_pyfunc_metric(): + def wrong_distance(x, y): + return "1" + + X = np.ones((5, 2)) + msg = "Custom distance function must accept two vectors" + with pytest.raises(TypeError, match=msg): + BallTree(X, metric=wrong_distance) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index a9592ff9f2c51..6121ee6b1f2ee 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -3,6 +3,7 @@ import pytest import re import numpy as np +import scipy from scipy.sparse import ( bsr_matrix, coo_matrix, @@ -22,11 +23,21 @@ from sklearn.metrics.pairwise import pairwise_distances from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split -from sklearn.neighbors import VALID_METRICS_SPARSE, VALID_METRICS -from sklearn.neighbors._base import _is_sorted_by_data, _check_precomputed +from sklearn.neighbors import ( + VALID_METRICS_SPARSE, +) +from sklearn.neighbors._base import ( + _is_sorted_by_data, + _check_precomputed, + KNeighborsMixin, +) from sklearn.pipeline import make_pipeline -from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, + get_dummy_metric_kwargs, +) from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import check_random_state from sklearn.utils.fixes import sp_version, parse_version @@ -50,6 +61,9 @@ SPARSE_OR_DENSE = SPARSE_TYPES + (np.asarray,) ALGORITHMS = ("ball_tree", "brute", "kd_tree", "auto") +COMMON_VALID_METRICS = sorted( + set.intersection(*map(set, neighbors.VALID_METRICS.values())) +) P = (1, 2, 3, 4, np.inf) JOBLIB_BACKENDS = list(joblib.parallel.BACKENDS.keys()) @@ -70,42 +84,316 @@ 
def _weight_func(dist): return retval ** 2 +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("n_query_pts", [1, 10, 100]) +@pytest.mark.parametrize("n_neighbors", [1, 10, 100]) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) def test_unsupervised_kneighbors( - n_samples=20, n_features=5, n_query_pts=2, n_neighbors=5 + n_samples, + n_features, + n_query_pts, + n_neighbors, + metric, ): - # Test unsupervised neighbors methods - X = rng.rand(n_samples, n_features) + # The different algorithms must return identical results + # on their common metrics, with and without returning + # distances - test = rng.rand(n_query_pts, n_features) + # Redefining the rng locally to use the same generated X + local_rng = np.random.RandomState(0) + X = local_rng.rand(n_samples, n_features) - for p in P: - results_nodist = [] - results = [] + test = local_rng.rand(n_query_pts, n_features) - for algorithm in ALGORITHMS: - neigh = neighbors.NearestNeighbors( - n_neighbors=n_neighbors, algorithm=algorithm, p=p - ) - neigh.fit(X) + results_nodist = [] + results = [] - results_nodist.append(neigh.kneighbors(test, return_distance=False)) - results.append(neigh.kneighbors(test, return_distance=True)) + for algorithm in ALGORITHMS: + neigh = neighbors.NearestNeighbors( + n_neighbors=n_neighbors, algorithm=algorithm, metric=metric + ) + neigh.fit(X) + + results_nodist.append(neigh.kneighbors(test, return_distance=False)) + results.append(neigh.kneighbors(test, return_distance=True)) + + for i in range(len(results) - 1): + algorithm = ALGORITHMS[i] + next_algorithm = ALGORITHMS[i + 1] + + indices_no_dist = results_nodist[i] + distances, next_distances = results[i][0], results[i + 1][0] + indices, next_indices = results[i][1], results[i + 1][1] + assert_allclose( + indices_no_dist, + indices, + err_msg=( + f"The '{algorithm}' algorithm returns different" + "indices depending on 'return_distances'." + ), + ) + assert_allclose( + indices, + next_indices, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different indices." + ), + ) + assert_allclose( + distances, + next_distances, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different distances." + ), + ) - for i in range(len(results) - 1): - assert_array_almost_equal(results_nodist[i], results[i][1]) - assert_array_almost_equal(results[i][0], results[i + 1][0]) - assert_array_almost_equal(results[i][1], results[i + 1][1]) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("n_query_pts", [1, 10, 100]) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +@pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) +@pytest.mark.parametrize( + "NeighborsMixinSubclass", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, + ], +) +def test_neigh_predictions_algorithm_agnosticity( + n_samples, + n_features, + n_query_pts, + metric, + n_neighbors, + radius, + NeighborsMixinSubclass, +): + # The different algorithms must return identical predictions results + # on their common metrics. 
+ + # Redefining the rng locally to use the same generated X + local_rng = np.random.RandomState(0) + X = local_rng.rand(n_samples, n_features) + y = local_rng.randint(3, size=n_samples) + + query = local_rng.rand(n_query_pts, n_features) + + predict_results = [] + + parameter = ( + n_neighbors if issubclass(NeighborsMixinSubclass, KNeighborsMixin) else radius + ) + + for algorithm in ALGORITHMS: + neigh = NeighborsMixinSubclass(parameter, algorithm=algorithm, metric=metric) + neigh.fit(X, y) + + predict_results.append(neigh.predict(query)) + + for i in range(len(predict_results) - 1): + algorithm = ALGORITHMS[i] + next_algorithm = ALGORITHMS[i + 1] + + predictions, next_predictions = predict_results[i], predict_results[i + 1] + + assert_allclose( + predictions, + next_predictions, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different predictions." + ), + ) + +@pytest.mark.parametrize("seed", range(10)) +@pytest.mark.parametrize("n_samples", [10 ** i for i in [2, 3]]) +@pytest.mark.parametrize("n_features", [5, 10, 100]) +@pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) @pytest.mark.parametrize( - "NearestNeighbors", + "NeighborsMixinSubclass", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + neighbors.RadiusNeighborsClassifier, + neighbors.RadiusNeighborsRegressor, + ], +) +def test_neighs_predictions_fast_euclidean_correctness( + seed, + n_samples, + n_features, + n_neighbors, + radius, + NeighborsMixinSubclass, + dtype=np.float64, +): + # The fast euclidean strategy must return results + # that are close to the ones obtained with the euclidean distance + if n_samples < n_neighbors: + pytest.skip( + f"Skipping as n_samples (={n_samples}) < n_neighbors (={n_neighbors})", + allow_module_level=True, + ) + + rng = np.random.RandomState(seed) + X = rng.rand(n_samples, n_features).astype(dtype) + y = rng.randint(3, size=n_samples) + + parameter = ( + n_neighbors if issubclass(NeighborsMixinSubclass, KNeighborsMixin) else radius + ) + + euclidean_est = NeighborsMixinSubclass( + parameter, algorithm="brute", metric="euclidean" + ).fit(X, y) + euclidean_pred = euclidean_est.predict(X) + + fast_euclidean_clf = NeighborsMixinSubclass( + parameter, algorithm="brute", metric="fast_euclidean" + ).fit(X, y) + fast_euclidean_pred = fast_euclidean_clf.predict(X) + + assert_allclose(euclidean_pred, fast_euclidean_pred) + + +@pytest.mark.parametrize( + "KNeighborsEstimator", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + ], +) +@pytest.mark.parametrize( + "weights, expected_kneighbors_metric", + [ + ("uniform", "fast_sqeuclidean"), + ("distance", "fast_euclidean"), + (lambda x: x, "fast_euclidean"), + ], +) +def test_knn_prediction_fast_euclidean_overriding( + KNeighborsEstimator, + weights, + expected_kneighbors_metric, + n_samples=1000, + n_features=100, + dtype=np.float64, +): + # The fast squared euclidean metric must be used over the fast euclidean + # metric solely when using the uniform sample-weighting. 
+ class MockedKNeighborsEstimator(KNeighborsEstimator): + def kneighbors(self, *args, **kwargs): + self.kneighbors_metric_ = self.effective_metric_ + return super().kneighbors(*args, **kwargs) + + rng = np.random.RandomState(0) + X = rng.rand(n_samples, n_features).astype(dtype) + y = rng.randint(3, size=n_samples) + + parameter = 10 + + fast_euclidean_est = MockedKNeighborsEstimator( + parameter, + algorithm="brute", + metric="fast_euclidean", + weights=weights, + ).fit(X, y) + + # effective_metric_ must not be changed + assert fast_euclidean_est.effective_metric_ == "fast_euclidean" + fast_euclidean_est.predict(X) + assert fast_euclidean_est.kneighbors_metric_ == expected_kneighbors_metric + assert fast_euclidean_est.effective_metric_ == "fast_euclidean" + + +@pytest.mark.parametrize( + "KNeighborsEstimator", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + ], +) +@pytest.mark.parametrize("algorithm", ["kd_tree", "ball_tree"]) +def test_knn_prediction_fast_alternatives_fall_back_on_tree( + KNeighborsEstimator, + algorithm, + specified_metric="fast_euclidean", + fall_back_metric="euclidean", + parameter=10, + n_samples=1000, + n_features=100, + dtype=np.float64, +): + # The fast euclidean metric can't be used on "kd_tree", "ball_tree". + rng = np.random.RandomState(0) + X = rng.rand(n_samples, n_features).astype(dtype) + y = rng.randint(3, size=n_samples) + + est = KNeighborsEstimator( + parameter, + algorithm=algorithm, + metric=specified_metric, + ) + with pytest.warns( + UserWarning, + match=( + f"'{specified_metric}' is only available for algorithm='brute' but " + f"algorithm='{algorithm}' is used. Falling " + f"back on metric='{fall_back_metric}'." + ), + ): + est.fit(X, y) + + assert est.metric == fall_back_metric + assert est.effective_metric_ == fall_back_metric + + +@pytest.mark.parametrize( + "KNeighborsEstimator", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + ], +) +def test_knn_prediction_fast_alternatives_fall_back_on_sparse( + KNeighborsEstimator, + specified_metric="fast_euclidean", + fall_back_metric="euclidean", + parameter=10, + n_samples=1000, + n_features=100, + dtype=np.float64, +): + # The fast euclidean metric can't be used on sparse datasets. 
+ rng = np.random.RandomState(0) + X = scipy.sparse.random(n_samples, n_features, density=0.25, random_state=rng) + y = rng.randint(3, size=n_samples) + + est = KNeighborsEstimator( + parameter, + algorithm="brute", + metric=specified_metric, + ) + est.fit(X, y) + assert est.effective_metric_ == fall_back_metric + + +@pytest.mark.parametrize( + "KNeighborsMixinSubclass", [ neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.NearestNeighbors, ], ) -def test_unsupervised_inputs(NearestNeighbors): +def test_unsupervised_inputs(KNeighborsMixinSubclass): # Test unsupervised inputs for neighbors estimators X = rng.random_sample((10, 3)) @@ -115,7 +403,7 @@ def test_unsupervised_inputs(NearestNeighbors): dist1, ind1 = nbrs_fid.kneighbors(X) - nbrs = NearestNeighbors(n_neighbors=1) + nbrs = KNeighborsMixinSubclass(n_neighbors=1) for data in (nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X)): nbrs.fit(data, y) @@ -1168,19 +1456,19 @@ def test_kneighbors_graph(): assert_array_almost_equal(A.toarray(), [[1, 1, 1], [1, 1, 1], [1, 1, 1]]) -def test_kneighbors_graph_sparse(seed=36): +@pytest.mark.parametrize("n_neighbors", [1, 2, 3]) +@pytest.mark.parametrize("mode", ["connectivity", "distance"]) +def test_kneighbors_graph_sparse(n_neighbors, mode, seed=36): # Test kneighbors_graph to build the k-Nearest Neighbor graph # for sparse input. rng = np.random.RandomState(seed) X = rng.randn(10, 10) Xcsr = csr_matrix(X) - for n_neighbors in [1, 2, 3]: - for mode in ["connectivity", "distance"]: - assert_array_almost_equal( - neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(), - neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), - ) + assert_array_almost_equal( + neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(), + neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), + ) def test_radius_neighbors_graph(): @@ -1196,21 +1484,19 @@ def test_radius_neighbors_graph(): ) -def test_radius_neighbors_graph_sparse(seed=36): +@pytest.mark.parametrize("n_neighbors", [1, 2, 3]) +@pytest.mark.parametrize("mode", ["connectivity", "distance"]) +def test_radius_neighbors_graph_sparse(n_neighbors, mode, seed=36): # Test radius_neighbors_graph to build the Nearest Neighbor graph # for sparse input. 
rng = np.random.RandomState(seed) X = rng.randn(10, 10) Xcsr = csr_matrix(X) - for n_neighbors in [1, 2, 3]: - for mode in ["connectivity", "distance"]: - assert_array_almost_equal( - neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(), - neighbors.radius_neighbors_graph( - Xcsr, n_neighbors, mode=mode - ).toarray(), - ) + assert_array_almost_equal( + neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(), + neighbors.radius_neighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), + ) def test_neighbors_badargs(): @@ -1275,77 +1561,50 @@ def test_neighbors_badargs(): nbrs.radius_neighbors_graph(X, mode="blah") -def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5): +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +def test_neighbors_metrics( + metric, n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5 +): # Test computing the neighbors for various metrics - # create a symmetric matrix - V = rng.rand(n_features, n_features) - VI = np.dot(V, V.T) - - metrics = [ - ("euclidean", {}), - ("manhattan", {}), - ("minkowski", dict(p=1)), - ("minkowski", dict(p=2)), - ("minkowski", dict(p=3)), - ("minkowski", dict(p=np.inf)), - ("chebyshev", {}), - ("seuclidean", dict(V=rng.rand(n_features))), - ("wminkowski", dict(p=3, w=rng.rand(n_features))), - ("mahalanobis", dict(VI=VI)), - ("haversine", {}), - ] - algorithms = ["brute", "ball_tree", "kd_tree"] - X = rng.rand(n_samples, n_features) + if metric == "wminkowski" and sp_version >= parse_version("1.8.0"): + pytest.skip("wminkowski will be removed in SciPy 1.8.0") + rng = np.random.RandomState(0) + X = rng.rand(n_samples, n_features) test = rng.rand(n_query_pts, n_features) - for metric, metric_params in metrics: - if metric == "wminkowski" and sp_version >= parse_version("1.8.0"): - # wminkowski will be removed in SciPy 1.8.0 - continue - results = {} - p = metric_params.pop("p", 2) - for algorithm in algorithms: - # KD tree doesn't support all metrics - if algorithm == "kd_tree" and metric not in neighbors.KDTree.valid_metrics: - est = neighbors.NearestNeighbors( - algorithm=algorithm, metric=metric, metric_params=metric_params - ) - with pytest.raises(ValueError): - est.fit(X) - continue - neigh = neighbors.NearestNeighbors( - n_neighbors=n_neighbors, - algorithm=algorithm, - metric=metric, - p=p, - metric_params=metric_params, - ) - - # Haversine distance only accepts 2D data - feature_sl = slice(None, 2) if metric == "haversine" else slice(None) - - neigh.fit(X[:, feature_sl]) - - # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0 - ExceptionToAssert = None - if ( - metric == "wminkowski" - and algorithm == "brute" - and sp_version >= parse_version("1.6.0") - ): - ExceptionToAssert = DeprecationWarning + algorithms = ["brute", "ball_tree", "kd_tree"] + metric_params = get_dummy_metric_kwargs(metric, n_features) + + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X_train = np.ascontiguousarray(X[:, feature_sl]) + X_test = np.ascontiguousarray(test[:, feature_sl]) + else: + X_train = X + X_test = test + + results = {} + p = metric_params.pop("p", 2) + for algorithm in algorithms: + # KD tree doesn't support all metrics + neigh = neighbors.NearestNeighbors( + n_neighbors=n_neighbors, + algorithm=algorithm, + metric=metric, + p=p, + metric_params=metric_params, + ) - with pytest.warns(ExceptionToAssert): - results[algorithm] = neigh.kneighbors( - test[:, feature_sl], return_distance=True - ) + 
neigh.fit(X_train) + results[algorithm] = neigh.kneighbors(X_test, return_distance=True) - assert_array_almost_equal(results["brute"][0], results["ball_tree"][0]) - assert_array_almost_equal(results["brute"][1], results["ball_tree"][1]) - if "kd_tree" in results: - assert_array_almost_equal(results["brute"][0], results["kd_tree"][0]) - assert_array_almost_equal(results["brute"][1], results["kd_tree"][1]) + assert_allclose(results["brute"][0], results["ball_tree"][0]) + assert_allclose(results["brute"][1], results["ball_tree"][1]) + if "kd_tree" in results: + assert_allclose(results["brute"][0], results["kd_tree"][0]) + assert_allclose(results["brute"][1], results["kd_tree"][1]) def test_callable_metric(): @@ -1369,59 +1628,44 @@ def custom_metric(x1, x2): assert_array_almost_equal(dist1, dist2) -def test_valid_brute_metric_for_auto_algorithm(): - X = rng.rand(12, 12) +@pytest.mark.parametrize("metric", neighbors.VALID_METRICS["brute"]) +def test_valid_brute_metric_for_auto_algorithm(metric, n_samples=20, n_features=12): + X = rng.rand(n_samples, n_features) Xcsr = csr_matrix(X) - # check that there is a metric that is valid for brute - # but not ball_tree (so we actually test something) - assert "cosine" in VALID_METRICS["brute"] - assert "cosine" not in VALID_METRICS["ball_tree"] + metric_params = get_dummy_metric_kwargs(metric, n_features) - # Metric which don't required any additional parameter - require_params = ["mahalanobis", "wminkowski", "seuclidean"] - for metric in VALID_METRICS["brute"]: - if metric != "precomputed" and metric not in require_params: - nn = neighbors.NearestNeighbors( - n_neighbors=3, algorithm="auto", metric=metric - ) - if metric != "haversine": - nn.fit(X) - nn.kneighbors(X) - else: - nn.fit(X[:, :2]) - nn.kneighbors(X[:, :2]) - elif metric == "precomputed": - X_precomputed = rng.random_sample((10, 4)) - Y_precomputed = rng.random_sample((3, 4)) - DXX = metrics.pairwise_distances(X_precomputed, metric="euclidean") - DYX = metrics.pairwise_distances( - Y_precomputed, X_precomputed, metric="euclidean" - ) - nb_p = neighbors.NearestNeighbors(n_neighbors=3) - nb_p.fit(DXX) - nb_p.kneighbors(DYX) + if metric == "precomputed": + X_precomputed = rng.random_sample((10, 4)) + Y_precomputed = rng.random_sample((3, 4)) + DXX = metrics.pairwise_distances(X_precomputed, metric="euclidean") + DYX = metrics.pairwise_distances( + Y_precomputed, X_precomputed, metric="euclidean" + ) + nb_p = neighbors.NearestNeighbors(n_neighbors=3, metric="precomputed") + nb_p.fit(DXX) + nb_p.kneighbors(DYX) - for metric in VALID_METRICS_SPARSE["brute"]: - if metric != "precomputed" and metric not in require_params: + else: + nn = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="auto", metric=metric, metric_params=metric_params + ) + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X = np.ascontiguousarray(X[:, feature_sl]) + else: + X = X + + nn.fit(X) + nn.kneighbors(X) + + if metric in VALID_METRICS_SPARSE["brute"]: nn = neighbors.NearestNeighbors( n_neighbors=3, algorithm="auto", metric=metric ).fit(Xcsr) nn.kneighbors(Xcsr) - # Metric with parameter - VI = np.dot(X, X.T) - list_metrics = [ - ("seuclidean", dict(V=rng.rand(12))), - ("wminkowski", dict(w=rng.rand(12))), - ("mahalanobis", dict(VI=VI)), - ] - for metric, params in list_metrics: - nn = neighbors.NearestNeighbors( - n_neighbors=3, algorithm="auto", metric=metric, metric_params=params - ).fit(X) - nn.kneighbors(X) - def test_metric_params_interface(): X = rng.rand(5, 
5) @@ -1513,82 +1757,86 @@ def test_k_and_radius_neighbors_train_is_not_query(): assert_array_equal(rng.A, [[0, 1], [1, 1]]) -def test_k_and_radius_neighbors_X_None(): +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_k_and_radius_neighbors_X_None(algorithm): # Test kneighbors et.al when query is None - for algorithm in ALGORITHMS: + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) + + X = [[0], [1]] + nn.fit(X) + + dist, ind = nn.kneighbors() + assert_array_equal(dist, [[1], [1]]) + assert_array_equal(ind, [[1], [0]]) + dist, ind = nn.radius_neighbors(None, radius=1.5) + check_object_arrays(dist, [[1], [1]]) + check_object_arrays(ind, [[1], [0]]) + + # Test the graph variants. + rng = nn.radius_neighbors_graph(None, radius=1.5) + kng = nn.kneighbors_graph(None) + for graph in [rng, kng]: + assert_array_equal(graph.A, [[0, 1], [1, 0]]) + assert_array_equal(graph.data, [1, 1]) + assert_array_equal(graph.indices, [1, 0]) + + X = [[0, 1], [0, 1], [1, 1]] + nn = neighbors.NearestNeighbors(n_neighbors=2, algorithm=algorithm) + nn.fit(X) + assert_array_equal( + nn.kneighbors_graph().A, + np.array([[0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 0]]), + ) - nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) - X = [[0], [1]] - nn.fit(X) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_k_and_radius_neighbors_duplicates(algorithm): + # Test behavior of kneighbors when duplicates are present in query + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) + duplicates = [[0], [1], [3]] - dist, ind = nn.kneighbors() - assert_array_equal(dist, [[1], [1]]) - assert_array_equal(ind, [[1], [0]]) - dist, ind = nn.radius_neighbors(None, radius=1.5) - check_object_arrays(dist, [[1], [1]]) - check_object_arrays(ind, [[1], [0]]) + nn.fit(duplicates) - # Test the graph variants. - rng = nn.radius_neighbors_graph(None, radius=1.5) - kng = nn.kneighbors_graph(None) - for graph in [rng, kng]: - assert_array_equal(graph.A, [[0, 1], [1, 0]]) - assert_array_equal(graph.data, [1, 1]) - assert_array_equal(graph.indices, [1, 0]) - - X = [[0, 1], [0, 1], [1, 1]] - nn = neighbors.NearestNeighbors(n_neighbors=2, algorithm=algorithm) - nn.fit(X) - assert_array_equal( - nn.kneighbors_graph().A, - np.array([[0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 0]]), - ) + # Do not do anything special to duplicates. + kng = nn.kneighbors_graph(duplicates, mode="distance") + assert_allclose( + kng.toarray(), np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) + ) + assert_allclose(kng.data, [0.0, 0.0, 0.0]) + assert_allclose(kng.indices, [0, 1, 2]) + dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5) + check_object_arrays(dist, [[0, 1], [1, 0]]) + check_object_arrays(ind, [[0, 1], [0, 1]]) -def test_k_and_radius_neighbors_duplicates(): - # Test behavior of kneighbors when duplicates are present in query + rng = nn.radius_neighbors_graph(duplicates, radius=1.5) + assert_allclose( + rng.toarray(), np.array([[1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + ) - for algorithm in ALGORITHMS: - nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) - nn.fit([[0], [1]]) - - # Do not do anything special to duplicates. 
- kng = nn.kneighbors_graph([[0], [1]], mode="distance") - assert_array_equal(kng.A, np.array([[0.0, 0.0], [0.0, 0.0]])) - assert_array_equal(kng.data, [0.0, 0.0]) - assert_array_equal(kng.indices, [0, 1]) - - dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5) - check_object_arrays(dist, [[0, 1], [1, 0]]) - check_object_arrays(ind, [[0, 1], [0, 1]]) - - rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5) - assert_array_equal(rng.A, np.ones((2, 2))) - - rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance") - rng.sort_indices() - assert_array_equal(rng.A, [[0, 1], [1, 0]]) - assert_array_equal(rng.indices, [0, 1, 0, 1]) - assert_array_equal(rng.data, [0, 1, 1, 0]) - - # Mask the first duplicates when n_duplicates > n_neighbors. - X = np.ones((3, 1)) - nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute") - nn.fit(X) - dist, ind = nn.kneighbors() - assert_array_equal(dist, np.zeros((3, 1))) - assert_array_equal(ind, [[1], [0], [1]]) - - # Test that zeros are explicitly marked in kneighbors_graph. - kng = nn.kneighbors_graph(mode="distance") - assert_array_equal(kng.A, np.zeros((3, 3))) - assert_array_equal(kng.data, np.zeros(3)) - assert_array_equal(kng.indices, [1.0, 0.0, 1.0]) - assert_array_equal( - nn.kneighbors_graph().A, - np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), - ) + rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance") + rng.sort_indices() + assert_allclose(rng.toarray(), [[0, 1, 0], [1, 0, 0]]) + assert_allclose(rng.indices, [0, 1, 0, 1]) + assert_allclose(rng.data, [0, 1, 1, 0]) + + # Mask the first duplicates when n_duplicates > n_neighbors. + X = np.ones((3, 1)) + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute") + nn.fit(X) + dist, ind = nn.kneighbors() + assert_allclose(dist, np.zeros((3, 1))) + assert_allclose(ind, [[1], [0], [1]]) + + # Test that zeros are explicitly marked in kneighbors_graph. 
+ kng = nn.kneighbors_graph(mode="distance") + assert_allclose(kng.toarray(), np.zeros((3, 3))) + assert_allclose(kng.data, np.zeros(3)) + assert_allclose(kng.indices, [1, 0, 1]) + assert_allclose( + nn.kneighbors_graph().toarray(), + np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), + ) def test_include_self_neighbors_graph(): diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index de34b4d230171..e043ffb730708 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from sklearn.neighbors import DistanceMetric +from sklearn.metrics import DistanceMetric from sklearn.neighbors._ball_tree import ( BallTree, kernel_norm, diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index 8290318d35deb..d87b5da52339c 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -3,6 +3,7 @@ """ import pkgutil import inspect +from distutils.version import LooseVersion from importlib import import_module from operator import itemgetter from collections.abc import Sequence @@ -19,6 +20,7 @@ import warnings import numpy as np from scipy.sparse import issparse +from threadpoolctl import threadpool_info from .murmurhash import murmurhash3_32 from .class_weight import compute_class_weight, compute_sample_weight @@ -80,6 +82,39 @@ _IS_32BIT = 8 * struct.calcsize("P") == 32 +def _in_unstable_openblas_configuration(): + """Return True if in an unstable configuration for OpenBLAS""" + + # Import libraries which might load OpenBLAS. + import numpy # noqa + import scipy # noqa + + modules_info = threadpool_info() + + open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info) + if not open_blas_used: + return False + + # OpenBLAS 0.3.16 fixed unstability for arm64, see: + # https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58 # noqa + openblas_arm64_stable_version = LooseVersion("0.3.16") + for info in modules_info: + if info["internal_api"] != "openblas": + continue + openblas_version = info.get("version") + openblas_architecture = info.get("architecture") + if openblas_version is None or openblas_architecture is None: + # Cannot be sure that OpenBLAS is good enough. Assume unstable: + return True + if ( + openblas_architecture == "neoversen1" + and openblas_version < openblas_arm64_stable_version + ): + # See discussions in https://github.com/numpy/numpy/issues/19411 + return True + return False + + class Bunch(dict): """Container object exposing keys as attributes. diff --git a/sklearn/utils/_heap.pxd b/sklearn/utils/_heap.pxd new file mode 100644 index 0000000000000..0b65a5a32e393 --- /dev/null +++ b/sklearn/utils/_heap.pxd @@ -0,0 +1,19 @@ +# Heap routines, used in various Cython implementation. 
+ +from cython cimport floating + +from ._typedefs cimport ITYPE_t + +cdef int simultaneous_sort( + floating* dist, + ITYPE_t* idx, + ITYPE_t size +) nogil + +cdef int heap_push( + floating* values, + ITYPE_t* indices, + ITYPE_t size, + floating val, + ITYPE_t val_idx, +) nogil diff --git a/sklearn/utils/_heap.pyx b/sklearn/utils/_heap.pyx new file mode 100644 index 0000000000000..d6133eab7c658 --- /dev/null +++ b/sklearn/utils/_heap.pyx @@ -0,0 +1,144 @@ +#!python +# cython: boundscheck=False +# cython: cdivision=True +# cython: initializedcheck=False +# cython: wraparound=False + + +from cython cimport floating, integral, numeric + +from ._typedefs cimport ITYPE_t + +cdef inline void dual_swap(floating* darr, ITYPE_t* iarr, + ITYPE_t i1, ITYPE_t i2) nogil: + """Swap the values at inex i1 and i2 of both darr and iarr""" + cdef floating dtmp = darr[i1] + darr[i1] = darr[i2] + darr[i2] = dtmp + + cdef ITYPE_t itmp = iarr[i1] + iarr[i1] = iarr[i2] + iarr[i2] = itmp + +cdef int simultaneous_sort( + floating* values, + ITYPE_t* indices, + ITYPE_t size +) nogil: + """ + Perform a recursive quicksort on the values array, simultaneously + performing the same swaps on the indices array. + """ + # TODO: In order to support discrete distance metrics, we need to have a + # simultaneous sort which breaks ties on indices when distances are identical. + # The best might be using a std::sort and a Comparator which might need + # AoS instead of SoA (currently used). + cdef: + ITYPE_t pivot_idx, i, store_idx + floating pivot_val + + # in the small-array case, do things efficiently + if size <= 1: + pass + elif size == 2: + if values[0] > values[1]: + dual_swap(values, indices, 0, 1) + elif size == 3: + if values[0] > values[1]: + dual_swap(values, indices, 0, 1) + if values[1] > values[2]: + dual_swap(values, indices, 1, 2) + if values[0] > values[1]: + dual_swap(values, indices, 0, 1) + else: + # Determine the pivot using the median-of-three rule. + # The smallest of the three is moved to the beginning of the array, + # the middle (the pivot value) is moved to the end, and the largest + # is moved to the pivot index. + pivot_idx = size // 2 + if values[0] > values[size - 1]: + dual_swap(values, indices, 0, size - 1) + if values[size - 1] > values[pivot_idx]: + dual_swap(values, indices, size - 1, pivot_idx) + if values[0] > values[size - 1]: + dual_swap(values, indices, 0, size - 1) + pivot_val = values[size - 1] + + # partition indices about pivot. At the end of this operation, + # pivot_idx will contain the pivot value, everything to the left + # will be smaller, and everything to the right will be larger. + store_idx = 0 + for i in range(size - 1): + if values[i] < pivot_val: + dual_swap(values, indices, i, store_idx) + store_idx += 1 + dual_swap(values, indices, store_idx, size - 1) + pivot_idx = store_idx + + # recursively sort each side of the pivot + if pivot_idx > 1: + simultaneous_sort(values, indices, pivot_idx) + if pivot_idx + 2 < size: + simultaneous_sort(values + pivot_idx + 1, + indices + pivot_idx + 1, + size - pivot_idx - 1) + return 0 + + +cdef inline int heap_push( + floating* values, + ITYPE_t* indices, + ITYPE_t size, + floating val, + ITYPE_t val_idx, +) nogil: + """Push a tuple (val, val_idx) into a fixed-size max-heap. + + The max-heap is represented as a struct of arrays where: + - values is the array containing the data to construct the heap on + - indices is the array containing the indices (meta-data) of each value. 
+ """ + cdef: + ITYPE_t current_idx, left_child_idx, right_child_idx, swap_idx + + # check if val should be in heap + if val >= values[0]: + return 0 + + # insert val at position zero + values[0] = val + indices[0] = val_idx + + # descend the heap, swapping values until the max heap criterion is met + current_idx = 0 + while True: + left_child_idx = 2 * current_idx + 1 + right_child_idx = left_child_idx + 1 + + if left_child_idx >= size: + break + elif right_child_idx >= size: + if values[left_child_idx] > val: + swap_idx = left_child_idx + else: + break + elif values[left_child_idx] >= values[right_child_idx]: + if val < values[left_child_idx]: + swap_idx = left_child_idx + else: + break + else: + if val < values[right_child_idx]: + swap_idx = right_child_idx + else: + break + + values[current_idx] = values[swap_idx] + indices[current_idx] = indices[swap_idx] + + current_idx = swap_idx + + values[current_idx] = val + indices[current_idx] = val_idx + + return 0 diff --git a/sklearn/utils/_openmp_helpers.pxd b/sklearn/utils/_openmp_helpers.pxd new file mode 100644 index 0000000000000..e57fc9bfa6bf5 --- /dev/null +++ b/sklearn/utils/_openmp_helpers.pxd @@ -0,0 +1,6 @@ +# Helpers to access OpenMP threads information +# +# Those interfaces act as indirections which allows the non-support of OpenMP +# for implementations which have been written for it. + +cdef int _openmp_thread_num() nogil diff --git a/sklearn/utils/_openmp_helpers.pyx b/sklearn/utils/_openmp_helpers.pyx index fb8920074a84e..cddd77ac42746 100644 --- a/sklearn/utils/_openmp_helpers.pyx +++ b/sklearn/utils/_openmp_helpers.pyx @@ -6,7 +6,7 @@ IF SKLEARN_OPENMP_PARALLELISM_ENABLED: def _openmp_parallelism_enabled(): """Determines whether scikit-learn has been built with OpenMP - + It allows to retrieve at runtime the information gathered at compile time. """ # SKLEARN_OPENMP_PARALLELISM_ENABLED is resolved at compile time during @@ -22,7 +22,7 @@ cpdef _openmp_effective_n_threads(n_threads=None): - if the ``OMP_NUM_THREADS`` environment variable is set, return ``openmp.omp_get_max_threads()`` - otherwise, return the minimum between ``openmp.omp_get_max_threads()`` - and the number of cpus, taking cgroups quotas into account. Cgroups + and the number of cpus, taking cgroups quotas into account. Cgroups quotas can typically be set by tools such as Docker. The result of ``omp_get_max_threads`` can be influenced by environment variable ``OMP_NUM_THREADS`` or at runtime by ``omp_set_num_threads``. @@ -59,4 +59,13 @@ cpdef _openmp_effective_n_threads(n_threads=None): # OpenMP disabled at build-time => sequential mode return 1 - + +cdef inline int _openmp_thread_num() nogil: + """Return the number of the thread calling this function. + + If scikit-learn is built without OpenMP support, always return 0. 
+ """ + IF SKLEARN_OPENMP_PARALLELISM_ENABLED: + return openmp.omp_get_thread_num() + ELSE: + return 0 diff --git a/sklearn/utils/_testing.py b/sklearn/utils/_testing.py index 1e4ecdd53e136..18f45d2680b13 100644 --- a/sklearn/utils/_testing.py +++ b/sklearn/utils/_testing.py @@ -48,7 +48,12 @@ import joblib import sklearn -from sklearn.utils import IS_PYPY, _IS_32BIT, deprecated +from sklearn.utils import ( + IS_PYPY, + _IS_32BIT, + deprecated, + _in_unstable_openblas_configuration, +) from sklearn.utils.multiclass import check_classification_targets from sklearn.utils.validation import ( check_array, @@ -448,6 +453,10 @@ def set_random_state(estimator, random_state=0): os.environ.get("TRAVIS") == "true", reason="skip on travis" ) fails_if_pypy = pytest.mark.xfail(IS_PYPY, reason="not compatible with PyPy") + fails_if_unstable_openblas = pytest.mark.xfail( + _in_unstable_openblas_configuration(), + reason="OpenBLAS is unstable for this configuration", + ) skip_if_no_parallel = pytest.mark.skipif( not joblib.parallel.mp, reason="joblib is in serial mode" ) @@ -1041,3 +1050,24 @@ def transform(self, X, y=None): def fit_transform(self, X, y=None): return self.fit(X, y).transform(X, y) + + +def get_dummy_metric_kwargs(metric: str, n_features: int): + """Return dummy DistanceMetric kwargs for tests.""" + rng = np.random.RandomState(1) + weights = rng.random_sample(n_features) + weights /= weights.sum() + + V = rng.random_sample((n_features, n_features)) + + # VI is positive-semidefinite, preferred for precision matrix + VI = np.dot(V, V.T) + 3 * np.eye(n_features) + + kwargs = { + "minkowski": dict(p=1.5), + "seuclidean": dict(V=weights), + "wminkowski": dict(p=1.5, w=weights), + "mahalanobis": dict(VI=VI), + } + + return kwargs.get(metric, {}) diff --git a/sklearn/neighbors/_typedefs.pxd b/sklearn/utils/_typedefs.pxd similarity index 100% rename from sklearn/neighbors/_typedefs.pxd rename to sklearn/utils/_typedefs.pxd diff --git a/sklearn/neighbors/_typedefs.pyx b/sklearn/utils/_typedefs.pyx similarity index 100% rename from sklearn/neighbors/_typedefs.pyx rename to sklearn/utils/_typedefs.pyx diff --git a/sklearn/utils/setup.py b/sklearn/utils/setup.py index c75cbe2d86495..6f65a7224d38b 100644 --- a/sklearn/utils/setup.py +++ b/sklearn/utils/setup.py @@ -85,6 +85,21 @@ def configuration(parent_package="", top_path=None): config.add_extension( "_readonly_array_wrapper", sources=["_readonly_array_wrapper.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + + config.add_extension( + "_typedefs", + sources=["_typedefs.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + ) + + config.add_extension( + "_heap", + sources=["_heap.pyx"], + include_dirs=[numpy.get_include()], libraries=libraries, )