From e93ea0f3207a6d84ddce636cdcab4c0e39d3d14b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 14 Jan 2022 18:14:30 +0100 Subject: [PATCH 01/34] Forward pairwise_dist_chunk_size in the configuration --- sklearn/_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/_config.py b/sklearn/_config.py index d6a02737f640d..fd0b30da7a82c 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -197,6 +197,7 @@ def config_context( working_memory=working_memory, print_changed_only=print_changed_only, display=display, + pairwise_dist_chunk_size=pairwise_dist_chunk_size, ) try: From e39bf7dd47c3040141f5482818a86eb63015830d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 14 Jan 2022 18:35:42 +0100 Subject: [PATCH 02/34] Flip finalized results for PairwiseDistancesArgKmin The previous would have made the code more complex by introducing some boilerplate for the interface plugs. Having it this way actually simplifies the code. This also removes the haversine branch for test_pairwise_distances_argkmin Co-authored-by: Olivier Grisel --- .../metrics/_pairwise_distances_reduction.pyx | 2 +- .../test_pairwise_distances_reduction.py | 19 +++++++------------ 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 76420b50a1b5e..329cf90158105 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -883,7 +883,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # We need to recompute distances because we relied on # surrogate distances for the reduction. 
self.compute_exact_distances() - return np.asarray(self.argkmin_indices), np.asarray(self.argkmin_distances) + return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) return np.asarray(self.argkmin_indices) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index e975aad55bf9c..6546022f88351 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -176,14 +176,14 @@ def test_chunk_size_agnosticism( else 10 ** np.log(n_features) ) - ref_indices, ref_dist = PairwiseDistancesReduction.compute( + ref_dist, ref_indices = PairwiseDistancesReduction.compute( X, Y, parameter, return_distance=True, ) - indices, dist = PairwiseDistancesReduction.compute( + dist, indices = PairwiseDistancesReduction.compute( X, Y, parameter, @@ -222,14 +222,14 @@ def test_n_threads_agnosticism( else 10 ** np.log(n_features) ) - ref_indices, ref_dist = PairwiseDistancesReduction.compute( + ref_dist, ref_indices = PairwiseDistancesReduction.compute( X, Y, parameter, return_distance=True, ) - indices, dist = PairwiseDistancesReduction.compute( + dist, indices = PairwiseDistancesReduction.compute( X, Y, parameter, n_threads=1, return_distance=True ) @@ -269,7 +269,7 @@ def test_strategies_consistency( else 10 ** np.log(n_features) ) - indices_par_X, dist_par_X = PairwiseDistancesReduction.compute( + dist_par_X, indices_par_X = PairwiseDistancesReduction.compute( X, Y, parameter, @@ -282,7 +282,7 @@ def test_strategies_consistency( return_distance=True, ) - indices_par_Y, dist_par_Y = PairwiseDistancesReduction.compute( + dist_par_Y, indices_par_Y = PairwiseDistancesReduction.compute( X, Y, parameter, @@ -324,11 +324,6 @@ def test_pairwise_distances_argkmin( X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread - # Haversine 
distance only accepts 2D data - if metric == "haversine": - X = np.ascontiguousarray(X[:, :2]) - Y = np.ascontiguousarray(Y[:, :2]) - metric_kwargs = _get_dummy_metric_params_list(metric, n_features)[0] # Reference for argkmin results @@ -346,7 +341,7 @@ def test_pairwise_distances_argkmin( row_idx, argkmin_indices_ref[row_idx] ] - argkmin_indices, argkmin_distances = PairwiseDistancesArgKmin.compute( + argkmin_distances, argkmin_indices = PairwiseDistancesArgKmin.compute( X, Y, k, From c762c407873b8d6417b1c2ff78d19d82550e48d3 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 14 Jan 2022 18:18:00 +0100 Subject: [PATCH 03/34] Plug PairwiseDistancesArgKmin as a back-end --- sklearn/metrics/pairwise.py | 83 ++++++++++++++++++---- sklearn/multioutput.py | 2 +- sklearn/neighbors/_base.py | 100 ++++++++++++++++++--------- sklearn/neighbors/_classification.py | 15 +++- sklearn/neighbors/_regression.py | 7 +- 5 files changed, 159 insertions(+), 48 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index dfa7e518b0858..f7e53c316d9e0 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -19,6 +19,7 @@ from scipy.sparse import issparse from joblib import Parallel, effective_n_jobs +from .. 
import config_context from ..utils.validation import _num_samples from ..utils.validation import check_non_negative from ..utils import check_array @@ -31,6 +32,7 @@ from ..utils.fixes import delayed from ..utils.fixes import sp_version, parse_version +from ._pairwise_distances_reduction import PairwiseDistancesArgKmin from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan from ..exceptions import DataConversionWarning @@ -582,6 +584,10 @@ def _argmin_min_reduce(dist, start): return indices, values +def _argmin_reduce(dist, start): + return dist.argmin(axis=1) + + def pairwise_distances_argmin_min( X, Y, *, axis=1, metric="euclidean", metric_kwargs=None ): @@ -654,19 +660,38 @@ def pairwise_distances_argmin_min( """ X, Y = check_pairwise_arrays(X, Y) - if metric_kwargs is None: - metric_kwargs = {} - if axis == 0: X, Y = Y, X - indices, values = zip( - *pairwise_distances_chunked( - X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs + if metric_kwargs is None: + metric_kwargs = {} + + if PairwiseDistancesArgKmin.is_usable_for(X, Y, metric): + values, indices = PairwiseDistancesArgKmin.compute( + X=X, + Y=Y, + k=1, + metric=metric, + metric_kwargs=metric_kwargs, + strategy="auto", + return_distance=True, ) - ) - indices = np.concatenate(indices) - values = np.concatenate(values) + values = values.flatten() + indices = indices.flatten() + else: + # TODO: once PairwiseDistancesArgKmin supports sparse input matrices and 32 bit, + # we won't need to fallback to pairwise_distances_chunked anymore. + + # Turn off check for finiteness because this is costly and because arrays + # have already been validated. 
+ with config_context(assume_finite=True): + indices, values = zip( + *pairwise_distances_chunked( + X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs + ) + ) + indices = np.concatenate(indices) + values = np.concatenate(values) return indices, values @@ -738,9 +763,43 @@ def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs if metric_kwargs is None: metric_kwargs = {} - return pairwise_distances_argmin_min( - X, Y, axis=axis, metric=metric, metric_kwargs=metric_kwargs - )[0] + X, Y = check_pairwise_arrays(X, Y) + + if axis == 0: + X, Y = Y, X + + if metric_kwargs is None: + metric_kwargs = {} + + if PairwiseDistancesArgKmin.is_usable_for(X, Y, metric): + indices = PairwiseDistancesArgKmin.compute( + X=X, + Y=Y, + k=1, + metric=metric, + metric_kwargs=metric_kwargs, + strategy="auto", + return_distance=False, + ) + indices = indices.flatten() + else: + # TODO: once PairwiseDistancesArgKmin supports sparse input matrices and 32 bit, + # we won't need to fallback to pairwise_distances_chunked anymore. + + # Turn off check for finiteness because this is costly and because arrays + # have already been validated. + with config_context(assume_finite=True): + indices = np.concatenate( + list( + # This returns a np.ndarray generator whose arrays we need + # to flatten into one. 
+ pairwise_distances_chunked( + X, Y, reduce_func=_argmin_reduce, metric=metric, **metric_kwargs + ) + ) + ) + + return indices def haversine_distances(X, Y=None): diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index a604a6c801943..f71f47bbd61dc 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -399,7 +399,7 @@ class MultiOutputClassifier(ClassifierMixin, _MultiOutputEstimator): >>> X, y = make_multilabel_classification(n_classes=3, random_state=0) >>> clf = MultiOutputClassifier(KNeighborsClassifier()).fit(X, y) >>> clf.predict(X[-2:]) - array([[1, 1, 0], [1, 1, 1]]) + array([[1, 1, 1], [1, 1, 1]]) """ def __init__(self, estimator, *, n_jobs=None): diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index c453ca84a4784..802b4ec639112 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -23,6 +23,9 @@ from ..base import is_classifier from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS +from ..metrics._pairwise_distances_reduction import ( + PairwiseDistancesArgKmin, +) from ..utils import ( check_array, gen_even_slices, @@ -351,31 +354,33 @@ def _check_algorithm_metric(self): if self.algorithm not in ["auto", "brute", "kd_tree", "ball_tree"]: raise ValueError("unrecognized algorithm: '%s'" % self.algorithm) + self._metric = self.metric + if self.algorithm == "auto": - if self.metric == "precomputed": + if self._metric == "precomputed": alg_check = "brute" - elif callable(self.metric) or self.metric in VALID_METRICS["ball_tree"]: + elif callable(self._metric) or self._metric in VALID_METRICS["ball_tree"]: alg_check = "ball_tree" else: alg_check = "brute" else: alg_check = self.algorithm - if callable(self.metric): + if callable(self._metric): if self.algorithm == "kd_tree": # callable metric is only valid for brute force and ball_tree raise ValueError( "kd_tree does not support callable metric '%s'" "Function call overhead will result" "in 
very poor performance." - % self.metric + % self._metric ) - elif self.metric not in VALID_METRICS[alg_check]: + elif self._metric not in VALID_METRICS[alg_check]: raise ValueError( "Metric '%s' not valid. Use " "sorted(sklearn.neighbors.VALID_METRICS['%s']) " "to get valid options. " - "Metric can also be a callable function." % (self.metric, alg_check) + "Metric can also be a callable function." % (self._metric, alg_check) ) if self.metric_params is not None and "p" in self.metric_params: @@ -391,7 +396,7 @@ def _check_algorithm_metric(self): else: effective_p = self.p - if self.metric in ["wminkowski", "minkowski"] and effective_p < 1: + if self._metric in ["wminkowski", "minkowski"] and effective_p < 1: raise ValueError("p must be greater or equal to one for minkowski metric") def _fit(self, X, y=None): @@ -441,12 +446,12 @@ def _fit(self, X, y=None): self.effective_metric_params_ = self.metric_params.copy() effective_p = self.effective_metric_params_.get("p", self.p) - if self.metric in ["wminkowski", "minkowski"]: + if self._metric in ["wminkowski", "minkowski"]: self.effective_metric_params_["p"] = effective_p - self.effective_metric_ = self.metric + self.effective_metric_ = self._metric # For minkowski distance, use more efficient methods where available - if self.metric == "minkowski": + if self._metric == "minkowski": p = self.effective_metric_params_.pop("p", 2) w = self.effective_metric_params_.pop("w", None) if p < 1: @@ -485,7 +490,7 @@ def _fit(self, X, y=None): self.n_samples_fit_ = X.data.shape[0] return self - if self.metric == "precomputed": + if self._metric == "precomputed": X = _check_precomputed(X) # Precomputed matrix X must be squared if X.shape[0] != X.shape[1]: @@ -502,6 +507,7 @@ def _fit(self, X, y=None): if issparse(X): if self.algorithm not in ("auto", "brute"): warnings.warn("cannot use tree with sparse input: using brute force") + if self.effective_metric_ not in VALID_METRICS_SPARSE[ "brute" ] and not 
callable(self.effective_metric_): @@ -526,7 +532,7 @@ def _fit(self, X, y=None): # A tree approach is better for small number of neighbors or small # number of features, with KDTree generally faster when available if ( - self.metric == "precomputed" + self._metric == "precomputed" or self._fit_X.shape[1] > 15 or ( self.n_neighbors is not None @@ -636,10 +642,7 @@ def _kneighbors_reduce_func(self, dist, start, n_neighbors, return_distance): # argpartition doesn't guarantee sorted order, so we sort again neigh_ind = neigh_ind[sample_range, np.argsort(dist[sample_range, neigh_ind])] if return_distance: - if self.effective_metric_ == "euclidean": - result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind - else: - result = dist[sample_range, neigh_ind], neigh_ind + result = dist[sample_range, neigh_ind], neigh_ind else: result = neigh_ind return result @@ -709,18 +712,37 @@ class from an array representing our data set and ask who's % type(n_neighbors) ) - if X is not None: - query_is_train = False - if self.metric == "precomputed": - X = _check_precomputed(X) - else: - X = self._validate_data(X, accept_sparse="csr", reset=False) - else: - query_is_train = True + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and PairwiseDistancesArgKmin.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) + + query_is_train = X is None + if query_is_train: + if use_pairwise_distances_reductions: + # We force the C-contiguity even if it creates a copy for F-ordered + # arrays because PairwiseDistancesArgKmin is more efficient. 
+ self._fit_X = self._validate_data( + self._fit_X, accept_sparse="csr", reset=False, order="C" + ) X = self._fit_X # Include an extra neighbor to account for the sample itself being # returned, which is removed later n_neighbors += 1 + else: + if use_pairwise_distances_reductions: + # We force the C-contiguity even if it creates a copy for F-ordered + # arrays because PairwiseDistancesArgKmin is more efficient. + X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") + self._fit_X = self._validate_data( + self._fit_X, accept_sparse="csr", reset=False, order="C" + ) + elif self._metric == "precomputed": + X = _check_precomputed(X) + else: + X = self._validate_data(X, accept_sparse="csr", reset=False) n_samples_fit = self.n_samples_fit_ if n_neighbors > n_samples_fit: @@ -731,24 +753,36 @@ class from an array representing our data set and ask who's n_jobs = effective_n_jobs(self.n_jobs) chunked_results = None - if self._fit_method == "brute" and self.metric == "precomputed" and issparse(X): + if use_pairwise_distances_reductions: + results = PairwiseDistancesArgKmin.compute( + X=X, + Y=self._fit_X, + k=n_neighbors, + metric=self.effective_metric_, + metric_kwargs=self.effective_metric_params_, + n_threads=self.n_jobs, + strategy="auto", + return_distance=return_distance, + ) + + elif ( + self._fit_method == "brute" + and self._metric == "precomputed" + and issparse(X) + ): results = _kneighbors_from_graph( X, n_neighbors=n_neighbors, return_distance=return_distance ) elif self._fit_method == "brute": + # TODO: support sparse matrices + reduce_func = partial( self._kneighbors_reduce_func, n_neighbors=n_neighbors, return_distance=return_distance, ) - # for efficiency, use squared euclidean distances - if self.effective_metric_ == "euclidean": - kwds = {"squared": True} - else: - kwds = self.effective_metric_params_ - chunked_results = list( pairwise_distances_chunked( X, @@ -756,7 +790,7 @@ class from an array representing our data set and ask who's 
reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=n_jobs, - **kwds, + **self.effective_metric_params_, ) ) @@ -1052,6 +1086,8 @@ class from an array representing our data set and ask who's ) elif self._fit_method == "brute": + # TODO: support sparse matrices + # for efficiency, use squared euclidean distances if self.effective_metric_ == "euclidean": radius *= radius diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index 4f84a16211dbd..25cf84c6f63b0 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -213,7 +213,13 @@ def predict(self, X): y : ndarray of shape (n_queries,) or (n_queries, n_outputs) Class labels for each data sample. """ - neigh_dist, neigh_ind = self.kneighbors(X) + if self.weights == "uniform": + # In that case, we do not need the distance so we do not compute them. + neigh_ind = self.kneighbors(X, return_distance=False) + neigh_dist = None + else: + neigh_dist, neigh_ind = self.kneighbors(X) + classes_ = self.classes_ _y = self._y if not self.outputs_2d_: @@ -255,7 +261,12 @@ def predict_proba(self, X): The class probabilities of the input samples. Classes are ordered by lexicographic order. """ - neigh_dist, neigh_ind = self.kneighbors(X) + if self.weights == "uniform": + # In that case, we do not need the distance so we do not compute them. + neigh_ind = self.kneighbors(X, return_distance=False) + neigh_dist = None + else: + neigh_dist, neigh_ind = self.kneighbors(X) classes_ = self.classes_ _y = self._y diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 74cecede2efa4..cdadce4bbe543 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -228,7 +228,12 @@ def predict(self, X): y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=int Target values. 
""" - neigh_dist, neigh_ind = self.kneighbors(X) + if self.weights == "uniform": + # In that case, we do not need the distance so we do not compute them. + neigh_ind = self.kneighbors(X, return_distance=False) + neigh_dist = None + else: + neigh_dist, neigh_ind = self.kneighbors(X) weights = _get_weights(neigh_dist, self.weights) From 64cde3b4914faa8612fc51cc4dd5899e1171294e Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 14 Jan 2022 18:18:49 +0100 Subject: [PATCH 04/34] Adapt test accordingly --- sklearn/metrics/tests/test_dist_metrics.py | 22 +- sklearn/metrics/tests/test_pairwise.py | 8 +- sklearn/neighbors/tests/test_neighbors.py | 495 ++++++++++++--------- 3 files changed, 301 insertions(+), 224 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 1de7471c01812..809081c28236b 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -9,6 +9,7 @@ from scipy.spatial.distance import cdist from sklearn.metrics import DistanceMetric +from sklearn.metrics._dist_metrics import BOOL_METRICS from sklearn.utils import check_random_state from sklearn.utils._testing import create_memmap_backed_data from sklearn.utils.fixes import sp_version, parse_version @@ -37,16 +38,6 @@ def dist_func(x1, x2, p): V = rng.random_sample((d, d)) VI = np.dot(V, V.T) -BOOL_METRICS = [ - "matching", - "jaccard", - "dice", - "kulsinski", - "rogerstanimoto", - "russellrao", - "sokalmichener", - "sokalsneath", -] METRICS_DEFAULT_PARAMS = [ ("euclidean", {}), @@ -73,6 +64,17 @@ def dist_func(x1, x2, p): ) +# TODO: remove this test in 1.3 +def test_neighbors_distance_metric_deprecation(): + from sklearn.neighbors import DistanceMetric as DeprecatedDistanceMetric + + with pytest.warns( + FutureWarning, match="sklearn.neighbors.DistanceMetric has been moved" + ): + DeprecatedDistanceMetric.get_metric("euclidean") + + +@pytest.mark.parametrize("metric", 
METRICS_DEFAULT_PARAMS) def check_cdist(metric, kwargs, X1, X2): if metric == "wminkowski": # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0 diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index b7e90e63f2af1..8efddeb9f7d8c 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -457,11 +457,13 @@ def test_pairwise_distances_argmin_min(): assert type(valssp) == np.ndarray # euclidean metric squared - idx, vals = pairwise_distances_argmin_min( - X, Y, metric="euclidean", metric_kwargs={"squared": True} - ) + # Squared Euclidean metric + idx, vals = pairwise_distances_argmin_min(X, Y, metric="sqeuclidean") + idx2 = pairwise_distances_argmin(X, Y, metric="sqeuclidean") + assert_array_almost_equal(idx, expected_idx) assert_array_almost_equal(vals, expected_vals_sq) + assert_array_almost_equal(idx2, expected_idx) # Non-euclidean scikit-learn metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan") diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 2a4d500610051..e811fbee9b302 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -20,13 +20,25 @@ from sklearn.exceptions import EfficiencyWarning from sklearn.exceptions import NotFittedError from sklearn.metrics.pairwise import pairwise_distances +from sklearn.metrics.tests.test_pairwise_distances_reduction import ( + _get_dummy_metric_params_list, +) from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split -from sklearn.neighbors import VALID_METRICS_SPARSE, VALID_METRICS -from sklearn.neighbors._base import _is_sorted_by_data, _check_precomputed +from sklearn.neighbors import ( + VALID_METRICS_SPARSE, +) +from sklearn.neighbors._base import ( + _is_sorted_by_data, + _check_precomputed, + KNeighborsMixin, +) from sklearn.pipeline import make_pipeline 
-from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_array_equal +from sklearn.utils._testing import ( + assert_allclose, + assert_array_almost_equal, + assert_array_equal, +) from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import check_random_state from sklearn.utils.fixes import sp_version, parse_version @@ -50,6 +62,9 @@ SPARSE_OR_DENSE = SPARSE_TYPES + (np.asarray,) ALGORITHMS = ("ball_tree", "brute", "kd_tree", "auto") +COMMON_VALID_METRICS = sorted( + set.intersection(*map(set, neighbors.VALID_METRICS.values())) +) P = (1, 2, 3, 4, np.inf) JOBLIB_BACKENDS = list(joblib.parallel.BACKENDS.keys()) @@ -70,42 +85,144 @@ def _weight_func(dist): return retval ** 2 +@pytest.mark.parametrize("n_samples", [100, 1000]) +@pytest.mark.parametrize("n_features", [5, 100]) +@pytest.mark.parametrize("n_query_pts", [10, 100]) +@pytest.mark.parametrize("n_neighbors", [1, 10, 100]) +@pytest.mark.parametrize("query_is_train", [False, True]) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) def test_unsupervised_kneighbors( - n_samples=20, n_features=5, n_query_pts=2, n_neighbors=5 + n_samples, + n_features, + n_query_pts, + n_neighbors, + query_is_train, + metric, ): - # Test unsupervised neighbors methods - X = rng.rand(n_samples, n_features) + # The different algorithms must return identical results + # on their common metrics, with and without returning + # distances - test = rng.rand(n_query_pts, n_features) + # Redefining the rng locally to use the same generated X + local_rng = np.random.RandomState(0) + X = local_rng.rand(n_samples, n_features) - for p in P: - results_nodist = [] - results = [] + query = X if query_is_train else local_rng.rand(n_query_pts, n_features) - for algorithm in ALGORITHMS: - neigh = neighbors.NearestNeighbors( - n_neighbors=n_neighbors, algorithm=algorithm, p=p - ) - neigh.fit(X) + results_nodist = [] + results = [] + + for algorithm in ALGORITHMS: + neigh 
= neighbors.NearestNeighbors( + n_neighbors=n_neighbors, algorithm=algorithm, metric=metric + ) + neigh.fit(X) - results_nodist.append(neigh.kneighbors(test, return_distance=False)) - results.append(neigh.kneighbors(test, return_distance=True)) + results_nodist.append(neigh.kneighbors(query, return_distance=False)) + results.append(neigh.kneighbors(query, return_distance=True)) - for i in range(len(results) - 1): - assert_array_almost_equal(results_nodist[i], results[i][1]) - assert_array_almost_equal(results[i][0], results[i + 1][0]) - assert_array_almost_equal(results[i][1], results[i + 1][1]) + for i in range(len(results) - 1): + algorithm = ALGORITHMS[i] + next_algorithm = ALGORITHMS[i + 1] + + indices_no_dist = results_nodist[i] + distances, next_distances = results[i][0], results[i + 1][0] + indices, next_indices = results[i][1], results[i + 1][1] + assert_array_equal( + indices_no_dist, + indices, + err_msg=( + f"The '{algorithm}' algorithm returns different" + "indices depending on 'return_distances'." + ), + ) + assert_array_equal( + indices, + next_indices, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different indices." + ), + ) + assert_allclose( + distances, + next_distances, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different distances." 
+ ), + atol=1e-6, + ) +@pytest.mark.parametrize("n_samples", [100, 1000]) +@pytest.mark.parametrize("n_features", [5, 100]) +@pytest.mark.parametrize("n_query_pts", [10, 100]) +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +@pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)]) @pytest.mark.parametrize( - "NearestNeighbors", + "NeighborsMixinSubclass", + [ + neighbors.KNeighborsClassifier, + neighbors.KNeighborsRegressor, + ], +) +def test_neigh_predictions_algorithm_agnosticity( + n_samples, + n_features, + n_query_pts, + metric, + n_neighbors, + radius, + NeighborsMixinSubclass, +): + # The different algorithms must return identical predictions results + # on their common metrics. + + # Redefining the rng locally to use the same generated X + local_rng = np.random.RandomState(0) + X = local_rng.rand(n_samples, n_features) + y = local_rng.randint(3, size=n_samples) + + query = local_rng.rand(n_query_pts, n_features) + + predict_results = [] + + parameter = ( + n_neighbors if issubclass(NeighborsMixinSubclass, KNeighborsMixin) else radius + ) + + for algorithm in ALGORITHMS: + neigh = NeighborsMixinSubclass(parameter, algorithm=algorithm, metric=metric) + neigh.fit(X, y) + + predict_results.append(neigh.predict(query)) + + for i in range(len(predict_results) - 1): + algorithm = ALGORITHMS[i] + next_algorithm = ALGORITHMS[i + 1] + + predictions, next_predictions = predict_results[i], predict_results[i + 1] + + assert_allclose( + predictions, + next_predictions, + err_msg=( + f"The '{algorithm}' and '{next_algorithm}' " + "algorithms return different predictions." 
+ ), + ) + + +@pytest.mark.parametrize( + "KNeighborsMixinSubclass", [ neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, neighbors.NearestNeighbors, ], ) -def test_unsupervised_inputs(NearestNeighbors): +def test_unsupervised_inputs(KNeighborsMixinSubclass): # Test unsupervised inputs for neighbors estimators X = rng.random_sample((10, 3)) @@ -115,7 +232,7 @@ def test_unsupervised_inputs(NearestNeighbors): dist1, ind1 = nbrs_fid.kneighbors(X) - nbrs = NearestNeighbors(n_neighbors=1) + nbrs = KNeighborsMixinSubclass(n_neighbors=1) for data in (nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X)): nbrs.fit(data, y) @@ -216,8 +333,6 @@ def make_train_test(X_train, X_test): estimators = [ neighbors.KNeighborsClassifier, neighbors.KNeighborsRegressor, - neighbors.RadiusNeighborsClassifier, - neighbors.RadiusNeighborsRegressor, ] check_precomputed(make_train_test, estimators) @@ -1168,19 +1283,19 @@ def test_kneighbors_graph(): assert_array_almost_equal(A.toarray(), [[1, 1, 1], [1, 1, 1], [1, 1, 1]]) -def test_kneighbors_graph_sparse(seed=36): +@pytest.mark.parametrize("n_neighbors", [1, 2, 3]) +@pytest.mark.parametrize("mode", ["connectivity", "distance"]) +def test_kneighbors_graph_sparse(n_neighbors, mode, seed=36): # Test kneighbors_graph to build the k-Nearest Neighbor graph # for sparse input. 
rng = np.random.RandomState(seed) X = rng.randn(10, 10) Xcsr = csr_matrix(X) - for n_neighbors in [1, 2, 3]: - for mode in ["connectivity", "distance"]: - assert_array_almost_equal( - neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(), - neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), - ) + assert_array_almost_equal( + neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(), + neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), + ) def test_radius_neighbors_graph(): @@ -1196,21 +1311,19 @@ def test_radius_neighbors_graph(): ) -def test_radius_neighbors_graph_sparse(seed=36): +@pytest.mark.parametrize("n_neighbors", [1, 2, 3]) +@pytest.mark.parametrize("mode", ["connectivity", "distance"]) +def test_radius_neighbors_graph_sparse(n_neighbors, mode, seed=36): # Test radius_neighbors_graph to build the Nearest Neighbor graph # for sparse input. rng = np.random.RandomState(seed) X = rng.randn(10, 10) Xcsr = csr_matrix(X) - for n_neighbors in [1, 2, 3]: - for mode in ["connectivity", "distance"]: - assert_array_almost_equal( - neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(), - neighbors.radius_neighbors_graph( - Xcsr, n_neighbors, mode=mode - ).toarray(), - ) + assert_array_almost_equal( + neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(), + neighbors.radius_neighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(), + ) def test_neighbors_badargs(): @@ -1275,57 +1388,22 @@ def test_neighbors_badargs(): nbrs.radius_neighbors_graph(X, mode="blah") -def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5): +@pytest.mark.parametrize("metric", COMMON_VALID_METRICS) +def test_neighbors_metrics( + metric, n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5 +): # Test computing the neighbors for various metrics # create a symmetric matrix - V = rng.rand(n_features, n_features) - VI = np.dot(V, V.T) - - metrics = [ - ("euclidean", {}), - ("manhattan", 
{}), - ("minkowski", dict(p=1)), - ("minkowski", dict(p=2)), - ("minkowski", dict(p=3)), - ("minkowski", dict(p=np.inf)), - ("chebyshev", {}), - ("seuclidean", dict(V=rng.rand(n_features))), - ("mahalanobis", dict(VI=VI)), - ("haversine", {}), - ] - if sp_version < parse_version("1.8.0.dev0"): - # TODO: remove once we no longer support scipy < 1.8.0. - # wminkowski was removed in scipy 1.8.0 but should work for previous - # versions. - metrics.append( - ("wminkowski", dict(p=3, w=rng.rand(n_features))), - ) - else: - # Recent scipy versions accept weights in the Minkowski metric directly: - metrics.append( - ("minkowski", dict(p=3, w=rng.rand(n_features))), - ) - algorithms = ["brute", "ball_tree", "kd_tree"] - X = rng.rand(n_samples, n_features) + X_train = rng.rand(n_samples, n_features) + X_test = rng.rand(n_query_pts, n_features) - test = rng.rand(n_query_pts, n_features) + metric_params_list = _get_dummy_metric_params_list(metric, n_features) - for metric, metric_params in metrics: + for metric_params in metric_params_list: results = {} p = metric_params.pop("p", 2) - w = metric_params.get("w", None) for algorithm in algorithms: - # KD tree doesn't support all metrics - if algorithm == "kd_tree" and ( - metric not in neighbors.KDTree.valid_metrics or w is not None - ): - est = neighbors.NearestNeighbors( - algorithm=algorithm, metric=metric, metric_params=metric_params - ) - with pytest.raises(ValueError): - est.fit(X) - continue neigh = neighbors.NearestNeighbors( n_neighbors=n_neighbors, algorithm=algorithm, @@ -1334,10 +1412,7 @@ def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbor metric_params=metric_params, ) - # Haversine distance only accepts 2D data - feature_sl = slice(None, 2) if metric == "haversine" else slice(None) - - neigh.fit(X[:, feature_sl]) + neigh.fit(X_train) # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0 ExceptionToAssert = None @@ -1349,15 +1424,20 @@ def test_neighbors_metrics(n_samples=20, 
n_features=3, n_query_pts=2, n_neighbor ExceptionToAssert = DeprecationWarning with pytest.warns(ExceptionToAssert): - results[algorithm] = neigh.kneighbors( - test[:, feature_sl], return_distance=True - ) + results[algorithm] = neigh.kneighbors(X_test, return_distance=True) + + brute_dst, brute_idx = results["brute"] + kd_tree_dst, kd_tree_idx = results["kd_tree"] + ball_tree_dst, ball_tree_idx = results["ball_tree"] + + assert_allclose(brute_dst, ball_tree_dst) + assert_array_equal(brute_idx, ball_tree_idx) + + assert_allclose(brute_dst, kd_tree_dst) + assert_array_equal(brute_idx, kd_tree_idx) - assert_array_almost_equal(results["brute"][0], results["ball_tree"][0]) - assert_array_almost_equal(results["brute"][1], results["ball_tree"][1]) - if "kd_tree" in results: - assert_array_almost_equal(results["brute"][0], results["kd_tree"][0]) - assert_array_almost_equal(results["brute"][1], results["kd_tree"][1]) + assert_allclose(ball_tree_dst, kd_tree_dst) + assert_array_equal(ball_tree_idx, kd_tree_idx) def test_callable_metric(): @@ -1381,58 +1461,47 @@ def custom_metric(x1, x2): assert_array_almost_equal(dist1, dist2) -def test_valid_brute_metric_for_auto_algorithm(): - X = rng.rand(12, 12) +@pytest.mark.parametrize("metric", neighbors.VALID_METRICS["brute"]) +def test_valid_brute_metric_for_auto_algorithm(metric, n_samples=20, n_features=12): + X = rng.rand(n_samples, n_features) Xcsr = csr_matrix(X) - # check that there is a metric that is valid for brute - # but not ball_tree (so we actually test something) - assert "cosine" in VALID_METRICS["brute"] - assert "cosine" not in VALID_METRICS["ball_tree"] + metric_params_list = _get_dummy_metric_params_list(metric, n_features) + + if metric == "precomputed": + X_precomputed = rng.random_sample((10, 4)) + Y_precomputed = rng.random_sample((3, 4)) + DXX = metrics.pairwise_distances(X_precomputed, metric="euclidean") + DYX = metrics.pairwise_distances( + Y_precomputed, X_precomputed, metric="euclidean" + ) + nb_p = 
neighbors.NearestNeighbors(n_neighbors=3, metric="precomputed") + nb_p.fit(DXX) + nb_p.kneighbors(DYX) - # Metric which don't required any additional parameter - require_params = ["mahalanobis", "wminkowski", "seuclidean"] - for metric in VALID_METRICS["brute"]: - if metric != "precomputed" and metric not in require_params: + else: + for metric_params in metric_params_list: nn = neighbors.NearestNeighbors( - n_neighbors=3, algorithm="auto", metric=metric + n_neighbors=3, + algorithm="auto", + metric=metric, + metric_params=metric_params, ) - if metric != "haversine": - nn.fit(X) - nn.kneighbors(X) + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X = np.ascontiguousarray(X[:, feature_sl]) else: - nn.fit(X[:, :2]) - nn.kneighbors(X[:, :2]) - elif metric == "precomputed": - X_precomputed = rng.random_sample((10, 4)) - Y_precomputed = rng.random_sample((3, 4)) - DXX = metrics.pairwise_distances(X_precomputed, metric="euclidean") - DYX = metrics.pairwise_distances( - Y_precomputed, X_precomputed, metric="euclidean" - ) - nb_p = neighbors.NearestNeighbors(n_neighbors=3) - nb_p.fit(DXX) - nb_p.kneighbors(DYX) + X = X - for metric in VALID_METRICS_SPARSE["brute"]: - if metric != "precomputed" and metric not in require_params: - nn = neighbors.NearestNeighbors( - n_neighbors=3, algorithm="auto", metric=metric - ).fit(Xcsr) - nn.kneighbors(Xcsr) - - # Metric with parameter - VI = np.dot(X, X.T) - list_metrics = [ - ("seuclidean", dict(V=rng.rand(12))), - ("wminkowski", dict(w=rng.rand(12))), - ("mahalanobis", dict(VI=VI)), - ] - for metric, params in list_metrics: - nn = neighbors.NearestNeighbors( - n_neighbors=3, algorithm="auto", metric=metric, metric_params=params - ).fit(X) - nn.kneighbors(X) + nn.fit(X) + nn.kneighbors(X) + + if metric in VALID_METRICS_SPARSE["brute"]: + nn = neighbors.NearestNeighbors( + n_neighbors=3, algorithm="auto", metric=metric + ).fit(Xcsr) + nn.kneighbors(Xcsr) def 
test_metric_params_interface(): @@ -1525,82 +1594,86 @@ def test_k_and_radius_neighbors_train_is_not_query(): assert_array_equal(rng.A, [[0, 1], [1, 1]]) -def test_k_and_radius_neighbors_X_None(): +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_k_and_radius_neighbors_X_None(algorithm): # Test kneighbors et.al when query is None - for algorithm in ALGORITHMS: + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) + + X = [[0], [1]] + nn.fit(X) + + dist, ind = nn.kneighbors() + assert_array_equal(dist, [[1], [1]]) + assert_array_equal(ind, [[1], [0]]) + dist, ind = nn.radius_neighbors(None, radius=1.5) + check_object_arrays(dist, [[1], [1]]) + check_object_arrays(ind, [[1], [0]]) + + # Test the graph variants. + rng = nn.radius_neighbors_graph(None, radius=1.5) + kng = nn.kneighbors_graph(None) + for graph in [rng, kng]: + assert_array_equal(graph.A, [[0, 1], [1, 0]]) + assert_array_equal(graph.data, [1, 1]) + assert_array_equal(graph.indices, [1, 0]) + + X = [[0, 1], [0, 1], [1, 1]] + nn = neighbors.NearestNeighbors(n_neighbors=2, algorithm=algorithm) + nn.fit(X) + assert_array_equal( + nn.kneighbors_graph().A, + np.array([[0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 0]]), + ) - nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) - X = [[0], [1]] - nn.fit(X) +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_k_and_radius_neighbors_duplicates(algorithm): + # Test behavior of kneighbors when duplicates are present in query + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) + duplicates = [[0], [1], [3]] - dist, ind = nn.kneighbors() - assert_array_equal(dist, [[1], [1]]) - assert_array_equal(ind, [[1], [0]]) - dist, ind = nn.radius_neighbors(None, radius=1.5) - check_object_arrays(dist, [[1], [1]]) - check_object_arrays(ind, [[1], [0]]) + nn.fit(duplicates) - # Test the graph variants. 
- rng = nn.radius_neighbors_graph(None, radius=1.5) - kng = nn.kneighbors_graph(None) - for graph in [rng, kng]: - assert_array_equal(graph.A, [[0, 1], [1, 0]]) - assert_array_equal(graph.data, [1, 1]) - assert_array_equal(graph.indices, [1, 0]) - - X = [[0, 1], [0, 1], [1, 1]] - nn = neighbors.NearestNeighbors(n_neighbors=2, algorithm=algorithm) - nn.fit(X) - assert_array_equal( - nn.kneighbors_graph().A, - np.array([[0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 0]]), - ) + # Do not do anything special to duplicates. + kng = nn.kneighbors_graph(duplicates, mode="distance") + assert_allclose( + kng.toarray(), np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) + ) + assert_allclose(kng.data, [0.0, 0.0, 0.0]) + assert_allclose(kng.indices, [0, 1, 2]) + dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5) + check_object_arrays(dist, [[0, 1], [1, 0]]) + check_object_arrays(ind, [[0, 1], [0, 1]]) -def test_k_and_radius_neighbors_duplicates(): - # Test behavior of kneighbors when duplicates are present in query + rng = nn.radius_neighbors_graph(duplicates, radius=1.5) + assert_allclose( + rng.toarray(), np.array([[1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]]) + ) - for algorithm in ALGORITHMS: - nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm) - nn.fit([[0], [1]]) - - # Do not do anything special to duplicates. 
- kng = nn.kneighbors_graph([[0], [1]], mode="distance") - assert_array_equal(kng.A, np.array([[0.0, 0.0], [0.0, 0.0]])) - assert_array_equal(kng.data, [0.0, 0.0]) - assert_array_equal(kng.indices, [0, 1]) - - dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5) - check_object_arrays(dist, [[0, 1], [1, 0]]) - check_object_arrays(ind, [[0, 1], [0, 1]]) - - rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5) - assert_array_equal(rng.A, np.ones((2, 2))) - - rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance") - rng.sort_indices() - assert_array_equal(rng.A, [[0, 1], [1, 0]]) - assert_array_equal(rng.indices, [0, 1, 0, 1]) - assert_array_equal(rng.data, [0, 1, 1, 0]) - - # Mask the first duplicates when n_duplicates > n_neighbors. - X = np.ones((3, 1)) - nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute") - nn.fit(X) - dist, ind = nn.kneighbors() - assert_array_equal(dist, np.zeros((3, 1))) - assert_array_equal(ind, [[1], [0], [1]]) - - # Test that zeros are explicitly marked in kneighbors_graph. - kng = nn.kneighbors_graph(mode="distance") - assert_array_equal(kng.A, np.zeros((3, 3))) - assert_array_equal(kng.data, np.zeros(3)) - assert_array_equal(kng.indices, [1.0, 0.0, 1.0]) - assert_array_equal( - nn.kneighbors_graph().A, - np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), - ) + rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance") + rng.sort_indices() + assert_allclose(rng.toarray(), [[0, 1, 0], [1, 0, 0]]) + assert_allclose(rng.indices, [0, 1, 0, 1]) + assert_allclose(rng.data, [0, 1, 1, 0]) + + # Mask the first duplicates when n_duplicates > n_neighbors. + X = np.ones((3, 1)) + nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute") + nn.fit(X) + dist, ind = nn.kneighbors() + assert_allclose(dist, np.zeros((3, 1))) + assert_allclose(ind, [[1], [0], [1]]) + + # Test that zeros are explicitly marked in kneighbors_graph. 
+ kng = nn.kneighbors_graph(mode="distance") + assert_allclose(kng.toarray(), np.zeros((3, 3))) + assert_allclose(kng.data, np.zeros(3)) + assert_allclose(kng.indices, [1, 0, 1]) + assert_allclose( + nn.kneighbors_graph().toarray(), + np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), + ) def test_include_self_neighbors_graph(): From d392122e77eeff79b0e68d6b9a244cfacb8d4d3a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Fri, 14 Jan 2022 18:19:17 +0100 Subject: [PATCH 05/34] Add whats_new entry --- doc/whats_new/v1.1.rst | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index d14cd278f67a1..c10a86ef5cf8e 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -481,6 +481,37 @@ Changelog left corner of the HTML representation to show how the elements are clickable. :pr:`21298` by `Thomas Fan`_. +Miscellaneous +............. + +- |Efficiency| Low-level routines for reductions on pairwise distances + for dense float64 datasets have been refactored. 
The following functions + and estimators now benefit from improved performances, in particular on + multi-cores machines: + - :func:`sklearn.metrics.pairwise_distances_argmin` + - :func:`sklearn.metrics.pairwise_distances_argmin_min` + - :class:`sklearn.cluster.AffinityPropagation` + - :class:`sklearn.cluster.Birch` + - :class:`sklearn.cluster.MeanShift` + - :class:`sklearn.cluster.OPTICS` + - :class:`sklearn.cluster.SpectralClustering` + - :func:`sklearn.feature_selection.mutual_info_regression` + - :class:`sklearn.neighbors.KNeighborsClassifier` + - :class:`sklearn.neighbors.KNeighborsRegressor` + - :class:`sklearn.neighbors.LocalOutlierFactor` + - :class:`sklearn.neighbors.NearestNeighbors` + - :class:`sklearn.manifold.Isomap` + - :class:`sklearn.manifold.LocallyLinearEmbedding` + - :class:`sklearn.manifold.TSNE` + - :func:`sklearn.manifold.trustworthiness` + - :class:`sklearn.semi_supervised.LabelPropagation` + - :class:`sklearn.semi_supervised.LabelSpreading` + + For instance :class:`sklearn.neighbors.NearestNeighbors.kneighbors` + can be up to 20× faster than in the previous versions'. + + :pr:`21462` by :user:`Julien Jerphanion `. 
+ Code and Documentation Contributors ----------------------------------- From 5e1d07113bb6990c32f2febeb23dcde9d16cb7ee Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 18 Jan 2022 10:36:47 +0100 Subject: [PATCH 06/34] Change input validation order for kneighbors --- sklearn/neighbors/_base.py | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 802b4ec639112..ea3d1581218d4 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -402,7 +402,9 @@ def _check_algorithm_metric(self): def _fit(self, X, y=None): if self._get_tags()["requires_y"]: if not isinstance(X, (KDTree, BallTree, NeighborsBase)): - X, y = self._validate_data(X, y, accept_sparse="csr", multi_output=True) + X, y = self._validate_data( + X, y, accept_sparse="csr", multi_output=True, order="C" + ) if is_classifier(self): # Classification targets require a specific format @@ -437,7 +439,7 @@ def _fit(self, X, y=None): else: if not isinstance(X, (KDTree, BallTree, NeighborsBase)): - X = self._validate_data(X, accept_sparse="csr") + X = self._validate_data(X, accept_sparse="csr", order="C") self._check_algorithm_metric() if self.metric_params is None: @@ -712,37 +714,17 @@ class from an array representing our data set and ask who's % type(n_neighbors) ) - use_pairwise_distances_reductions = ( - self._fit_method == "brute" - and PairwiseDistancesArgKmin.is_usable_for( - X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ - ) - ) - query_is_train = X is None if query_is_train: - if use_pairwise_distances_reductions: - # We force the C-contiguity even if it creates a copy for F-ordered - # arrays because PairwiseDistancesArgKmin is more efficient. 
- self._fit_X = self._validate_data( - self._fit_X, accept_sparse="csr", reset=False, order="C" - ) X = self._fit_X # Include an extra neighbor to account for the sample itself being # returned, which is removed later n_neighbors += 1 else: - if use_pairwise_distances_reductions: - # We force the C-contiguity even if it creates a copy for F-ordered - # arrays because PairwiseDistancesArgKmin is more efficient. - X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") - self._fit_X = self._validate_data( - self._fit_X, accept_sparse="csr", reset=False, order="C" - ) - elif self._metric == "precomputed": + if self._metric == "precomputed": X = _check_precomputed(X) else: - X = self._validate_data(X, accept_sparse="csr", reset=False) + X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") n_samples_fit = self.n_samples_fit_ if n_neighbors > n_samples_fit: @@ -753,6 +735,12 @@ class from an array representing our data set and ask who's n_jobs = effective_n_jobs(self.n_jobs) chunked_results = None + use_pairwise_distances_reductions = ( + self._fit_method == "brute" + and PairwiseDistancesArgKmin.is_usable_for( + X if X is not None else self._fit_X, self._fit_X, self.effective_metric_ + ) + ) if use_pairwise_distances_reductions: results = PairwiseDistancesArgKmin.compute( X=X, From 0e8ebb529784cb57943f2e7a39339258e37a03f6 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 17 Jan 2022 11:14:49 +0100 Subject: [PATCH 07/34] Remove duplicated test_neighbors_distance_metric_deprecation Co-authored-by: Thomas J. 
Fan --- sklearn/metrics/tests/test_dist_metrics.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 809081c28236b..4b1688430e3e9 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -64,16 +64,6 @@ def dist_func(x1, x2, p): ) -# TODO: remove this test in 1.3 -def test_neighbors_distance_metric_deprecation(): - from sklearn.neighbors import DistanceMetric as DeprecatedDistanceMetric - - with pytest.warns( - FutureWarning, match="sklearn.neighbors.DistanceMetric has been moved" - ): - DeprecatedDistanceMetric.get_metric("euclidean") - - @pytest.mark.parametrize("metric", METRICS_DEFAULT_PARAMS) def check_cdist(metric, kwargs, X1, X2): if metric == "wminkowski": From f15f271f604105e7ce5431191cd9d09528ca7419 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 17 Jan 2022 11:21:26 +0100 Subject: [PATCH 08/34] Adapt the documentation Co-authored-by: Thomas J. Fan --- sklearn/metrics/_pairwise_distances_reduction.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 329cf90158105..7e9c6e2e7e76a 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -621,10 +621,10 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): Indices of the argkmin for each vector in X. If return_distance=True: - - argkmin_indices : ndarray of shape (n_samples_X, k) - Indices of the argkmin for each vector in X. - argkmin_distances : ndarray of shape (n_samples_X, k) Distances to the argkmin for each vector in X. + - argkmin_indices : ndarray of shape (n_samples_X, k) + Indices of the argkmin for each vector in X. 
Notes ----- From 0f0e440c3ee6cbc453fe567396b6b09c20dca5e5 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 18 Jan 2022 09:33:06 +0100 Subject: [PATCH 09/34] Add mahalanobis case to test fixtures --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 6546022f88351..d0bc35caf49db 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -42,7 +42,6 @@ def _get_dummy_metric_params_list(metric: str, n_features: int): # Recent scipy versions accept weights in the Minkowski metric directly: # type: ignore minkowski_kwargs.append(dict(p=3, w=rng.rand(n_features))) - return minkowski_kwargs # TODO: remove this case for "wminkowski" once we no longer support scipy < 1.8.0. @@ -59,6 +58,12 @@ def _get_dummy_metric_params_list(metric: str, n_features: int): if metric == "seuclidean": return [dict(V=rng.rand(n_features))] + if metric == "mahalanobis": + A = rng.rand(n_features, n_features) + # Make the matrix symmetric positive definite + VI = A + A.T + 3 * np.eye(n_features) + return [dict(VI=VI)] + # Case of: "euclidean", "manhattan", "chebyshev", "haversine" or any other metric. # In those cases, no kwargs is needed. 
return [{}] From 3448b01a86ad25e9dc232c345c4ec80d7abb6beb Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 19 Jan 2022 10:18:06 +0100 Subject: [PATCH 10/34] Correct whats_new entry --- doc/whats_new/v1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index c10a86ef5cf8e..6d322457dc177 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -508,7 +508,7 @@ Miscellaneous - :class:`sklearn.semi_supervised.LabelSpreading` For instance :class:`sklearn.neighbors.NearestNeighbors.kneighbors` - can be up to 20× faster than in the previous versions'. + can be up to ×20 faster than in the previous versions'. :pr:`21462` by :user:`Julien Jerphanion `. From afdaaa1f5b7f4cbc6c568ae64ad08b1413c0d1ef Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 20 Jan 2022 17:13:25 +0100 Subject: [PATCH 11/34] CLN Remove unneeded private metric attribute This was needed when 'fast_sqeuclidean' and 'fast_euclidean' were present to choose the best implementation based on the user specification. Those metrics have been removed since then, making this attribute useless. Co-authored-by: Thomas J.
Fan --- sklearn/neighbors/_base.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index ea3d1581218d4..66ac38d0173f4 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -354,33 +354,31 @@ def _check_algorithm_metric(self): if self.algorithm not in ["auto", "brute", "kd_tree", "ball_tree"]: raise ValueError("unrecognized algorithm: '%s'" % self.algorithm) - self._metric = self.metric - if self.algorithm == "auto": - if self._metric == "precomputed": + if self.metric == "precomputed": alg_check = "brute" - elif callable(self._metric) or self._metric in VALID_METRICS["ball_tree"]: + elif callable(self.metric) or self.metric in VALID_METRICS["ball_tree"]: alg_check = "ball_tree" else: alg_check = "brute" else: alg_check = self.algorithm - if callable(self._metric): + if callable(self.metric): if self.algorithm == "kd_tree": # callable metric is only valid for brute force and ball_tree raise ValueError( "kd_tree does not support callable metric '%s'" "Function call overhead will result" "in very poor performance." - % self._metric + % self.metric ) - elif self._metric not in VALID_METRICS[alg_check]: + elif self.metric not in VALID_METRICS[alg_check]: raise ValueError( "Metric '%s' not valid. Use " "sorted(sklearn.neighbors.VALID_METRICS['%s']) " "to get valid options. " - "Metric can also be a callable function." % (self._metric, alg_check) + "Metric can also be a callable function." 
% (self.metric, alg_check) ) if self.metric_params is not None and "p" in self.metric_params: @@ -396,7 +394,7 @@ def _check_algorithm_metric(self): else: effective_p = self.p - if self._metric in ["wminkowski", "minkowski"] and effective_p < 1: + if self.metric in ["wminkowski", "minkowski"] and effective_p < 1: raise ValueError("p must be greater or equal to one for minkowski metric") def _fit(self, X, y=None): @@ -448,12 +446,12 @@ def _fit(self, X, y=None): self.effective_metric_params_ = self.metric_params.copy() effective_p = self.effective_metric_params_.get("p", self.p) - if self._metric in ["wminkowski", "minkowski"]: + if self.metric in ["wminkowski", "minkowski"]: self.effective_metric_params_["p"] = effective_p - self.effective_metric_ = self._metric + self.effective_metric_ = self.metric # For minkowski distance, use more efficient methods where available - if self._metric == "minkowski": + if self.metric == "minkowski": p = self.effective_metric_params_.pop("p", 2) w = self.effective_metric_params_.pop("w", None) if p < 1: @@ -492,7 +490,7 @@ def _fit(self, X, y=None): self.n_samples_fit_ = X.data.shape[0] return self - if self._metric == "precomputed": + if self.metric == "precomputed": X = _check_precomputed(X) # Precomputed matrix X must be squared if X.shape[0] != X.shape[1]: @@ -534,7 +532,7 @@ def _fit(self, X, y=None): # A tree approach is better for small number of neighbors or small # number of features, with KDTree generally faster when available if ( - self._metric == "precomputed" + self.metric == "precomputed" or self._fit_X.shape[1] > 15 or ( self.n_neighbors is not None @@ -721,7 +719,7 @@ class from an array representing our data set and ask who's # returned, which is removed later n_neighbors += 1 else: - if self._metric == "precomputed": + if self.metric == "precomputed": X = _check_precomputed(X) else: X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") @@ -754,9 +752,7 @@ class from an array representing our 
data set and ask who's ) elif ( - self._fit_method == "brute" - and self._metric == "precomputed" - and issparse(X) + self._fit_method == "brute" and self.metric == "precomputed" and issparse(X) ): results = _kneighbors_from_graph( X, n_neighbors=n_neighbors, return_distance=return_distance From 34566a7ac9c7053da05e53a7261e103a495eced1 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 24 Jan 2022 22:17:26 +0100 Subject: [PATCH 12/34] TST Assert FutureWarning instead of DeprecationWarning in test_neighbors_metrics --- sklearn/neighbors/tests/test_neighbors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 4594d1ade1a25..8a47ec35ca2d6 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1493,7 +1493,7 @@ def test_neighbors_metrics( and algorithm == "brute" and sp_version >= parse_version("1.6.0") ): - ExceptionToAssert = DeprecationWarning + ExceptionToAssert = FutureWarning with pytest.warns(ExceptionToAssert): results[algorithm] = neigh.kneighbors(X_test, return_distance=True) From 3524735502737294b63da13f9a70776d117d551f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 25 Jan 2022 08:51:18 +0100 Subject: [PATCH 13/34] MAINT Add use_pairwise_dist_activate to scikit-learn config --- sklearn/_config.py | 17 +++++++++++++++++ .../metrics/_pairwise_distances_reduction.pyx | 4 +++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/sklearn/_config.py b/sklearn/_config.py index fd0b30da7a82c..796221360964d 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -48,6 +48,7 @@ def set_config( print_changed_only=None, display=None, pairwise_dist_chunk_size=None, + use_pairwise_dist_activate=None, ): """Set global scikit-learn configuration @@ -93,6 +94,12 @@ def set_config( .. 
versionadded:: 1.1 + use_pairwise_dist_activate : bool, default=None + Use PairwiseDistancesReduction when possible. + Default is True. + + .. versionadded:: 1.1 + See Also -------- config_context : Context manager for global scikit-learn configuration. @@ -110,6 +117,8 @@ def set_config( local_config["display"] = display if pairwise_dist_chunk_size is not None: local_config["pairwise_dist_chunk_size"] = pairwise_dist_chunk_size + if use_pairwise_dist_activate is not None: + local_config["use_pairwise_dist_activate"] = use_pairwise_dist_activate @contextmanager @@ -120,6 +129,7 @@ def config_context( print_changed_only=None, display=None, pairwise_dist_chunk_size=None, + use_pairwise_dist_activate=None, ): """Context manager for global scikit-learn configuration. @@ -164,6 +174,12 @@ def config_context( .. versionadded:: 1.1 + use_pairwise_dist_activate : bool, default=None + Use PairwiseDistancesReduction when possible. + Default is True. + + .. versionadded:: 1.1 + Yields ------ None. @@ -198,6 +214,7 @@ def config_context( print_changed_only=print_changed_only, display=display, pairwise_dist_chunk_size=pairwise_dist_chunk_size, + use_pairwise_dist_activate=use_pairwise_dist_activate, ) try: diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 7e9c6e2e7e76a..625d01f9a6e98 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -17,6 +17,7 @@ cimport numpy as np import numpy as np import warnings +import sklearn from .. import get_config from libc.stdlib cimport free, malloc from libc.float cimport DBL_MAX @@ -211,7 +212,8 @@ cdef class PairwiseDistancesReduction: True if the PairwiseDistancesReduction can be used, else False. 
""" # TODO: support sparse arrays and 32 bits - return (not issparse(X) and X.dtype == np.float64 and + return (sklearn.get_config().get("use_pairwise_dist_activate", True) and + not issparse(X) and X.dtype == np.float64 and not issparse(Y) and Y.dtype == np.float64 and metric in cls.valid_metrics()) From 6b396b0ee1ac8686eb253d6cf7e3388678233681 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 25 Jan 2022 08:56:23 +0100 Subject: [PATCH 14/34] TST Add a test for the 'brute' backends' results' consistency Co-authored-by: Thomas J. Fan --- sklearn/neighbors/tests/test_neighbors.py | 62 +++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 8a47ec35ca2d6..f1a7f80756a98 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -13,8 +13,12 @@ issparse, ) -from sklearn import metrics -from sklearn import neighbors, datasets +from sklearn import ( + config_context, + datasets, + metrics, + neighbors, +) from sklearn.base import clone from sklearn.exceptions import DataConversionWarning from sklearn.exceptions import EfficiencyWarning @@ -1450,7 +1454,6 @@ def test_neighbors_metrics( metric, n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5 ): # Test computing the neighbors for various metrics - # create a symmetric matrix algorithms = ["brute", "ball_tree", "kd_tree"] X_train = rng.rand(n_samples, n_features) X_test = rng.rand(n_query_pts, n_features) @@ -1513,6 +1516,59 @@ def test_neighbors_metrics( assert_array_equal(ball_tree_idx, kd_tree_idx) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") +@pytest.mark.parametrize( + "metric", sorted(set(neighbors.VALID_METRICS["brute"]) - set(["precomputed"])) +) +def test_kneighbors_brute_backend( + metric, n_samples=2000, n_features=30, 
n_query_pts=100, n_neighbors=5 +): + # Both backend for the 'brute' algorithm of kneighbors must gives identical results. + X_train = rng.rand(n_samples, n_features) + X_test = rng.rand(n_query_pts, n_features) + + # Haversine distance only accepts 2D data + if metric == "haversine": + feature_sl = slice(None, 2) + X_train = np.ascontiguousarray(X_train[:, feature_sl]) + X_test = np.ascontiguousarray(X_test[:, feature_sl]) + + metric_params_list = _generate_test_params_for(metric, n_features) + + # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0 + ExceptionToAssert = None + if metric == "wminkowski" and sp_version >= parse_version("1.6.0"): + ExceptionToAssert = FutureWarning + + for metric_params in metric_params_list: + p = metric_params.pop("p", 2) + + neigh = neighbors.NearestNeighbors( + n_neighbors=n_neighbors, + algorithm="brute", + metric=metric, + p=p, + metric_params=metric_params, + ) + + neigh.fit(X_train) + with pytest.warns(ExceptionToAssert): + with config_context(use_pairwise_dist_activate=False): + # Use the legacy back-end for brute + legacy_brute_dst, legacy_brute_idx = neigh.kneighbors( + X_test, return_distance=True + ) + with config_context(use_pairwise_dist_activate=True): + # Use the PairwiseDistancesReduction as a back-end for brute + pdr_brute_dst, pdr_brute_idx = neigh.kneighbors( + X_test, return_distance=True + ) + + assert_allclose(legacy_brute_dst, pdr_brute_dst) + assert_array_equal(legacy_brute_idx, pdr_brute_idx) + + def test_callable_metric(): def custom_metric(x1, x2): return np.sqrt(np.sum(x1 ** 2 + x2 ** 2)) From aa1f86facc2658e2bd21a05a87e7004aa8e303c2 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 25 Jan 2022 09:28:38 +0100 Subject: [PATCH 15/34] fixup! 
MAINT Add use_pairwise_dist_activate to scikit-learn config --- sklearn/_config.py | 15 ++++++++------- sklearn/metrics/_pairwise_distances_reduction.pyx | 2 +- sklearn/tests/test_config.py | 3 +++ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/sklearn/_config.py b/sklearn/_config.py index 796221360964d..def36fdec1ecc 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -12,6 +12,7 @@ "pairwise_dist_chunk_size": int( os.environ.get("SKLEARN_PAIRWISE_DIST_CHUNK_SIZE", 256) ), + "use_pairwise_dist": True, } _threadlocal = threading.local() @@ -48,7 +49,7 @@ def set_config( print_changed_only=None, display=None, pairwise_dist_chunk_size=None, - use_pairwise_dist_activate=None, + use_pairwise_dist=None, ): """Set global scikit-learn configuration @@ -94,7 +95,7 @@ def set_config( .. versionadded:: 1.1 - use_pairwise_dist_activate : bool, default=None + use_pairwise_dist : bool, default=None Use PairwiseDistancesReduction when possible. Default is True. @@ -117,8 +118,8 @@ def set_config( local_config["display"] = display if pairwise_dist_chunk_size is not None: local_config["pairwise_dist_chunk_size"] = pairwise_dist_chunk_size - if use_pairwise_dist_activate is not None: - local_config["use_pairwise_dist_activate"] = use_pairwise_dist_activate + if use_pairwise_dist is not None: + local_config["use_pairwise_dist"] = use_pairwise_dist @contextmanager @@ -129,7 +130,7 @@ def config_context( print_changed_only=None, display=None, pairwise_dist_chunk_size=None, - use_pairwise_dist_activate=None, + use_pairwise_dist=None, ): """Context manager for global scikit-learn configuration. @@ -174,7 +175,7 @@ def config_context( .. versionadded:: 1.1 - use_pairwise_dist_activate : bool, default=None + use_pairwise_dist : bool, default=None Use PairwiseDistancesReduction when possible. Default is True. 
@@ -214,7 +215,7 @@ def config_context( print_changed_only=print_changed_only, display=display, pairwise_dist_chunk_size=pairwise_dist_chunk_size, - use_pairwise_dist_activate=use_pairwise_dist_activate, + use_pairwise_dist=use_pairwise_dist, ) try: diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 625d01f9a6e98..f6f42660516b5 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -212,7 +212,7 @@ cdef class PairwiseDistancesReduction: True if the PairwiseDistancesReduction can be used, else False. """ # TODO: support sparse arrays and 32 bits - return (sklearn.get_config().get("use_pairwise_dist_activate", True) and + return (sklearn.get_config().get("use_pairwise_dist", True) and not issparse(X) and X.dtype == np.float64 and not issparse(Y) and Y.dtype == np.float64 and metric in cls.valid_metrics()) diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index e99eb5fc9db82..5666a8d4537d0 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -17,6 +17,7 @@ def test_config_context(): "print_changed_only": True, "display": "text", "pairwise_dist_chunk_size": 256, + "use_pairwise_dist": True, } # Not using as a context manager affects nothing @@ -30,6 +31,7 @@ def test_config_context(): "print_changed_only": True, "display": "text", "pairwise_dist_chunk_size": 256, + "use_pairwise_dist": True, } assert get_config()["assume_finite"] is False @@ -60,6 +62,7 @@ def test_config_context(): "print_changed_only": True, "display": "text", "pairwise_dist_chunk_size": 256, + "use_pairwise_dist": True, } # No positional arguments From 305d21702d006eec3ab826e322e555ed8dfdd1ec Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 25 Jan 2022 11:37:11 +0100 Subject: [PATCH 16/34] fixup! fixup! 
MAINT Add use_pairwise_dist_activate to scikit-learn config --- sklearn/neighbors/tests/test_neighbors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index f1a7f80756a98..052e7035e051e 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1554,12 +1554,12 @@ def test_kneighbors_brute_backend( neigh.fit(X_train) with pytest.warns(ExceptionToAssert): - with config_context(use_pairwise_dist_activate=False): + with config_context(use_pairwise_dist=False): # Use the legacy back-end for brute legacy_brute_dst, legacy_brute_idx = neigh.kneighbors( X_test, return_distance=True ) - with config_context(use_pairwise_dist_activate=True): + with config_context(use_pairwise_dist=True): # Use the PairwiseDistancesReduction as a back-end for brute pdr_brute_dst, pdr_brute_idx = neigh.kneighbors( X_test, return_distance=True From eced3165be505c8b6968e4aecb6913647167bc9a Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 25 Jan 2022 13:55:02 +0100 Subject: [PATCH 17/34] TST Filter FutureWarning for WMinkowskiDistance --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index d0bc35caf49db..193ec18900c4b 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -310,7 +310,8 @@ def test_strategies_consistency( # Concrete PairwiseDistancesReductions tests - +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("n_features", [50, 500]) @pytest.mark.parametrize("translation", [0, 1e6]) @pytest.mark.parametrize("metric", 
CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS) From b84259d3208961e05182f4afe82965413e266e80 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 24 Jan 2022 22:43:33 -0500 Subject: [PATCH 18/34] MAINT pin numpydoc in arm for now (#22292) --- build_tools/circle/build_test_arm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/circle/build_test_arm.sh b/build_tools/circle/build_test_arm.sh index 67beaae5dba31..5ea185eed726c 100755 --- a/build_tools/circle/build_test_arm.sh +++ b/build_tools/circle/build_test_arm.sh @@ -60,7 +60,7 @@ fi if [[ "$TEST_DOCSTRINGS" == "true" ]]; then # numpydoc requires sphinx mamba install --verbose -y sphinx - mamba install --verbose -y numpydoc + mamba install --verbose -y "numpydoc<1.2" fi python --version From d63713e01ebb8c3ade7d9b86f5e239dc8552c903 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 25 Jan 2022 15:13:51 +0100 Subject: [PATCH 19/34] fixup! TST Filter FutureWarning for WMinkowskiDistance --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 193ec18900c4b..f202094213bf0 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -241,6 +241,8 @@ def test_n_threads_agnosticism( ASSERT_RESULT[PairwiseDistancesReduction](ref_dist, dist, ref_indices, indices) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("seed", range(5)) @pytest.mark.parametrize("n_samples", [100, 1000]) @pytest.mark.parametrize("metric", PairwiseDistancesReduction.valid_metrics()) From 4ad35091eee806dd111c97e446428b58a82321f9 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 25 Jan 2022 17:35:13 +0100 Subject: 
[PATCH 20/34] Revert keywords arguments removal for the GEMM trick for 'euclidean' --- sklearn/neighbors/_base.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index ce0207d8b4e5d..0c6dfceb5cf42 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -670,7 +670,10 @@ def _kneighbors_reduce_func(self, dist, start, n_neighbors, return_distance): # argpartition doesn't guarantee sorted order, so we sort again neigh_ind = neigh_ind[sample_range, np.argsort(dist[sample_range, neigh_ind])] if return_distance: - result = dist[sample_range, neigh_ind], neigh_ind + if self.effective_metric_ == "euclidean": + result = np.sqrt(dist[sample_range, neigh_ind]), neigh_ind + else: + result = dist[sample_range, neigh_ind], neigh_ind else: result = neigh_ind return result @@ -795,6 +798,12 @@ class from an array representing our data set and ask who's return_distance=return_distance, ) + # for efficiency, use squared euclidean distances + if self.effective_metric_ == "euclidean": + kwds = {"squared": True} + else: + kwds = self.effective_metric_params_ + chunked_results = list( pairwise_distances_chunked( X, @@ -802,7 +811,7 @@ class from an array representing our data set and ask who's reduce_func=reduce_func, metric=self.effective_metric_, n_jobs=n_jobs, - **self.effective_metric_params_, + **kwds, ) ) From 948b04cb6e22214c1af762c79d5285b50a1fbdd7 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Mon, 24 Jan 2022 20:46:30 +0100 Subject: [PATCH 21/34] MAINT pin max numpydoc for now (#22286) --- build_tools/azure/install.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 8fd9b0b9cc67f..baa1da332f8a5 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -147,7 +147,9 @@ fi if [[ "$TEST_DOCSTRINGS" == "true" ]]; then # numpydoc requires sphinx python -m pip 
install sphinx - python -m pip install numpydoc + # TODO: update the docstring checks to be compatible with new + # numpydoc versions + python -m pip install "numpydoc<1.2" fi python --version From ca45a58c8617c52c9a15709974314e29c82512a5 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 26 Jan 2022 18:13:41 +0100 Subject: [PATCH 22/34] Add 'haversine' to CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS Co-authored-by: Olivier Grisel --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index f202094213bf0..94c2c9a975e23 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -24,6 +24,7 @@ "chebyshev", "cityblock", "euclidean", + "haversine", "minkowski", "seuclidean", ] From ee3c43d51e435b9c82893991d413fa366ef0360d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 26 Jan 2022 18:17:19 +0100 Subject: [PATCH 23/34] fixup! 
Add 'haversine' to CDIST_PAIRWISE_DISTANCES_REDUCTION_COMMON_METRICS --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 94c2c9a975e23..f6092fea0d0d1 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -333,6 +333,11 @@ def test_pairwise_distances_argkmin( X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread + # Haversine distance only accept 2D data + if metric == "haversine": + X = np.ascontiguousarray(X[:, :2]) + Y = np.ascontiguousarray(Y[:, :2]) + metric_kwargs = _get_dummy_metric_params_list(metric, n_features)[0] # Reference for argkmin results From c8de77cfec2594049cbc6ac17ef5f225262a5b54 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 27 Jan 2022 10:49:59 +0100 Subject: [PATCH 24/34] Apply suggestions from code review Co-authored-by: Olivier Grisel --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 1 + sklearn/neighbors/_base.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index f6092fea0d0d1..e9ae597d6c79e 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -43,6 +43,7 @@ def _get_dummy_metric_params_list(metric: str, n_features: int): # Recent scipy versions accept weights in the Minkowski metric directly: # type: ignore minkowski_kwargs.append(dict(p=3, w=rng.rand(n_features))) + return minkowski_kwargs # TODO: remove this case for "wminkowski" once we no longer support scipy < 1.8.0. 
diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py index 0c6dfceb5cf42..9755d4c8c8f76 100644 --- a/sklearn/neighbors/_base.py +++ b/sklearn/neighbors/_base.py @@ -790,7 +790,8 @@ class from an array representing our data set and ask who's ) elif self._fit_method == "brute": - # TODO: support sparse matrices + # TODO: should no longer be needed once PairwiseDistancesArgKmin + # is extended to accept sparse and/or float32 inputs. reduce_func = partial( self._kneighbors_reduce_func, @@ -1107,7 +1108,9 @@ class from an array representing our data set and ask who's ) elif self._fit_method == "brute": - # TODO: support sparse matrices + # TODO: should no longer be needed once we have Cython-optimized + # implementation for radius queries, with support for sparse and/or + # float32 inputs. # for efficiency, use squared euclidean distances if self.effective_metric_ == "euclidean": From 2feec54051c4020a16c272d4403790e6807f70fe Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 27 Jan 2022 11:26:59 +0100 Subject: [PATCH 25/34] MAINT Document some config parameters for maintenance Also rename one of them. Co-authored-by: Thomas J. 
Fan Co-authored-by: Olivier Grisel --- sklearn/_config.py | 32 ++++++++++++++----- .../metrics/_pairwise_distances_reduction.pyx | 2 +- sklearn/neighbors/tests/test_neighbors.py | 4 +-- sklearn/tests/test_config.py | 6 ++-- 4 files changed, 30 insertions(+), 14 deletions(-) diff --git a/sklearn/_config.py b/sklearn/_config.py index def36fdec1ecc..2acf8401c4e3f 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -12,7 +12,7 @@ "pairwise_dist_chunk_size": int( os.environ.get("SKLEARN_PAIRWISE_DIST_CHUNK_SIZE", 256) ), - "use_pairwise_dist": True, + "enable_cython_pairwise_dist": True, } _threadlocal = threading.local() @@ -49,7 +49,7 @@ def set_config( print_changed_only=None, display=None, pairwise_dist_chunk_size=None, - use_pairwise_dist=None, + enable_cython_pairwise_dist=None, ): """Set global scikit-learn configuration @@ -93,12 +93,20 @@ def set_config( The number of vectors per chunk for PairwiseDistancesReduction. Default is 256 (suitable for most of modern laptops' caches and architectures). + Intended for easier benchmarking and testing of scikit-learn internals. + End users are not expected to benefit from customizing this configuration + setting. + .. versionadded:: 1.1 - use_pairwise_dist : bool, default=None + enable_cython_pairwise_dist : bool, default=None Use PairwiseDistancesReduction when possible. Default is True. + Intended for easier benchmarking and testing of scikit-learn internals. + End users are not expected to benefit from customizing this configuration + setting. + .. 
versionadded:: 1.1 See Also @@ -118,8 +126,8 @@ def set_config( local_config["display"] = display if pairwise_dist_chunk_size is not None: local_config["pairwise_dist_chunk_size"] = pairwise_dist_chunk_size - if use_pairwise_dist is not None: - local_config["use_pairwise_dist"] = use_pairwise_dist + if enable_cython_pairwise_dist is not None: + local_config["enable_cython_pairwise_dist"] = enable_cython_pairwise_dist @contextmanager @@ -130,7 +138,7 @@ def config_context( print_changed_only=None, display=None, pairwise_dist_chunk_size=None, - use_pairwise_dist=None, + enable_cython_pairwise_dist=None, ): """Context manager for global scikit-learn configuration. @@ -173,12 +181,20 @@ def config_context( The number of vectors per chunk for PairwiseDistancesReduction. Default is 256 (suitable for most of modern laptops' caches and architectures). + Intended for easier benchmarking and testing of scikit-learn internals. + End users are not expected to benefit from customizing this configuration + setting. + .. versionadded:: 1.1 - use_pairwise_dist : bool, default=None + enable_cython_pairwise_dist : bool, default=None Use PairwiseDistancesReduction when possible. Default is True. + Intended for easier benchmarking and testing of scikit-learn internals. + End users are not expected to benefit from customizing this configuration + setting. + .. 
versionadded:: 1.1 Yields @@ -215,7 +231,7 @@ def config_context( print_changed_only=print_changed_only, display=display, pairwise_dist_chunk_size=pairwise_dist_chunk_size, - use_pairwise_dist=use_pairwise_dist, + enable_cython_pairwise_dist=enable_cython_pairwise_dist, ) try: diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index f6f42660516b5..340ed4ac0286c 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -212,7 +212,7 @@ cdef class PairwiseDistancesReduction: True if the PairwiseDistancesReduction can be used, else False. """ # TODO: support sparse arrays and 32 bits - return (sklearn.get_config().get("use_pairwise_dist", True) and + return (sklearn.get_config().get("enable_cython_pairwise_dist", True) and not issparse(X) and X.dtype == np.float64 and not issparse(Y) and Y.dtype == np.float64 and metric in cls.valid_metrics()) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 052e7035e051e..0221b72264e40 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1554,12 +1554,12 @@ def test_kneighbors_brute_backend( neigh.fit(X_train) with pytest.warns(ExceptionToAssert): - with config_context(use_pairwise_dist=False): + with config_context(enable_cython_pairwise_dist=False): # Use the legacy back-end for brute legacy_brute_dst, legacy_brute_idx = neigh.kneighbors( X_test, return_distance=True ) - with config_context(use_pairwise_dist=True): + with config_context(enable_cython_pairwise_dist=True): # Use the PairwiseDistancesReduction as a back-end for brute pdr_brute_dst, pdr_brute_idx = neigh.kneighbors( X_test, return_distance=True diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py index 5666a8d4537d0..f2c5b96a6703b 100644 --- a/sklearn/tests/test_config.py +++ b/sklearn/tests/test_config.py @@ -17,7 
+17,7 @@ def test_config_context(): "print_changed_only": True, "display": "text", "pairwise_dist_chunk_size": 256, - "use_pairwise_dist": True, + "enable_cython_pairwise_dist": True, } # Not using as a context manager affects nothing @@ -31,7 +31,7 @@ def test_config_context(): "print_changed_only": True, "display": "text", "pairwise_dist_chunk_size": 256, - "use_pairwise_dist": True, + "enable_cython_pairwise_dist": True, } assert get_config()["assume_finite"] is False @@ -62,7 +62,7 @@ def test_config_context(): "print_changed_only": True, "display": "text", "pairwise_dist_chunk_size": 256, - "use_pairwise_dist": True, + "enable_cython_pairwise_dist": True, } # No positional arguments From 7a4c1373d287c68aa639b8a98735419a1881f729 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 27 Jan 2022 11:47:05 +0100 Subject: [PATCH 26/34] FIX Support and test one of 'sqeuclidean' specification Co-authored-by: Olivier Grisel --- sklearn/metrics/pairwise.py | 12 ++++++++++++ sklearn/metrics/tests/test_pairwise.py | 22 ++++++++++++++++------ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 8baabcf193653..5b1f1032ed781 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -667,6 +667,12 @@ def pairwise_distances_argmin_min( metric_kwargs = {} if PairwiseDistancesArgKmin.is_usable_for(X, Y, metric): + # This is an adaptation for one "sqeuclidean" specification. + # For this back-end, we can directly use "sqeuclidean". + if metric_kwargs.get("squared", False) and metric == "euclidean": + metric = "sqeuclidean" + metric_kwargs = {} + values, indices = PairwiseDistancesArgKmin.compute( X=X, Y=Y, @@ -772,6 +778,12 @@ def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs metric_kwargs = {} if PairwiseDistancesArgKmin.is_usable_for(X, Y, metric): + # This is an adaptation for one "sqeuclidean" specification. 
+ # For this back-end, we can directly use "sqeuclidean". + if metric_kwargs.get("squared", False) and metric == "euclidean": + metric = "sqeuclidean" + metric_kwargs = {} + indices = PairwiseDistancesArgKmin.compute( X=X, Y=Y, diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 3a3fb35b1e35c..efcf63955c239 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -432,10 +432,11 @@ def test_paired_distances_callable(): paired_distances(X, Y) -def test_pairwise_distances_argmin_min(): +@pytest.mark.parametrize("dtype", (np.float32, np.float64)) +def test_pairwise_distances_argmin_min(dtype): # Check pairwise minimum distances computation for any metric - X = [[0], [1]] - Y = [[-2], [3]] + X = np.asarray([[0], [1]], dtype=dtype) + Y = np.asarray([[-2], [3]], dtype=dtype) Xsp = dok_matrix(X) Ysp = csr_matrix(Y, dtype=np.float32) @@ -458,14 +459,23 @@ def test_pairwise_distances_argmin_min(): assert type(idxsp) == np.ndarray assert type(valssp) == np.ndarray - # euclidean metric squared # Squared Euclidean metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="sqeuclidean") - idx2 = pairwise_distances_argmin(X, Y, metric="sqeuclidean") + idx2, vals2 = pairwise_distances_argmin_min( + X, Y, metric="euclidean", metric_kwargs={"squared": True} + ) + idx3 = pairwise_distances_argmin(X, Y, metric="sqeuclidean") + idx4 = pairwise_distances_argmin( + X, Y, metric="euclidean", metric_kwargs={"squared": True} + ) - assert_array_almost_equal(idx, expected_idx) assert_array_almost_equal(vals, expected_vals_sq) + assert_array_almost_equal(vals2, expected_vals_sq) + + assert_array_almost_equal(idx, expected_idx) assert_array_almost_equal(idx2, expected_idx) + assert_array_almost_equal(idx3, expected_idx) + assert_array_almost_equal(idx4, expected_idx) # Non-euclidean scikit-learn metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan") From 
ea762b745c76416a4e52467d1c8b2a302a0a67cf Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 27 Jan 2022 11:54:09 +0100 Subject: [PATCH 27/34] FIX Various typos fix and correct haversine 'haversine' is not supported by cdist. --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 3 +-- sklearn/neighbors/tests/test_neighbors.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index e9ae597d6c79e..82351e9022d6a 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -24,7 +24,6 @@ "chebyshev", "cityblock", "euclidean", - "haversine", "minkowski", "seuclidean", ] @@ -334,7 +333,7 @@ def test_pairwise_distances_argkmin( X = translation + rng.rand(n_samples, n_features).astype(dtype) * spread Y = translation + rng.rand(n_samples, n_features).astype(dtype) * spread - # Haversine distance only accept 2D data + # Haversine distance only accepts 2D data if metric == "haversine": X = np.ascontiguousarray(X[:, :2]) Y = np.ascontiguousarray(Y[:, :2]) diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 0221b72264e40..538bfdafdd6a2 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1524,7 +1524,7 @@ def test_neighbors_metrics( def test_kneighbors_brute_backend( metric, n_samples=2000, n_features=30, n_query_pts=100, n_neighbors=5 ): - # Both backend for the 'brute' algorithm of kneighbors must gives identical results. + # Both backend for the 'brute' algorithm of kneighbors must give identical results. 
X_train = rng.rand(n_samples, n_features) X_test = rng.rand(n_query_pts, n_features) From b9cb0f4c94929017a39d09776c471c56779f95e5 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 27 Jan 2022 17:44:25 +0100 Subject: [PATCH 28/34] Directly use get_config --- sklearn/metrics/_pairwise_distances_reduction.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 340ed4ac0286c..a6118dedd80da 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -17,7 +17,6 @@ cimport numpy as np import numpy as np import warnings -import sklearn from .. import get_config from libc.stdlib cimport free, malloc from libc.float cimport DBL_MAX @@ -212,7 +211,7 @@ cdef class PairwiseDistancesReduction: True if the PairwiseDistancesReduction can be used, else False. """ # TODO: support sparse arrays and 32 bits - return (sklearn.get_config().get("enable_cython_pairwise_dist", True) and + return (get_config().get("enable_cython_pairwise_dist", True) and not issparse(X) and X.dtype == np.float64 and not issparse(Y) and Y.dtype == np.float64 and metric in cls.valid_metrics()) From 2df70b1f3629f710fcc5a1f57b2d352b6a76ff2b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 1 Feb 2022 16:28:24 +0100 Subject: [PATCH 29/34] CLN Apply comments from review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Christian Lorentzen Co-authored-by: Jérémie du Boisberranger --- doc/whats_new/v1.1.rst | 1 + sklearn/_config.py | 2 +- sklearn/metrics/_pairwise_distances_reduction.pyx | 2 +- sklearn/metrics/pairwise.py | 8 ++++---- sklearn/metrics/tests/test_dist_metrics.py | 1 - sklearn/neighbors/tests/test_neighbors.py | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 
7e01eba77ac31..fb8254a3b5fe1 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -585,6 +585,7 @@ Miscellaneous can be up to ×20 faster than in the previous versions'. :pr:`21462` by :user:`Julien Jerphanion `. + - |Fix| :func:`check_scalar` raises an error when `include_boundaries={"left", "right"}` and the boundaries are not set. :pr:`22027` by `Marie Lanternier `. diff --git a/sklearn/_config.py b/sklearn/_config.py index 2acf8401c4e3f..6248025b05aa0 100644 --- a/sklearn/_config.py +++ b/sklearn/_config.py @@ -90,7 +90,7 @@ def set_config( .. versionadded:: 0.23 pairwise_dist_chunk_size : int, default=None - The number of vectors per chunk for PairwiseDistancesReduction. + The number of row vectors per chunk for PairwiseDistancesReduction. Default is 256 (suitable for most of modern laptops' caches and architectures). Intended for easier benchmarking and testing of scikit-learn internals. diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index a6118dedd80da..3833773d7e0f6 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -643,7 +643,7 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # Note (jjerphan): Some design thoughts for future extensions. # This factory comes to handle specialisations for the given arguments. # For future work, this might can be an entrypoint to specialise operations - # for various back-end and/or hardware and/or datatypes, and/or fused + # for various backend and/or hardware and/or datatypes, and/or fused # {sparse, dense}-datasetspair etc. 
if ( metric in ("euclidean", "sqeuclidean") diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 5b1f1032ed781..5c2768497c7a4 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -667,8 +667,8 @@ def pairwise_distances_argmin_min( metric_kwargs = {} if PairwiseDistancesArgKmin.is_usable_for(X, Y, metric): - # This is an adaptation for one "sqeuclidean" specification. - # For this back-end, we can directly use "sqeuclidean". + # This is an adaptor for one "sqeuclidean" specification. + # For this backend, we can directly use "sqeuclidean". if metric_kwargs.get("squared", False) and metric == "euclidean": metric = "sqeuclidean" metric_kwargs = {} @@ -778,8 +778,8 @@ def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs metric_kwargs = {} if PairwiseDistancesArgKmin.is_usable_for(X, Y, metric): - # This is an adaptation for one "sqeuclidean" specification. - # For this back-end, we can directly use "sqeuclidean". + # This is an adaptor for one "sqeuclidean" specification. + # For this backend, we can directly use "sqeuclidean". 
if metric_kwargs.get("squared", False) and metric == "euclidean": metric = "sqeuclidean" metric_kwargs = {} diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index c19faa0dad996..6c841d1d44f8c 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -65,7 +65,6 @@ def dist_func(x1, x2, p): ) -@pytest.mark.parametrize("metric", METRICS_DEFAULT_PARAMS) def check_cdist(metric, kwargs, X1, X2): if metric == "wminkowski": # wminkoski is deprecated in SciPy 1.6.0 and removed in 1.8.0 diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 538bfdafdd6a2..1009e9a713f7f 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1555,12 +1555,12 @@ def test_kneighbors_brute_backend( neigh.fit(X_train) with pytest.warns(ExceptionToAssert): with config_context(enable_cython_pairwise_dist=False): - # Use the legacy back-end for brute + # Use the legacy backend for brute legacy_brute_dst, legacy_brute_idx = neigh.kneighbors( X_test, return_distance=True ) with config_context(enable_cython_pairwise_dist=True): - # Use the PairwiseDistancesReduction as a back-end for brute + # Use the PairwiseDistancesReduction as a backend for brute pdr_brute_dst, pdr_brute_idx = neigh.kneighbors( X_test, return_distance=True ) From b6e6f3d38fb2f931cece6d87c4c569f977941dbf Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 1 Feb 2022 17:03:54 +0100 Subject: [PATCH 30/34] Motivate swapped returned values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger Co-authored-by: Thomas J. 
Fan --- sklearn/metrics/_pairwise_distances_reduction.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/metrics/_pairwise_distances_reduction.pyx b/sklearn/metrics/_pairwise_distances_reduction.pyx index 3833773d7e0f6..df0918bb61334 100644 --- a/sklearn/metrics/_pairwise_distances_reduction.pyx +++ b/sklearn/metrics/_pairwise_distances_reduction.pyx @@ -884,6 +884,10 @@ cdef class PairwiseDistancesArgKmin(PairwiseDistancesReduction): # We need to recompute distances because we relied on # surrogate distances for the reduction. self.compute_exact_distances() + + # Values are returned identically to the way `KNeighborsMixin.kneighbors` + # returns values. This is counter-intuitive but this allows not using + # complex adaptations where `PairwiseDistancesArgKmin.compute` is called. return np.asarray(self.argkmin_distances), np.asarray(self.argkmin_indices) return np.asarray(self.argkmin_indices) From 16d777f0c8885c5e2c2799b602d524446fa705ad Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 2 Feb 2022 08:26:08 +0100 Subject: [PATCH 31/34] TST Remove mahalanobis from test fixtures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérémie du Boisberranger --- sklearn/metrics/tests/test_pairwise_distances_reduction.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 82351e9022d6a..b9f3d7dbf3dd5 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -59,12 +59,6 @@ def _get_dummy_metric_params_list(metric: str, n_features: int): if metric == "seuclidean": return [dict(V=rng.rand(n_features))] - if metric == "mahalanobis": - A = rng.rand(n_features, n_features) - # Make the matrix symmetric positive definite - VI = A + A.T + 3 * np.eye(n_features) - return [dict(VI=VI)] - # Case 
of: "euclidean", "manhattan", "chebyshev", "haversine" or any other metric. # In those cases, no kwargs is needed. return [{}] From bd02da09d442898ef6bf7b647f9ced7719caf16d Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 2 Feb 2022 08:29:15 +0100 Subject: [PATCH 32/34] MNT Add comment regarding reduction functions' signatures Co-authored-by: Christian Lorentzen Co-authored-by: Olivier Grisel --- sklearn/metrics/pairwise.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 5c2768497c7a4..2eec32d1682af 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -578,6 +578,13 @@ def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None, batch_size=None): return distances +# start is specified in the signature of `_argmin_min_reduce` +# and of `_argmin_reduce` but is not used. +# This is because the higher order `pairwise_distances_chunked` +# function needs reduction functions that are passed as argument +# to have a two arguments signature. 
+ + def _argmin_min_reduce(dist, start): indices = dist.argmin(axis=1) values = dist[np.arange(dist.shape[0]), indices] From 5ea0427d60f79a8891ea0428916465a4c56cf8db Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Tue, 8 Feb 2022 22:50:39 +0100 Subject: [PATCH 33/34] TST Complete test for `pairwise_distance_{argmin,argmin_min}` (#22371) --- sklearn/metrics/tests/test_pairwise.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index efcf63955c239..b92b4416ffeb1 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -517,6 +517,30 @@ def test_pairwise_distances_argmin_min(dtype): np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7) np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7) + # Changing the axis and permuting datasets must give the same results + argmin_0, dist_0 = pairwise_distances_argmin_min(X, Y, axis=0) + argmin_1, dist_1 = pairwise_distances_argmin_min(Y, X, axis=1) + + assert_allclose(dist_0, dist_1) + assert_array_equal(argmin_0, argmin_1) + + argmin_0, dist_0 = pairwise_distances_argmin_min(X, X, axis=0) + argmin_1, dist_1 = pairwise_distances_argmin_min(X, X, axis=1) + + assert_allclose(dist_0, dist_1) + assert_array_equal(argmin_0, argmin_1) + + # Changing the axis and permuting datasets must give the same results + argmin_0 = pairwise_distances_argmin(X, Y, axis=0) + argmin_1 = pairwise_distances_argmin(Y, X, axis=1) + + assert_array_equal(argmin_0, argmin_1) + + argmin_0 = pairwise_distances_argmin(X, X, axis=0) + argmin_1 = pairwise_distances_argmin(X, X, axis=1) + + assert_array_equal(argmin_0, argmin_1) + def _reduce_func(dist, start): return dist[:, :100] From 5f5a83f9f3e0b89096e3eb868a49c07999f0b85b Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 9 Feb 2022 11:06:01 +0100 Subject: [PATCH 34/34] DOC Add sub-pull requests to the 
whats_new entry --- doc/whats_new/v1.1.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index fb8254a3b5fe1..613b085d5d42c 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -584,7 +584,8 @@ Miscellaneous For instance :class:`sklearn.neighbors.NearestNeighbors.kneighbors` can be up to ×20 faster than in the previous versions'. - :pr:`21462` by :user:`Julien Jerphanion <jjerphan>`. + :pr:`21987`, :pr:`22064`, :pr:`22065` and :pr:`22288` + by :user:`Julien Jerphanion <jjerphan>`. - |Fix| :func:`check_scalar` raises an error when `include_boundaries={"left", "right"}` and the boundaries are not set.