diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py
index 5dd10d83bfe68..bf258ea564c8c 100644
--- a/sklearn/metrics/tests/test_dist_metrics.py
+++ b/sklearn/metrics/tests/test_dist_metrics.py
@@ -39,6 +39,7 @@ def dist_func(x1, x2, p):
 VI = np.dot(V, V.T)
 
 BOOL_METRICS = [
+    "hamming",
     "matching",
     "jaccard",
     "dice",
diff --git a/sklearn/neighbors/_base.py b/sklearn/neighbors/_base.py
index 179a82e150f93..bcf448ae65c05 100644
--- a/sklearn/neighbors/_base.py
+++ b/sklearn/neighbors/_base.py
@@ -32,7 +32,7 @@
 from ..utils.multiclass import check_classification_targets
 from ..utils.validation import check_is_fitted
 from ..utils.validation import check_non_negative
-from ..utils.fixes import delayed
+from ..utils.fixes import delayed, sp_version
 from ..utils.fixes import parse_version
 from ..exceptions import DataConversionWarning, EfficiencyWarning
 
@@ -63,11 +63,15 @@
             "sokalsneath",
             "sqeuclidean",
             "yule",
-            "wminkowski",
         ]
     ),
 )
 
+# TODO: Remove in 1.3 when wminkowski is removed
+if sp_version < parse_version("1.8.0.dev0"):
+    # Before scipy 1.8.0.dev0, wminkowski was the key to use
+    # the weighted minkowski metric.
+    VALID_METRICS["brute"].append("wminkowski")
+
 VALID_METRICS_SPARSE = dict(
     ball_tree=[],
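For context: the version gate above exists because scipy changed the spelling of the weighted Minkowski metric. A minimal sketch of the two spellings, assuming `scipy.spatial.distance.cdist` and scikit-learn's version helpers (illustrative data only, mirroring the same `sp_version` check):

```python
import numpy as np
from scipy.spatial import distance
from sklearn.utils.fixes import sp_version, parse_version

rng = np.random.RandomState(0)
X, Y, w = rng.rand(5, 3), rng.rand(2, 3), rng.rand(3)

if sp_version < parse_version("1.8.0.dev0"):
    # Before scipy 1.8.0.dev0, "wminkowski" was the key for the
    # weighted Minkowski metric.
    D = distance.cdist(X, Y, metric="wminkowski", p=3, w=w)
else:
    # From scipy 1.8 on, the weights are passed to "minkowski" directly.
    D = distance.cdist(X, Y, metric="minkowski", p=3, w=w)
```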
diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 5ba940c0bd1a7..4594d1ade1a25 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -20,13 +20,21 @@
 from sklearn.exceptions import EfficiencyWarning
 from sklearn.exceptions import NotFittedError
 from sklearn.metrics.pairwise import pairwise_distances
+from sklearn.metrics.tests.test_dist_metrics import BOOL_METRICS
 from sklearn.model_selection import cross_val_score
 from sklearn.model_selection import train_test_split
 from sklearn.neighbors import VALID_METRICS_SPARSE
-from sklearn.neighbors._base import _is_sorted_by_data, _check_precomputed
+from sklearn.neighbors._base import (
+    _is_sorted_by_data,
+    _check_precomputed,
+    KNeighborsMixin,
+)
 from sklearn.pipeline import make_pipeline
-from sklearn.utils._testing import assert_array_almost_equal
-from sklearn.utils._testing import assert_array_equal
+from sklearn.utils._testing import (
+    assert_allclose,
+    assert_array_almost_equal,
+    assert_array_equal,
+)
 from sklearn.utils._testing import ignore_warnings
 from sklearn.utils.validation import check_random_state
 from sklearn.utils.fixes import sp_version, parse_version
@@ -50,6 +58,9 @@
 SPARSE_OR_DENSE = SPARSE_TYPES + (np.asarray,)
 
 ALGORITHMS = ("ball_tree", "brute", "kd_tree", "auto")
+COMMON_VALID_METRICS = sorted(
+    set.intersection(*map(set, neighbors.VALID_METRICS.values()))
+)
 P = (1, 2, 3, 4, np.inf)
 JOBLIB_BACKENDS = list(joblib.parallel.BACKENDS.keys())
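Illustrative only: how the `set.intersection` expression above collapses the per-algorithm metric lists into the metrics supported by every algorithm (toy lists here, not the real `VALID_METRICS`):

```python
VALID_METRICS = {
    "ball_tree": ["euclidean", "manhattan", "chebyshev", "haversine"],
    "kd_tree": ["euclidean", "manhattan", "chebyshev"],
    "brute": ["euclidean", "manhattan", "cosine"],
}
common = sorted(set.intersection(*map(set, VALID_METRICS.values())))
print(common)  # ['euclidean', 'manhattan']
```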
@@ -59,7 +70,7 @@
 
 
 def _generate_test_params_for(metric: str, n_features: int):
-    """Return list of dummy DistanceMetric kwargs for tests."""
+    """Return list of DistanceMetric kwargs for tests."""
 
     # Distinguish between cases to avoid computing unneeded data structures.
     rng = np.random.RandomState(1)
@@ -110,42 +121,153 @@ def _weight_func(dist):
     return retval ** 2
 
 
+@pytest.mark.parametrize(
+    "n_samples, n_features, n_query_pts, n_neighbors",
+    [
+        (100, 100, 10, 100),
+        (1000, 5, 100, 1),
+    ],
+)
+@pytest.mark.parametrize("query_is_train", [False, True])
+@pytest.mark.parametrize("metric", COMMON_VALID_METRICS)
 def test_unsupervised_kneighbors(
-    n_samples=20, n_features=5, n_query_pts=2, n_neighbors=5
+    n_samples,
+    n_features,
+    n_query_pts,
+    n_neighbors,
+    query_is_train,
+    metric,
 ):
-    # Test unsupervised neighbors methods
-    X = rng.rand(n_samples, n_features)
+    # The different algorithms must return identical results
+    # on their common metrics, with and without returning
+    # distances.
 
-    test = rng.rand(n_query_pts, n_features)
+    # Redefining the rng locally to use the same generated X
+    local_rng = np.random.RandomState(0)
+    X = local_rng.rand(n_samples, n_features)
 
-    for p in P:
-        results_nodist = []
-        results = []
+    query = X if query_is_train else local_rng.rand(n_query_pts, n_features)
 
-        for algorithm in ALGORITHMS:
-            neigh = neighbors.NearestNeighbors(
-                n_neighbors=n_neighbors, algorithm=algorithm, p=p
-            )
-            neigh.fit(X)
+    results_nodist = []
+    results = []
 
-            results_nodist.append(neigh.kneighbors(test, return_distance=False))
-            results.append(neigh.kneighbors(test, return_distance=True))
+    for algorithm in ALGORITHMS:
+        neigh = neighbors.NearestNeighbors(
+            n_neighbors=n_neighbors, algorithm=algorithm, metric=metric
+        )
+        neigh.fit(X)
 
-        for i in range(len(results) - 1):
-            assert_array_almost_equal(results_nodist[i], results[i][1])
-            assert_array_almost_equal(results[i][0], results[i + 1][0])
-            assert_array_almost_equal(results[i][1], results[i + 1][1])
+        results_nodist.append(neigh.kneighbors(query, return_distance=False))
+        results.append(neigh.kneighbors(query, return_distance=True))
+
+    for i in range(len(results) - 1):
+        algorithm = ALGORITHMS[i]
+        next_algorithm = ALGORITHMS[i + 1]
+
+        indices_no_dist = results_nodist[i]
+        distances, next_distances = results[i][0], results[i + 1][0]
+        indices, next_indices = results[i][1], results[i + 1][1]
+        assert_array_equal(
+            indices_no_dist,
+            indices,
+            err_msg=(
+                f"The '{algorithm}' algorithm returns different "
+                "indices depending on 'return_distance'."
+            ),
+        )
+        assert_array_equal(
+            indices,
+            next_indices,
+            err_msg=(
+                f"The '{algorithm}' and '{next_algorithm}' "
+                "algorithms return different indices."
+            ),
+        )
+        assert_allclose(
+            distances,
+            next_distances,
+            err_msg=(
+                f"The '{algorithm}' and '{next_algorithm}' "
+                "algorithms return different distances."
+            ),
+            atol=1e-6,
+        )
 
 
 @pytest.mark.parametrize(
-    "NearestNeighbors",
+    "n_samples, n_features, n_query_pts",
+    [
+        (100, 100, 10),
+        (1000, 5, 100),
+    ],
+)
+@pytest.mark.parametrize("metric", COMMON_VALID_METRICS)
+@pytest.mark.parametrize("n_neighbors, radius", [(1, 100), (50, 500), (100, 1000)])
+@pytest.mark.parametrize(
+    "NeighborsMixinSubclass",
+    [
+        neighbors.KNeighborsClassifier,
+        neighbors.KNeighborsRegressor,
+        neighbors.RadiusNeighborsClassifier,
+        neighbors.RadiusNeighborsRegressor,
+    ],
+)
+def test_neigh_predictions_algorithm_agnosticity(
+    n_samples,
+    n_features,
+    n_query_pts,
+    metric,
+    n_neighbors,
+    radius,
+    NeighborsMixinSubclass,
+):
+    # The different algorithms must return identical prediction results
+    # on their common metrics.
+
+    # Redefining the rng locally to use the same generated X
+    local_rng = np.random.RandomState(0)
+    X = local_rng.rand(n_samples, n_features)
+    y = local_rng.randint(3, size=n_samples)
+
+    query = local_rng.rand(n_query_pts, n_features)
+
+    predict_results = []
+
+    parameter = (
+        n_neighbors if issubclass(NeighborsMixinSubclass, KNeighborsMixin) else radius
+    )
+
+    for algorithm in ALGORITHMS:
+        neigh = NeighborsMixinSubclass(parameter, algorithm=algorithm, metric=metric)
+        neigh.fit(X, y)
+
+        predict_results.append(neigh.predict(query))
+
+    for i in range(len(predict_results) - 1):
+        algorithm = ALGORITHMS[i]
+        next_algorithm = ALGORITHMS[i + 1]
+
+        predictions, next_predictions = predict_results[i], predict_results[i + 1]
+
+        assert_allclose(
+            predictions,
+            next_predictions,
+            err_msg=(
+                f"The '{algorithm}' and '{next_algorithm}' "
+                "algorithms return different predictions."
+            ),
+        )
+
+
+@pytest.mark.parametrize(
+    "KNeighborsMixinSubclass",
     [
         neighbors.KNeighborsClassifier,
         neighbors.KNeighborsRegressor,
         neighbors.NearestNeighbors,
     ],
 )
-def test_unsupervised_inputs(NearestNeighbors):
+def test_unsupervised_inputs(KNeighborsMixinSubclass):
     # Test unsupervised inputs for neighbors estimators
 
     X = rng.random_sample((10, 3))
@@ -155,7 +277,7 @@ def test_unsupervised_inputs(NearestNeighbors):
 
     dist1, ind1 = nbrs_fid.kneighbors(X)
 
-    nbrs = NearestNeighbors(n_neighbors=1)
+    nbrs = KNeighborsMixinSubclass(n_neighbors=1)
 
     for data in (nbrs_fid, neighbors.BallTree(X), neighbors.KDTree(X)):
         nbrs.fit(data, y)
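A note on the `parameter` dispatch in the predictions test above: the first positional argument of the K-neighbors estimators is `n_neighbors`, while for the radius-based estimators it is `radius`, which is why a single positional value works for both families. A small sketch:

```python
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier

knn = KNeighborsClassifier(5)         # same as n_neighbors=5
rnn = RadiusNeighborsClassifier(1.0)  # same as radius=1.0
print(knn.n_neighbors, rnn.radius)    # 5 1.0
```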
@@ -1208,19 +1330,19 @@ def test_kneighbors_graph():
     assert_array_almost_equal(A.toarray(), [[1, 1, 1], [1, 1, 1], [1, 1, 1]])
 
 
-def test_kneighbors_graph_sparse(seed=36):
+@pytest.mark.parametrize("n_neighbors", [1, 2, 3])
+@pytest.mark.parametrize("mode", ["connectivity", "distance"])
+def test_kneighbors_graph_sparse(n_neighbors, mode, seed=36):
     # Test kneighbors_graph to build the k-Nearest Neighbor graph
     # for sparse input.
     rng = np.random.RandomState(seed)
     X = rng.randn(10, 10)
     Xcsr = csr_matrix(X)
 
-    for n_neighbors in [1, 2, 3]:
-        for mode in ["connectivity", "distance"]:
-            assert_array_almost_equal(
-                neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(),
-                neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(),
-            )
+    assert_array_almost_equal(
+        neighbors.kneighbors_graph(X, n_neighbors, mode=mode).toarray(),
+        neighbors.kneighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(),
+    )
 
 
 def test_radius_neighbors_graph():
@@ -1236,21 +1358,19 @@ def test_radius_neighbors_graph():
     )
 
 
-def test_radius_neighbors_graph_sparse(seed=36):
+@pytest.mark.parametrize("n_neighbors", [1, 2, 3])
+@pytest.mark.parametrize("mode", ["connectivity", "distance"])
+def test_radius_neighbors_graph_sparse(n_neighbors, mode, seed=36):
     # Test radius_neighbors_graph to build the Nearest Neighbor graph
     # for sparse input.
     rng = np.random.RandomState(seed)
     X = rng.randn(10, 10)
     Xcsr = csr_matrix(X)
 
-    for n_neighbors in [1, 2, 3]:
-        for mode in ["connectivity", "distance"]:
-            assert_array_almost_equal(
-                neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(),
-                neighbors.radius_neighbors_graph(
-                    Xcsr, n_neighbors, mode=mode
-                ).toarray(),
-            )
+    assert_array_almost_equal(
+        neighbors.radius_neighbors_graph(X, n_neighbors, mode=mode).toarray(),
+        neighbors.radius_neighbors_graph(Xcsr, n_neighbors, mode=mode).toarray(),
+    )
 
 
 def test_neighbors_badargs():
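For reference, the two `mode` values being parametrized here behave as follows; a minimal sketch with made-up data:

```python
import numpy as np
from sklearn.neighbors import kneighbors_graph

X = np.array([[0.0], [1.0], [3.0]])
conn = kneighbors_graph(X, n_neighbors=1, mode="connectivity")
dist = kneighbors_graph(X, n_neighbors=1, mode="distance")
print(conn.toarray())  # 0/1 entries marking each sample's nearest neighbor
print(dist.toarray())  # same sparsity pattern, with distances 1.0, 1.0, 2.0
```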
@@ -1317,57 +1437,34 @@ def test_neighbors_badargs():
 
 # TODO: Remove filterwarnings in 1.3 when wminkowski is removed
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
-def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5):
+@pytest.mark.parametrize(
+    "metric",
+    sorted(
+        set(neighbors.VALID_METRICS["ball_tree"]).intersection(
+            neighbors.VALID_METRICS["brute"]
+        )
+        - set(["pyfunc", *BOOL_METRICS])
+    ),
+)
+def test_neighbors_metrics(
+    metric, n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5
+):
     # Test computing the neighbors for various metrics
-    # create a symmetric matrix
-    V = rng.rand(n_features, n_features)
-    VI = np.dot(V, V.T)
-
-    metrics = [
-        ("euclidean", {}),
-        ("manhattan", {}),
-        ("minkowski", dict(p=1)),
-        ("minkowski", dict(p=2)),
-        ("minkowski", dict(p=3)),
-        ("minkowski", dict(p=np.inf)),
-        ("chebyshev", {}),
-        ("seuclidean", dict(V=rng.rand(n_features))),
-        ("mahalanobis", dict(VI=VI)),
-        ("haversine", {}),
-    ]
-    if sp_version < parse_version("1.8.0.dev0"):
-        # TODO: remove once we no longer support scipy < 1.8.0.
-        # wminkowski was removed in scipy 1.8.0 but should work for previous
-        # versions.
-        metrics.append(
-            ("wminkowski", dict(p=3, w=rng.rand(n_features))),
-        )
-    else:
-        # Recent scipy versions accept weights in the Minkowski metric directly:
-        metrics.append(
-            ("minkowski", dict(p=3, w=rng.rand(n_features))),
-        )
     algorithms = ["brute", "ball_tree", "kd_tree"]
-    X = rng.rand(n_samples, n_features)
+    X_train = rng.rand(n_samples, n_features)
+    X_test = rng.rand(n_query_pts, n_features)
 
-    test = rng.rand(n_query_pts, n_features)
+    metric_params_list = _generate_test_params_for(metric, n_features)
 
-    for metric, metric_params in metrics:
+    for metric_params in metric_params_list:
+        # Some metrics (e.g. weighted Minkowski) are not supported by KDTree
+        exclude_kd_tree = metric not in neighbors.VALID_METRICS["kd_tree"] or (
+            "minkowski" in metric and "w" in metric_params
+        )
         results = {}
         p = metric_params.pop("p", 2)
-        w = metric_params.get("w", None)
         for algorithm in algorithms:
-            # KD tree doesn't support all metrics
-            if algorithm == "kd_tree" and (
-                metric not in neighbors.KDTree.valid_metrics or w is not None
-            ):
-                est = neighbors.NearestNeighbors(
-                    algorithm=algorithm, metric=metric, metric_params=metric_params
-                )
-                with pytest.raises(ValueError):
-                    est.fit(X)
-                continue
             neigh = neighbors.NearestNeighbors(
                 n_neighbors=n_neighbors,
                 algorithm=algorithm,
@@ -1376,10 +1473,18 @@ def test_neighbors_metrics(metric, n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5):
                 metric_params=metric_params,
             )
 
+            if exclude_kd_tree and algorithm == "kd_tree":
+                with pytest.raises(ValueError):
+                    neigh.fit(X_train)
+                continue
+
             # Haversine distance only accepts 2D data
-            feature_sl = slice(None, 2) if metric == "haversine" else slice(None)
+            if metric == "haversine":
+                feature_sl = slice(None, 2)
+                X_train = np.ascontiguousarray(X_train[:, feature_sl])
+                X_test = np.ascontiguousarray(X_test[:, feature_sl])
 
-            neigh.fit(X[:, feature_sl])
+            neigh.fit(X_train)
 
             # wminkowski is deprecated in SciPy 1.6.0 and removed in 1.8.0
             ExceptionToAssert = None
@@ -1391,15 +1496,21 @@ def test_neighbors_metrics(metric, n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5):
                 ExceptionToAssert = DeprecationWarning
 
             with pytest.warns(ExceptionToAssert):
-                results[algorithm] = neigh.kneighbors(
-                    test[:, feature_sl], return_distance=True
-                )
+                results[algorithm] = neigh.kneighbors(X_test, return_distance=True)
+
+    brute_dst, brute_idx = results["brute"]
+    ball_tree_dst, ball_tree_idx = results["ball_tree"]
+
+    assert_allclose(brute_dst, ball_tree_dst)
+    assert_array_equal(brute_idx, ball_tree_idx)
+
+    if not exclude_kd_tree:
+        kd_tree_dst, kd_tree_idx = results["kd_tree"]
+        assert_allclose(brute_dst, kd_tree_dst)
+        assert_array_equal(brute_idx, kd_tree_idx)
 
-        assert_array_almost_equal(results["brute"][0], results["ball_tree"][0])
-        assert_array_almost_equal(results["brute"][1], results["ball_tree"][1])
-        if "kd_tree" in results:
-            assert_array_almost_equal(results["brute"][0], results["kd_tree"][0])
-            assert_array_almost_equal(results["brute"][1], results["kd_tree"][1])
+        assert_allclose(ball_tree_dst, kd_tree_dst)
+        assert_array_equal(ball_tree_idx, kd_tree_idx)
 
 
 def test_callable_metric():
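The `_generate_test_params_for` helper called above supplies metric-specific kwargs. Roughly, as a hedged sketch (the real lists live in the test modules; values below follow the removed `metrics` list):

```python
import numpy as np

rng = np.random.RandomState(1)
n_features = 3

# "minkowski" is swept over several p values (plus weights on recent scipy)
minkowski_kwargs = [dict(p=1), dict(p=2), dict(p=3), dict(p=np.inf)]

# "seuclidean" needs a vector of per-feature variances
seuclidean_kwargs = [dict(V=rng.rand(n_features))]

# "mahalanobis" needs a symmetric positive semi-definite matrix
V = rng.rand(n_features, n_features)
mahalanobis_kwargs = [dict(VI=np.dot(V, V.T))]
```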
@@ -1427,10 +1538,6 @@ def custom_metric(x1, x2):
 
 @pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize("metric", neighbors.VALID_METRICS["brute"])
 def test_valid_brute_metric_for_auto_algorithm(metric, n_samples=20, n_features=12):
-    # Any valid metric for algorithm="brute" must be a valid for algorithm="auto".
-    # It's the responsibility of the estimator to select which algorithm is likely
-    # to be the most efficient from the subset of the algorithm compatible with
-    # that metric (and params). Worst case is to fallback to algorithm="brute".
     X = rng.rand(n_samples, n_features)
     Xcsr = csr_matrix(X)
 
@@ -1457,7 +1564,8 @@ def test_valid_brute_metric_for_auto_algorithm(metric, n_samples=20, n_features=12):
     )
     # Haversine distance only accepts 2D data
     if metric == "haversine":
-        X = np.ascontiguousarray(X[:, :2])
+        feature_sl = slice(None, 2)
+        X = np.ascontiguousarray(X[:, feature_sl])
 
     nn.fit(X)
     nn.kneighbors(X)
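On the haversine branch above: the metric is defined on (latitude, longitude) pairs in radians, hence the slice down to two features. A short sketch with illustrative data:

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X = rng.rand(20, 3)

nn = NearestNeighbors(n_neighbors=3, metric="haversine")
nn.fit(np.ascontiguousarray(X[:, :2]))  # two features: OK
# Fitting the full 3-feature X would raise a ValueError.
```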
@@ -1559,82 +1667,86 @@ def test_k_and_radius_neighbors_train_is_not_query():
         assert_array_equal(rng.A, [[0, 1], [1, 1]])
 
 
-def test_k_and_radius_neighbors_X_None():
+@pytest.mark.parametrize("algorithm", ALGORITHMS)
+def test_k_and_radius_neighbors_X_None(algorithm):
     # Test kneighbors et al. when query is None
-    for algorithm in ALGORITHMS:
+    nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm)
+
+    X = [[0], [1]]
+    nn.fit(X)
+
+    dist, ind = nn.kneighbors()
+    assert_array_equal(dist, [[1], [1]])
+    assert_array_equal(ind, [[1], [0]])
+    dist, ind = nn.radius_neighbors(None, radius=1.5)
+    check_object_arrays(dist, [[1], [1]])
+    check_object_arrays(ind, [[1], [0]])
+
+    # Test the graph variants.
+    rng = nn.radius_neighbors_graph(None, radius=1.5)
+    kng = nn.kneighbors_graph(None)
+    for graph in [rng, kng]:
+        assert_array_equal(graph.A, [[0, 1], [1, 0]])
+        assert_array_equal(graph.data, [1, 1])
+        assert_array_equal(graph.indices, [1, 0])
+
+    X = [[0, 1], [0, 1], [1, 1]]
+    nn = neighbors.NearestNeighbors(n_neighbors=2, algorithm=algorithm)
+    nn.fit(X)
+    assert_array_equal(
+        nn.kneighbors_graph().A,
+        np.array([[0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 0]]),
+    )
 
-        nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm)
-        X = [[0], [1]]
-        nn.fit(X)
 
+@pytest.mark.parametrize("algorithm", ALGORITHMS)
+def test_k_and_radius_neighbors_duplicates(algorithm):
+    # Test behavior of kneighbors when duplicates are present in query
+    nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm)
+    duplicates = [[0], [1], [3]]
 
-        dist, ind = nn.kneighbors()
-        assert_array_equal(dist, [[1], [1]])
-        assert_array_equal(ind, [[1], [0]])
-        dist, ind = nn.radius_neighbors(None, radius=1.5)
-        check_object_arrays(dist, [[1], [1]])
-        check_object_arrays(ind, [[1], [0]])
+    nn.fit(duplicates)
 
-        # Test the graph variants.
-        rng = nn.radius_neighbors_graph(None, radius=1.5)
-        kng = nn.kneighbors_graph(None)
-        for graph in [rng, kng]:
-            assert_array_equal(graph.A, [[0, 1], [1, 0]])
-            assert_array_equal(graph.data, [1, 1])
-            assert_array_equal(graph.indices, [1, 0])
-
-        X = [[0, 1], [0, 1], [1, 1]]
-        nn = neighbors.NearestNeighbors(n_neighbors=2, algorithm=algorithm)
-        nn.fit(X)
-        assert_array_equal(
-            nn.kneighbors_graph().A,
-            np.array([[0.0, 1.0, 1.0], [1.0, 0.0, 1.0], [1.0, 1.0, 0]]),
-        )
+    # Do not do anything special to duplicates.
+    kng = nn.kneighbors_graph(duplicates, mode="distance")
+    assert_allclose(
+        kng.toarray(), np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
+    )
+    assert_allclose(kng.data, [0.0, 0.0, 0.0])
+    assert_allclose(kng.indices, [0, 1, 2])
 
+    dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5)
+    check_object_arrays(dist, [[0, 1], [1, 0]])
+    check_object_arrays(ind, [[0, 1], [0, 1]])
 
-def test_k_and_radius_neighbors_duplicates():
-    # Test behavior of kneighbors when duplicates are present in query
+    rng = nn.radius_neighbors_graph(duplicates, radius=1.5)
+    assert_allclose(
+        rng.toarray(), np.array([[1.0, 1.0, 0.0], [1.0, 1.0, 0.0], [0.0, 0.0, 1.0]])
+    )
 
-    for algorithm in ALGORITHMS:
-        nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm=algorithm)
-        nn.fit([[0], [1]])
-
-        # Do not do anything special to duplicates.
-        kng = nn.kneighbors_graph([[0], [1]], mode="distance")
-        assert_array_equal(kng.A, np.array([[0.0, 0.0], [0.0, 0.0]]))
-        assert_array_equal(kng.data, [0.0, 0.0])
-        assert_array_equal(kng.indices, [0, 1])
-
-        dist, ind = nn.radius_neighbors([[0], [1]], radius=1.5)
-        check_object_arrays(dist, [[0, 1], [1, 0]])
-        check_object_arrays(ind, [[0, 1], [0, 1]])
-
-        rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5)
-        assert_array_equal(rng.A, np.ones((2, 2)))
-
-        rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance")
-        rng.sort_indices()
-        assert_array_equal(rng.A, [[0, 1], [1, 0]])
-        assert_array_equal(rng.indices, [0, 1, 0, 1])
-        assert_array_equal(rng.data, [0, 1, 1, 0])
-
-        # Mask the first duplicates when n_duplicates > n_neighbors.
-        X = np.ones((3, 1))
-        nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute")
-        nn.fit(X)
-        dist, ind = nn.kneighbors()
-        assert_array_equal(dist, np.zeros((3, 1)))
-        assert_array_equal(ind, [[1], [0], [1]])
-
-        # Test that zeros are explicitly marked in kneighbors_graph.
-        kng = nn.kneighbors_graph(mode="distance")
-        assert_array_equal(kng.A, np.zeros((3, 3)))
-        assert_array_equal(kng.data, np.zeros(3))
-        assert_array_equal(kng.indices, [1.0, 0.0, 1.0])
-        assert_array_equal(
-            nn.kneighbors_graph().A,
-            np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]),
-        )
+    rng = nn.radius_neighbors_graph([[0], [1]], radius=1.5, mode="distance")
+    rng.sort_indices()
+    assert_allclose(rng.toarray(), [[0, 1, 0], [1, 0, 0]])
+    assert_allclose(rng.indices, [0, 1, 0, 1])
+    assert_allclose(rng.data, [0, 1, 1, 0])
+
+    # Mask the first duplicates when n_duplicates > n_neighbors.
+    X = np.ones((3, 1))
+    nn = neighbors.NearestNeighbors(n_neighbors=1, algorithm="brute")
+    nn.fit(X)
+    dist, ind = nn.kneighbors()
+    assert_allclose(dist, np.zeros((3, 1)))
+    assert_allclose(ind, [[1], [0], [1]])
+
+    # Test that zeros are explicitly marked in kneighbors_graph.
+    kng = nn.kneighbors_graph(mode="distance")
+    assert_allclose(kng.toarray(), np.zeros((3, 3)))
+    assert_allclose(kng.data, np.zeros(3))
+    assert_allclose(kng.indices, [1, 0, 1])
+    assert_allclose(
+        nn.kneighbors_graph().toarray(),
+        np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]),
+    )
 
 
 def test_include_self_neighbors_graph():
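The "explicitly marked zeros" assertions at the end encode two subtleties worth spelling out: a query of `None` makes each training point's own entry ineligible as its neighbor, and with `mode="distance"` zero distances are stored as explicit CSR entries rather than being dropped. A compact sketch, mirroring the data used in the test:

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

X = np.ones((3, 1))
nn = NearestNeighbors(n_neighbors=1, algorithm="brute").fit(X)

kng = nn.kneighbors_graph(mode="distance")  # query=None: self is excluded
print(kng.toarray())  # all zeros
print(kng.data)       # [0. 0. 0.]: zeros kept as explicit entries
print(kng.indices)    # [1 0 1]: the neighbor each stored zero points to
```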