From 3c381db958aecc63a0d40c4292b605adb79f03b9 Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Sat, 4 Dec 2021 12:17:42 +0800 Subject: [PATCH 01/10] Deprecate wminkowski & modify minkowski weights --- sklearn/cluster/tests/test_hierarchical.py | 2 + sklearn/metrics/_dist_metrics.pyx | 65 ++++++++++++------- sklearn/metrics/tests/test_dist_metrics.py | 35 ++++++++++ sklearn/metrics/tests/test_pairwise.py | 2 + sklearn/neighbors/tests/test_neighbors.py | 4 ++ .../neighbors/tests/test_neighbors_tree.py | 2 + 6 files changed, 88 insertions(+), 22 deletions(-) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index d9bbf11e660f4..650443a94f0c7 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -410,6 +410,8 @@ def test_vector_scikit_single_vs_scipy_single(seed): assess_same_labelling(cut, cut_scipy) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) def test_mst_linkage_core_memory_mapped(metric_param_grid): """The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset. diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index f7d22c1badfa2..470b207aaf990 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -119,9 +119,12 @@ cdef class DistanceMetric: "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` ============== ==================== ======== =============================== - Note that "minkowski" with a non-None `w` parameter actually calls - `WMinkowskiDistance` with `w=w ** (1/p)` in order to be consistent with the - parametrization of scipy 1.8 and later. + .. deprecated:: 1.1 + `WMinkowskiDistance` is deprecated in version 1.1 and will be removed in version 1.3. + Use `MinkowskiDistance` instead. Note that in `MinkowskiDistance`, the weights are + applied to the absolute differences already raised to the p power. This is different from + `WMinkowskiDistance` where weights are applied to the absolute differences before raising + to the p power. The deprecation aims to remain consistent with Scipy-1.8 convention. **Metrics intended for two-dimensional vector spaces:** Note that the haversine distance metric requires data in the form of [latitude, longitude] and both @@ -257,25 +260,14 @@ cdef class DistanceMetric: if metric is MinkowskiDistance: p = kwargs.pop('p', 2) w = kwargs.pop('w', None) - if w is not None: - # Be consistent with scipy 1.8 conventions: in scipy 1.8, - # 'wminkowski' was removed in favor of passing a - # weight vector directly to 'minkowski', however - # the new weights apply to the absolute differences raised to - # the p power instead of the absolute difference as in - # previous versions of scipy. - # WMinkowskiDistance in sklearn implements the weighting - # scheme of the old 'wminkowski' in scipy < 1.8, hence the - # following adaptation: - return WMinkowskiDistance(p, w ** (1/p), **kwargs) - if p == 1: + if p == 1 and w is None: return ManhattanDistance(**kwargs) - elif p == 2: + elif p == 2 and w is None: return EuclideanDistance(**kwargs) - elif np.isinf(p): + elif np.isinf(p) and w is None: return ChebyshevDistance(**kwargs) else: - return MinkowskiDistance(p, **kwargs) + return MinkowskiDistance(p, w, **kwargs) else: return metric(**kwargs) @@ -554,27 +546,49 @@ cdef class MinkowskiDistance(DistanceMetric): r"""Minkowski Distance .. math:: - D(x, y) = [\sum_i |x_i - y_i|^p] ^ (1/p) + D(x, y) = [\sum_i w_i *|x_i - y_i|^p] ^ (1/p) + + Parameters + ---------- + p : int + The order of the norm of the difference :math:`{||u-v||}_p`. + w : (N,) array-like (optional) + The weight vector. Minkowski Distance requires p >= 1 and finite. For p = infinity, use ChebyshevDistance. Note that for p=1, ManhattanDistance is more efficient, and for p=2, EuclideanDistance is more efficient. """ - def __init__(self, p): + def __init__(self, p, w=None): if p < 1: raise ValueError("p must be greater than 1") elif np.isinf(p): raise ValueError("MinkowskiDistance requires finite p. " "For p=inf, use ChebyshevDistance.") + elif w is not None and any(w_i < 0 for w_i in w): + raise ValueError("w cannot contain negative weights") + self.p = p + self.vec = ReadonlyArrayWrapper(np.asarray([], dtype=DTYPE)) + self.size = 0 + if w is not None: + self.vec = ReadonlyArrayWrapper(np.asarray(w, dtype=DTYPE)) + self.size = self.vec.shape[0] + + def _validate_data(self, X): + if self.size > 0 and X.shape[1] != self.size: + raise ValueError('MinkowskiDistance dist: ' + 'size of w does not match') cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: - cdef DTYPE_t d=0 + cdef DTYPE_t vec_j, d=0 cdef np.intp_t j + cdef bint has_w = self.size > 0 for j in range(size): - d += pow(fabs(x1[j] - x2[j]), self.p) + vec_j = self.vec[j] if has_w else 1. + d += vec_j * pow(fabs(x1[j] - x2[j]), self.p) return d cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, @@ -595,6 +609,7 @@ cdef class MinkowskiDistance(DistanceMetric): #------------------------------------------------------------ +# TODO: Remove in 1.3 - WMinkowskiDistance class # W-Minkowski Distance cdef class WMinkowskiDistance(DistanceMetric): r"""Weighted Minkowski Distance @@ -613,6 +628,12 @@ cdef class WMinkowskiDistance(DistanceMetric): """ def __init__(self, p, w): + from warnings import warn + warn("WMinkowskiDistance is deprecated in version 1.1 and will be " + "removed in version 1.3. Use MinkowskiDistance instead. Note " + "that in MinkowskiDistance, the weights are applied to the " + "absolute differences raised to the p power.", FutureWarning) + if p < 1: raise ValueError("p must be greater than 1") elif np.isinf(p): diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 6f55c16cb1c28..69171039e60db 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -70,6 +70,9 @@ def dist_func(x1, x2, p): METRICS_DEFAULT_PARAMS.append( ("wminkowski", dict(p=(1, 1.5, 3), w=(rng.random_sample(d),))), ) +METRICS_DEFAULT_PARAMS.append( + ("minkowski", dict(p=(1, 1.5, 3), w=(rng.random_sample(d),))), +) def check_cdist(metric, kwargs, X1, X2): @@ -88,6 +91,8 @@ def check_cdist(metric, kwargs, X1, X2): assert_array_almost_equal(D_sklearn, D_scipy_cdist) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) @pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)]) def test_cdist(metric_param_grid, X1, X2): @@ -119,6 +124,8 @@ def check_cdist_bool(metric, D_true): assert_array_almost_equal(D12, D_true) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) @pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)]) def test_pdist(metric_param_grid, X1, X2): @@ -169,6 +176,8 @@ def check_pdist_bool(metric, D_true): assert_array_almost_equal(D12, D_true) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("use_read_only_kwargs", [True, False]) @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) def test_pickle(use_read_only_kwargs, metric_param_grid): @@ -183,6 +192,8 @@ def test_pickle(use_read_only_kwargs, metric_param_grid): check_pickle(metric, kwargs) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize("metric", BOOL_METRICS) @pytest.mark.parametrize("X1_bool", [X1_bool, X1_bool_mmap]) def test_pickle_bool_metrics(metric, X1_bool): @@ -260,6 +271,8 @@ def custom_metric(x, y): assert_array_almost_equal(pyfunc.pairwise(X), eucl.pairwise(X) ** 2) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") def test_readonly_kwargs(): # Non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/21685 @@ -275,3 +288,25 @@ def test_readonly_kwargs(): DistanceMetric.get_metric("seuclidean", V=weights) DistanceMetric.get_metric("wminkowski", p=1, w=weights) DistanceMetric.get_metric("mahalanobis", VI=VI) + + +def test_minkowski_metric_validate_weights(): + w1 = rng.random_sample(d) + w1[0] = -1337 + msg = "w cannot contain negative weights" + with pytest.raises(ValueError, match=msg): + DistanceMetric.get_metric("minkowski", p=3, w=w1) + + w2 = rng.random_sample(d + 1) + dm = DistanceMetric.get_metric("minkowski", p=3, w=w2) + msg = "size of w does not match" + with pytest.raises(ValueError, match=msg): + dm.pairwise(X1, X2) + + +# TODO: Remove in 1.3 when mwinkowski is removed +def test_wminkowski_deprecated(): + w = rng.random_sample(d) + msg = "WMinkowskiDistance is deprecated in version 1.1" + with pytest.warns(FutureWarning, match=msg): + DistanceMetric.get_metric("wminkowski", p=3, w=w) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index b7e90e63f2af1..5c6e9623faaae 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -252,6 +252,8 @@ def callable_rbf_kernel(x, y, **kwds): return K +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize( "func, metric, kwds", [ diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 2a4d500610051..508d7f9190a86 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1275,6 +1275,8 @@ def test_neighbors_badargs(): nbrs.radius_neighbors_graph(X, mode="blah") +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5): # Test computing the neighbors for various metrics # create a symmetric matrix @@ -1381,6 +1383,8 @@ def custom_metric(x1, x2): assert_array_almost_equal(dist1, dist2) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") def test_valid_brute_metric_for_auto_algorithm(): X = rng.rand(12, 12) Xcsr = csr_matrix(X) diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index e043ffb730708..acbfeaf2cd244 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -233,6 +233,8 @@ def test_gaussian_kde(Cls, n_samples=1000): assert_array_almost_equal(dens_tree, dens_gkde, decimal=3) +# TODO: Remove filterwarnings in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize( "Cls, metric", itertools.chain( From 4b7ad462fa1d6d41f472f1a12d597193dd39aea5 Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Sat, 4 Dec 2021 12:33:19 +0800 Subject: [PATCH 02/10] Add changelog --- doc/whats_new/v1.1.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 83395c4180c44..ff8bad9cc1d83 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -232,6 +232,11 @@ Changelog A deprecation cycle was introduced. :pr:`21576` by :user:`Paul-Emile Dugnat `. +- |API| The `WMinkowskiDistance` metric is deprecated and will be removed in version 1.3. + The `MinkowskiDistance` metric now takes in an optional parameter for weights. + This deprecation is to remain consistent with Scipy-1.8 convention. + :pr:`21873` by :user:`Yar Khine Phyo ` + :mod:`sklearn.manifold` ....................... From ab8f2a648359569b009da7bec89edf76b762cf15 Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Sat, 4 Dec 2021 12:40:22 +0800 Subject: [PATCH 03/10] Remove mistakenly added test line --- sklearn/metrics/tests/test_dist_metrics.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index 245103850e1dd..f4fc4ab58bb89 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -71,9 +71,6 @@ def dist_func(x1, x2, p): METRICS_DEFAULT_PARAMS.append( ("wminkowski", dict(p=(1, 1.5, 3), w=(rng.random_sample(d),))), ) -METRICS_DEFAULT_PARAMS.append( - ("minkowski", dict(p=(1, 1.5, 3), w=(rng.random_sample(d),))), -) def check_cdist(metric, kwargs, X1, X2): From b3371ba966f5bb190eaa2cc72bef05b6c93964f7 Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Sun, 5 Dec 2021 09:45:25 +0800 Subject: [PATCH 04/10] Fix PR Comments --- doc/whats_new/v1.1.rst | 2 +- sklearn/cluster/tests/test_hierarchical.py | 2 +- sklearn/metrics/_dist_metrics.pyx | 29 +++++++++++-------- sklearn/metrics/tests/test_dist_metrics.py | 29 ++++++++++++++----- sklearn/metrics/tests/test_pairwise.py | 2 +- sklearn/neighbors/tests/test_neighbors.py | 4 +-- .../neighbors/tests/test_neighbors_tree.py | 2 +- 7 files changed, 45 insertions(+), 25 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index b7421a0ad404c..1c1264e029876 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -234,7 +234,7 @@ Changelog - |API| The `WMinkowskiDistance` metric is deprecated and will be removed in version 1.3. The `MinkowskiDistance` metric now takes in an optional parameter for weights. - This deprecation is to remain consistent with Scipy-1.8 convention. + This deprecation is to remain consistent with SciPy 1.8 convention. :pr:`21873` by :user:`Yar Khine Phyo ` :mod:`sklearn.manifold` diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 650443a94f0c7..773dfd85900eb 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -411,7 +411,7 @@ def test_vector_scikit_single_vs_scipy_single(seed): # TODO: Remove filterwarnings in 1.3 when wminkowski is removed -@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) def test_mst_linkage_core_memory_mapped(metric_param_grid): """The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset. diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 470b207aaf990..95a1a130c6013 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -124,7 +124,7 @@ cdef class DistanceMetric: Use `MinkowskiDistance` instead. Note that in `MinkowskiDistance`, the weights are applied to the absolute differences already raised to the p power. This is different from `WMinkowskiDistance` where weights are applied to the absolute differences before raising - to the p power. The deprecation aims to remain consistent with Scipy-1.8 convention. + to the p power. The deprecation aims to remain consistent with SciPy 1.8 convention. **Metrics intended for two-dimensional vector spaces:** Note that the haversine distance metric requires data in the form of [latitude, longitude] and both @@ -566,29 +566,34 @@ cdef class MinkowskiDistance(DistanceMetric): elif np.isinf(p): raise ValueError("MinkowskiDistance requires finite p. " "For p=inf, use ChebyshevDistance.") - elif w is not None and any(w_i < 0 for w_i in w): - raise ValueError("w cannot contain negative weights") self.p = p - self.vec = ReadonlyArrayWrapper(np.asarray([], dtype=DTYPE)) - self.size = 0 if w is not None: - self.vec = ReadonlyArrayWrapper(np.asarray(w, dtype=DTYPE)) + w_array = np.asarray(w, dtype=DTYPE) + if (w_array < 0).any(): + raise ValueError("w cannot contain negative weights") + self.vec = ReadonlyArrayWrapper(w_array) self.size = self.vec.shape[0] + else: + self.vec = ReadonlyArrayWrapper(np.asarray([], dtype=DTYPE)) + self.size = 0 def _validate_data(self, X): if self.size > 0 and X.shape[1] != self.size: - raise ValueError('MinkowskiDistance dist: ' - 'size of w does not match') + raise ValueError('MinkowskiDistance: size of w %d should match ' + 'col of input %d' % (self.size, X.shape[1])) cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: - cdef DTYPE_t vec_j, d=0 + cdef DTYPE_t d=0 cdef np.intp_t j cdef bint has_w = self.size > 0 - for j in range(size): - vec_j = self.vec[j] if has_w else 1. - d += vec_j * pow(fabs(x1[j] - x2[j]), self.p) + if has_w: + for j in range(size): + d += self.vec[j] * pow(fabs(x1[j] - x2[j]), self.p) + else: + for j in range(size): + d += pow(fabs(x1[j] - x2[j]), self.p) return d cdef inline DTYPE_t dist(self, const DTYPE_t* x1, const DTYPE_t* x2, diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index f4fc4ab58bb89..b1caebefee653 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -90,7 +90,7 @@ def check_cdist(metric, kwargs, X1, X2): # TODO: Remove filterwarnings in 1.3 when wminkowski is removed -@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) @pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)]) def test_cdist(metric_param_grid, X1, X2): @@ -123,7 +123,7 @@ def check_cdist_bool(metric, D_true): # TODO: Remove filterwarnings in 1.3 when wminkowski is removed -@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) @pytest.mark.parametrize("X1, X2", [(X1, X2), (X1_mmap, X2_mmap)]) def test_pdist(metric_param_grid, X1, X2): @@ -175,7 +175,7 @@ def check_pdist_bool(metric, D_true): # TODO: Remove filterwarnings in 1.3 when wminkowski is removed -@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("writable_kwargs", [True, False]) @pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) def test_pickle(writable_kwargs, metric_param_grid): @@ -192,7 +192,7 @@ def test_pickle(writable_kwargs, metric_param_grid): # TODO: Remove filterwarnings in 1.3 when wminkowski is removed -@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize("metric", BOOL_METRICS) @pytest.mark.parametrize("X1_bool", [X1_bool, X1_bool_mmap]) def test_pickle_bool_metrics(metric, X1_bool): @@ -271,7 +271,7 @@ def custom_metric(x, y): # TODO: Remove filterwarnings in 1.3 when wminkowski is removed -@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") def test_readonly_kwargs(): # Non-regression test for: # https://github.com/scikit-learn/scikit-learn/issues/21685 @@ -298,14 +298,29 @@ def test_minkowski_metric_validate_weights(): w2 = rng.random_sample(d + 1) dm = DistanceMetric.get_metric("minkowski", p=3, w=w2) - msg = "size of w does not match" + msg = "MinkowskiDistance: size of w %d should match col of input %d" % ( + w2.shape[0], + X1.shape[1], + ) with pytest.raises(ValueError, match=msg): dm.pairwise(X1, X2) -# TODO: Remove in 1.3 when mwinkowski is removed +# TODO: Remove in 1.3 when wminkowski is removed def test_wminkowski_deprecated(): w = rng.random_sample(d) msg = "WMinkowskiDistance is deprecated in version 1.1" with pytest.warns(FutureWarning, match=msg): DistanceMetric.get_metric("wminkowski", p=3, w=w) + + +# TODO: Remove in 1.3 when wminkowski is removed +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") +@pytest.mark.parametrize("p", [1, 1.5, 3]) +def test_wminkowski_minkowski_equivalence(p): + w = rng.random_sample(d) + dm_wmks = DistanceMetric.get_metric("wminkowski", p=p, w=(w) ** (1 / p)) + dm_mks = DistanceMetric.get_metric("minkowski", p=p, w=w) + D_wmks = dm_wmks.pairwise(X1, X2) + D_mks = dm_mks.pairwise(X1, X2) + assert_array_almost_equal(D_wmks, D_mks) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 5c6e9623faaae..35f7e19416f0c 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -253,7 +253,7 @@ def callable_rbf_kernel(x, y, **kwds): # TODO: Remove filterwarnings in 1.3 when wminkowski is removed -@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize( "func, metric, kwds", [ diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index 508d7f9190a86..c528c3ab900f8 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -1276,7 +1276,7 @@ def test_neighbors_badargs(): # TODO: Remove filterwarnings in 1.3 when wminkowski is removed -@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") def test_neighbors_metrics(n_samples=20, n_features=3, n_query_pts=2, n_neighbors=5): # Test computing the neighbors for various metrics # create a symmetric matrix @@ -1384,7 +1384,7 @@ def custom_metric(x1, x2): # TODO: Remove filterwarnings in 1.3 when wminkowski is removed -@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") def test_valid_brute_metric_for_auto_algorithm(): X = rng.rand(12, 12) Xcsr = csr_matrix(X) diff --git a/sklearn/neighbors/tests/test_neighbors_tree.py b/sklearn/neighbors/tests/test_neighbors_tree.py index acbfeaf2cd244..85d578c271faa 100644 --- a/sklearn/neighbors/tests/test_neighbors_tree.py +++ b/sklearn/neighbors/tests/test_neighbors_tree.py @@ -234,7 +234,7 @@ def test_gaussian_kde(Cls, n_samples=1000): # TODO: Remove filterwarnings in 1.3 when wminkowski is removed -@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") +@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn") @pytest.mark.parametrize( "Cls, metric", itertools.chain( From 0f42045bdde56b3217f10178d1e4d47a803dfe55 Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Wed, 19 Jan 2022 11:26:25 +0800 Subject: [PATCH 05/10] Fix PR comments on warnings and docs --- doc/whats_new/v1.1.rst | 9 +++++---- sklearn/metrics/_dist_metrics.pyx | 11 ++++++++--- sklearn/metrics/tests/test_dist_metrics.py | 8 +++++--- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index e89787e286807..1183a27ac5866 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -369,10 +369,11 @@ Changelog A deprecation cycle was introduced. :pr:`21576` by :user:`Paul-Emile Dugnat `. -- |API| The `WMinkowskiDistance` metric is deprecated and will be removed in version 1.3. - The `MinkowskiDistance` metric now takes in an optional parameter for weights. - This deprecation is to remain consistent with SciPy 1.8 convention. - :pr:`21873` by :user:`Yar Khine Phyo ` +- |API| The `WMinkowskiDistance` :class:`sklearn.metrics.DistanceMetric` is deprecated + and will be removed in version 1.3. The :class:`sklearn.metrics.MinkowskiDistance` metric + now takes in an optional parameter for weights. This deprecation aims at remaining consistent + with SciPy 1.8 convention. :pr:`21873` by :user:`Yar Khine Phyo ` + - |Fix| :func:`metrics.silhouette_score` now supports integer input for precomputed distances. :pr:`22108` by `Thomas Fan`_. diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index 95a1a130c6013..f8ae052b17885 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -580,8 +580,9 @@ cdef class MinkowskiDistance(DistanceMetric): def _validate_data(self, X): if self.size > 0 and X.shape[1] != self.size: - raise ValueError('MinkowskiDistance: size of w %d should match ' - 'col of input %d' % (self.size, X.shape[1])) + raise ValueError("MinkowskiDistance: the size of w must match " + f"the number of features ({X.shape[1]}). " + f"Currently len(w)={self.size}.") cdef inline DTYPE_t rdist(self, const DTYPE_t* x1, const DTYPE_t* x2, ITYPE_t size) nogil except -1: @@ -637,7 +638,11 @@ cdef class WMinkowskiDistance(DistanceMetric): warn("WMinkowskiDistance is deprecated in version 1.1 and will be " "removed in version 1.3. Use MinkowskiDistance instead. Note " "that in MinkowskiDistance, the weights are applied to the " - "absolute differences raised to the p power.", FutureWarning) + "absolute differences raised to the p power. This is different " + "from WMinkowskiDistance where weights are applied to the " + "absolute differences before raising to the p power. " + "The deprecation aims to remain consistent with SciPy 1.8 " + "convention.", FutureWarning) if p < 1: raise ValueError("p must be greater than 1") diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index b1caebefee653..d4eba36e3578a 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -298,9 +298,10 @@ def test_minkowski_metric_validate_weights(): w2 = rng.random_sample(d + 1) dm = DistanceMetric.get_metric("minkowski", p=3, w=w2) - msg = "MinkowskiDistance: size of w %d should match col of input %d" % ( - w2.shape[0], - X1.shape[1], + msg = ( + "MinkowskiDistance: the size of w must match " + f"the number of features \\({X1.shape[1]}\\). " + f"Currently len\\(w\\)={w2.shape[0]}." ) with pytest.raises(ValueError, match=msg): dm.pairwise(X1, X2) @@ -319,6 +320,7 @@ def test_wminkowski_deprecated(): @pytest.mark.parametrize("p", [1, 1.5, 3]) def test_wminkowski_minkowski_equivalence(p): w = rng.random_sample(d) + # Weights are rescaled for consistency w.r.t scipy 1.8 refactoring of 'minkowski' dm_wmks = DistanceMetric.get_metric("wminkowski", p=p, w=(w) ** (1 / p)) dm_mks = DistanceMetric.get_metric("minkowski", p=p, w=w) D_wmks = dm_wmks.pairwise(X1, X2) From ff6ad0790c2fc67e0af369afcda56c91758def5b Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Thu, 20 Jan 2022 14:18:39 +0800 Subject: [PATCH 06/10] Add check_array --- sklearn/metrics/_dist_metrics.pyx | 5 ++- sklearn/metrics/tests/test_dist_metrics.py | 42 ++++++++++++++++++---- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index f8ae052b17885..e942b9450d824 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -30,6 +30,7 @@ cdef DTYPE_t INF = np.inf from ..utils._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t, DTYPECODE from ..utils._typedefs import DTYPE, ITYPE from ..utils._readonly_array_wrapper import ReadonlyArrayWrapper +from ..utils import check_array ###################################################################### # newObj function @@ -569,7 +570,9 @@ cdef class MinkowskiDistance(DistanceMetric): self.p = p if w is not None: - w_array = np.asarray(w, dtype=DTYPE) + w_array = check_array( + w, ensure_2d=False, dtype=DTYPE, input_name="w" + ) if (w_array < 0).any(): raise ValueError("w cannot contain negative weights") self.vec = ReadonlyArrayWrapper(w_array) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index d4eba36e3578a..d7384f2ba37e8 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -7,6 +7,7 @@ import pytest +import scipy.sparse as sp from scipy.spatial.distance import cdist from sklearn.metrics import DistanceMetric from sklearn.utils import check_random_state @@ -289,13 +290,42 @@ def test_readonly_kwargs(): DistanceMetric.get_metric("mahalanobis", VI=VI) -def test_minkowski_metric_validate_weights(): - w1 = rng.random_sample(d) - w1[0] = -1337 - msg = "w cannot contain negative weights" - with pytest.raises(ValueError, match=msg): - DistanceMetric.get_metric("minkowski", p=3, w=w1) +@pytest.mark.parametrize( + "w, err_type, err_msg", + [ + ( + np.array([1, 1.5, -13]), + ValueError, + "w cannot contain negative weights" + ), + ( + np.array([1, 1.5, np.nan]), + ValueError, + "w contains NaN" + ), + ( + sp.csr_matrix([1, 1.5, 1]), + TypeError, + "A sparse matrix was passed, but dense data is required" + ), + ( + np.array(["a", "b", "c"]), + ValueError, + "could not convert string to float" + ), + ( + np.array([]), + ValueError, + "a minimum of 1 is required" + ), + ], +) +def test_minkowski_metric_validate_weights_values(w, err_type, err_msg): + with pytest.raises(err_type, match=err_msg): + DistanceMetric.get_metric("minkowski", p=3, w=w) + +def test_minkowski_metric_validate_weights_size(): w2 = rng.random_sample(d + 1) dm = DistanceMetric.get_metric("minkowski", p=3, w=w2) msg = ( From c900b05803846af8d175ddb61bf309871c7aa2a8 Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Thu, 20 Jan 2022 14:23:18 +0800 Subject: [PATCH 07/10] Update whats_news --- doc/whats_new/v1.1.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 1183a27ac5866..42f675676a569 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -369,9 +369,9 @@ Changelog A deprecation cycle was introduced. :pr:`21576` by :user:`Paul-Emile Dugnat `. -- |API| The `WMinkowskiDistance` :class:`sklearn.metrics.DistanceMetric` is deprecated - and will be removed in version 1.3. The :class:`sklearn.metrics.MinkowskiDistance` metric - now takes in an optional parameter for weights. This deprecation aims at remaining consistent +- |API| The `"wminkowski"` metric of :class:`sklearn.metrics.DistanceMetric` is deprecated + and will be removed in version 1.3. Instead the existing `"minkowski"` metric now takes + in an optional `w` parameter for weights. This deprecation aims at remaining consistent with SciPy 1.8 convention. :pr:`21873` by :user:`Yar Khine Phyo ` - |Fix| :func:`metrics.silhouette_score` now supports integer input for precomputed From 4857f5cfeb49d124e9d0ce2139ba6667efb26d7c Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Thu, 20 Jan 2022 14:25:51 +0800 Subject: [PATCH 08/10] Update class docstring --- sklearn/metrics/_dist_metrics.pyx | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/_dist_metrics.pyx b/sklearn/metrics/_dist_metrics.pyx index e942b9450d824..946d8d7735601 100644 --- a/sklearn/metrics/_dist_metrics.pyx +++ b/sklearn/metrics/_dist_metrics.pyx @@ -547,12 +547,19 @@ cdef class MinkowskiDistance(DistanceMetric): r"""Minkowski Distance .. math:: - D(x, y) = [\sum_i w_i *|x_i - y_i|^p] ^ (1/p) + D(x, y) = {||u-v||}_p + + when w is None. + + Here is the more general expanded expression for the weighted case: + + .. math:: + D(x, y) = [\sum_i w_i *|x_i - y_i|^p] ^ (1/p) Parameters ---------- p : int - The order of the norm of the difference :math:`{||u-v||}_p`. + The order of the p-norm of the difference (see above). w : (N,) array-like (optional) The weight vector. From c607fb096b58b665e0a74949cb6838f006cac63e Mon Sep 17 00:00:00 2001 From: yarkhinephyo Date: Thu, 20 Jan 2022 18:05:34 +0800 Subject: [PATCH 09/10] Fix lint issues --- sklearn/metrics/tests/test_dist_metrics.py | 26 +++++----------------- 1 file changed, 5 insertions(+), 21 deletions(-) diff --git a/sklearn/metrics/tests/test_dist_metrics.py b/sklearn/metrics/tests/test_dist_metrics.py index d7384f2ba37e8..5dd10d83bfe68 100644 --- a/sklearn/metrics/tests/test_dist_metrics.py +++ b/sklearn/metrics/tests/test_dist_metrics.py @@ -293,31 +293,15 @@ def test_readonly_kwargs(): @pytest.mark.parametrize( "w, err_type, err_msg", [ - ( - np.array([1, 1.5, -13]), - ValueError, - "w cannot contain negative weights" - ), - ( - np.array([1, 1.5, np.nan]), - ValueError, - "w contains NaN" - ), + (np.array([1, 1.5, -13]), ValueError, "w cannot contain negative weights"), + (np.array([1, 1.5, np.nan]), ValueError, "w contains NaN"), ( sp.csr_matrix([1, 1.5, 1]), TypeError, - "A sparse matrix was passed, but dense data is required" - ), - ( - np.array(["a", "b", "c"]), - ValueError, - "could not convert string to float" - ), - ( - np.array([]), - ValueError, - "a minimum of 1 is required" + "A sparse matrix was passed, but dense data is required", ), + (np.array(["a", "b", "c"]), ValueError, "could not convert string to float"), + (np.array([]), ValueError, "a minimum of 1 is required"), ], ) def test_minkowski_metric_validate_weights_values(w, err_type, err_msg): From ea01b8a81942d75f292893d798c125bcd8dd84a4 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 20 Jan 2022 15:39:57 +0100 Subject: [PATCH 10/10] Just to trigger the CI