From 7ab4cb3afa13d5d352c09ffcf77bdb9199d6d6df Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 25 Jul 2022 11:16:52 +0200 Subject: [PATCH 01/10] MAINT Make PairwiseDistancesReduction usable only for C-contiguous arrays --- sklearn/cluster/tests/test_birch.py | 15 +++++++++++++++ .../_pairwise_distances_reduction/_dispatcher.py | 7 +++++++ .../tests/test_pairwise_distances_reduction.py | 3 +++ 3 files changed, 25 insertions(+) diff --git a/sklearn/cluster/tests/test_birch.py b/sklearn/cluster/tests/test_birch.py index 5de87136e367b..2ed482c6cd70d 100644 --- a/sklearn/cluster/tests/test_birch.py +++ b/sklearn/cluster/tests/test_birch.py @@ -248,3 +248,18 @@ def test_both_subclusters_updated(): # no error Birch(branching_factor=5, threshold=1e-5, n_clusters=None).fit(X) + + +def test_f_ordered_array(): + """ + Non-regression test for: + https://github.com/scikit-learn/scikit-learn/issues/23988 + """ + from sklearn.datasets import load_iris + from sklearn.cluster import Birch + + X, y = load_iris(return_X_y=True, as_frame=True) + birch = Birch(n_clusters=3) + + # Must not err. + birch.fit_predict(X) diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py index a79fde694a9ed..1cf670ed35dec 100644 --- a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py +++ b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py @@ -80,11 +80,18 @@ def is_usable_for(cls, X, Y, metric) -> bool: True if the PairwiseDistancesReduction can be used, else False. """ dtypes_validity = X.dtype == Y.dtype == np.float64 + c_contiguity = ( + hasattr(X, "flags") + and X.flags.c_contiguous + and hasattr(Y, "flags") + and Y.flags.c_contiguous + ) return ( get_config().get("enable_cython_pairwise_dist", True) and not issparse(X) and not issparse(Y) and dtypes_validity + and c_contiguity and metric in cls.valid_metrics() ) diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py index 0b9c6e6aad196..9bb55aedf4dd0 100644 --- a/sklearn/metrics/tests/test_pairwise_distances_reduction.py +++ b/sklearn/metrics/tests/test_pairwise_distances_reduction.py @@ -530,6 +530,9 @@ def test_pairwise_distances_reduction_is_usable_for(): assert not PairwiseDistancesReduction.is_usable_for(csr_matrix(X), Y, metric) assert not PairwiseDistancesReduction.is_usable_for(X, csr_matrix(Y), metric) + # F-ordered arrays are not supported + assert not PairwiseDistancesReduction.is_usable_for(np.asfortranarray(X), Y, metric) + def test_argkmin_factory_method_wrong_usages(): rng = np.random.RandomState(1) From 42b00cb56ca652c37af18bf75fc9aab10618df28 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Mon, 25 Jul 2022 11:38:57 +0200 Subject: [PATCH 02/10] TST Adapt test to be pandas-independent --- sklearn/cluster/tests/test_birch.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/cluster/tests/test_birch.py b/sklearn/cluster/tests/test_birch.py index 2ed482c6cd70d..d0b3ae03b3c93 100644 --- a/sklearn/cluster/tests/test_birch.py +++ b/sklearn/cluster/tests/test_birch.py @@ -255,10 +255,9 @@ def test_f_ordered_array(): Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/23988 """ - from sklearn.datasets import load_iris - from sklearn.cluster import Birch + X, _ = make_blobs(n_samples=80, n_features=4, random_state=0) + X = np.asfortranarray(X) - X, y = load_iris(return_X_y=True, as_frame=True) birch = Birch(n_clusters=3) # Must not err. From 4505c39d1a9bebab7b3ba4766e8617d2a73fa5ea Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 27 Jul 2022 10:44:32 +0200 Subject: [PATCH 03/10] TST Add tests for f-contiguous arrays support and remove the old one --- sklearn/cluster/tests/test_birch.py | 14 ------- sklearn/tests/test_common.py | 57 +++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 14 deletions(-) diff --git a/sklearn/cluster/tests/test_birch.py b/sklearn/cluster/tests/test_birch.py index d0b3ae03b3c93..5de87136e367b 100644 --- a/sklearn/cluster/tests/test_birch.py +++ b/sklearn/cluster/tests/test_birch.py @@ -248,17 +248,3 @@ def test_both_subclusters_updated(): # no error Birch(branching_factor=5, threshold=1e-5, n_clusters=None).fit(X) - - -def test_f_ordered_array(): - """ - Non-regression test for: - https://github.com/scikit-learn/scikit-learn/issues/23988 - """ - X, _ = make_blobs(n_samples=80, n_features=4, random_state=0) - X = np.asfortranarray(X) - - birch = Birch(n_clusters=3) - - # Must not err. - birch.fit_predict(X) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 17d22f78b7e12..cc87ba401ccc7 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -18,6 +18,24 @@ import pytest import numpy as np +from sklearn.cluster import ( + AffinityPropagation, + Birch, + MeanShift, + OPTICS, + SpectralClustering, +) +from sklearn.datasets import make_blobs +from sklearn.manifold import Isomap, TSNE, LocallyLinearEmbedding +from sklearn.neighbors import ( + LocalOutlierFactor, + KNeighborsClassifier, + KNeighborsRegressor, + RadiusNeighborsClassifier, + RadiusNeighborsRegressor, +) +from sklearn.semi_supervised import LabelPropagation, LabelSpreading + from sklearn.utils import all_estimators from sklearn.utils._testing import ignore_warnings from sklearn.exceptions import ConvergenceWarning @@ -541,3 +559,42 @@ def test_check_param_validation(estimator): ) _set_checking_parameters(estimator) check_param_validation(name, estimator) + + +@pytest.mark.parametrize( + "Estimator", + [ + AffinityPropagation, + Birch, + MeanShift, + KNeighborsClassifier, + KNeighborsRegressor, + RadiusNeighborsClassifier, + RadiusNeighborsRegressor, + LabelPropagation, + LabelSpreading, + OPTICS, + SpectralClustering, + LocalOutlierFactor, + LocallyLinearEmbedding, + Isomap, + TSNE, + ], +) +def test_f_contiguous_array_estimator(Estimator): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/23988 + # https://github.com/scikit-learn/scikit-learn/issues/24013 + + X, _ = make_blobs(n_samples=80, n_features=4, random_state=0) + X = np.asfortranarray(X) + y = np.round(X[:, 0]) + + est = Estimator() + est.fit(X, y) + + if hasattr(est, "transform"): + est.transform(X) + + if hasattr(est, "predict"): + est.predict(X) From f0dc170a4ca1f6794fe908020c398c28c386d330 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 27 Jul 2022 10:45:18 +0200 Subject: [PATCH 04/10] Convert f-contiguous array into c-contiguous arrays where appropriate --- sklearn/cluster/_affinity_propagation.py | 2 +- sklearn/cluster/_birch.py | 3 ++- sklearn/cluster/_mean_shift.py | 2 +- sklearn/metrics/pairwise.py | 17 +++++++++++++++-- sklearn/metrics/tests/test_pairwise.py | 9 +++++++++ 5 files changed, 28 insertions(+), 5 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index f0274b113a341..8030bee119c6f 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -516,7 +516,7 @@ def predict(self, X): Cluster labels. """ check_is_fitted(self) - X = self._validate_data(X, reset=False, accept_sparse="csr") + X = self._validate_data(X, reset=False, accept_sparse="csr", order="C") if not hasattr(self, "cluster_centers_"): raise ValueError( "Predict method is not supported when affinity='precomputed'." diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 20a414d1ac56e..fd5be63144dec 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -554,6 +554,7 @@ def _fit(self, X, partial): copy=self.copy, reset=first_call, dtype=[np.float64, np.float32], + order="C", ) threshold = self.threshold branching_factor = self.branching_factor @@ -689,7 +690,7 @@ def predict(self, X): Labelled data. """ check_is_fitted(self) - X = self._validate_data(X, accept_sparse="csr", reset=False) + X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") return self._predict(X) def _predict(self, X): diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 356386eef3db7..f080bb05d51dd 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -518,6 +518,6 @@ def predict(self, X): Index of the cluster each sample belongs to. """ check_is_fitted(self) - X = self._validate_data(X, reset=False) + X = self._validate_data(X, reset=False, order="C") with config_context(assume_finite=True): return pairwise_distances_argmin(X, self.cluster_centers_) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index bcf01bc2925a3..d50e18b57ec34 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -69,6 +69,7 @@ def check_pairwise_arrays( accept_sparse="csr", force_all_finite=True, copy=False, + order=None, ): """Set X and Y appropriately and checks inputs. @@ -127,6 +128,15 @@ def check_pairwise_arrays( .. versionadded:: 0.22 + order : {'F', 'C'} or None, default=None + Whether the arrays will be forced to be fortran or c-style. + When order is None (default), then if copy=False, nothing is ensured + about the memory layout of the output array; otherwise (copy=True) + the memory layout of the returned array is kept as close as possible + to the original array. + + ..versionadded:: 1.1.2 + Returns ------- safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features) @@ -150,6 +160,7 @@ def check_pairwise_arrays( copy=copy, force_all_finite=force_all_finite, estimator=estimator, + order=order, ) else: X = check_array( @@ -159,6 +170,7 @@ def check_pairwise_arrays( copy=copy, force_all_finite=force_all_finite, estimator=estimator, + order=order, ) Y = check_array( Y, @@ -167,6 +179,7 @@ def check_pairwise_arrays( copy=copy, force_all_finite=force_all_finite, estimator=estimator, + order=order, ) if precomputed: @@ -661,7 +674,7 @@ def pairwise_distances_argmin_min( pairwise_distances_argmin : Same as `pairwise_distances_argmin_min` but only returns the argmins. """ - X, Y = check_pairwise_arrays(X, Y) + X, Y = check_pairwise_arrays(X, Y, order="C") if axis == 0: X, Y = Y, X @@ -773,7 +786,7 @@ def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs if metric_kwargs is None: metric_kwargs = {} - X, Y = check_pairwise_arrays(X, Y) + X, Y = check_pairwise_arrays(X, Y, order="C") if axis == 0: X, Y = Y, X diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index f14c558d5a3c1..a74db9a2fc316 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -534,6 +534,15 @@ def test_pairwise_distances_argmin_min(dtype): assert_array_equal(argmin_0, argmin_1) + # F-contiguous arrays must be supported and must return identical + # C-contiguous results. + argmin_C_contiguous = pairwise_distances_argmin(X, Y) + argmin_F_contiguous = pairwise_distances_argmin( + np.asfortranarray(X), np.asfortranarray(Y) + ) + + assert_array_equal(argmin_C_contiguous, argmin_F_contiguous) + def _reduce_func(dist, start): return dist[:, :100] From 4d88ed5fb5a28a254c5355d64b83a6168dd9d4d5 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 27 Jul 2022 10:48:30 +0200 Subject: [PATCH 05/10] DOC Add whats_new entry --- doc/whats_new/v1.1.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 4c46c0d631f76..adfb71eac065b 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -12,6 +12,10 @@ Version 1.1.2 Changelog --------- +- |Fix| Add support for F-contiguous arrays for estimators and functions whose back-end + have been changed in 1.1. + :pr:`23990` by :user:`Julien Jerphanion `. + :mod:`sklearn.cluster` ...................... From 819c4e5464928ba3b4ac95f5f8fae0a04fe573f6 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 27 Jul 2022 10:53:32 +0200 Subject: [PATCH 06/10] Trigger CI From e65fbe9b26d0e83b24a7fbb042fe794fb21b4f2f Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 27 Jul 2022 10:57:41 +0200 Subject: [PATCH 07/10] fixup! TST Add tests for f-contiguous arrays support --- sklearn/metrics/tests/test_pairwise.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index a74db9a2fc316..31964e2d182dd 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -534,8 +534,7 @@ def test_pairwise_distances_argmin_min(dtype): assert_array_equal(argmin_0, argmin_1) - # F-contiguous arrays must be supported and must return identical - # C-contiguous results. + # F-contiguous arrays must be supported and must return identical results. argmin_C_contiguous = pairwise_distances_argmin(X, Y) argmin_F_contiguous = pairwise_distances_argmin( np.asfortranarray(X), np.asfortranarray(Y) From c77814d953f8860566cae6029cb4a537cada5069 Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 27 Jul 2022 20:30:38 +0200 Subject: [PATCH 08/10] FIX Correct multitasking-mess --- doc/whats_new/v1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 56b470db0b45b..0cb0233fef038 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -13,7 +13,7 @@ Changelog --------- - |Fix| A default HTML representation is shown for meta-estimators with invalid - parameters. :pr:`24015` by `Thomas Fan` + parameters. :pr:`24015` by `Thomas Fan`_. - |Fix| Add support for F-contiguous arrays for estimators and functions whose back-end have been changed in 1.1. From 83e33e0a0d65659235506454036a44b878a304cc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Wed, 27 Jul 2022 20:51:20 +0200 Subject: [PATCH 09/10] TST Ignore FutureWarning --- sklearn/tests/test_common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index ba127cc8a9b43..3cac700c73d69 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -548,6 +548,8 @@ def test_check_param_validation(estimator): check_param_validation(name, estimator) +# TODO: remove this filter in 1.2 +@pytest.mark.filterwarnings("ignore::FutureWarning:sklearn") @pytest.mark.parametrize( "Estimator", [ From 576cc05b202871b9cb398da2bb2af53c9de951bc Mon Sep 17 00:00:00 2001 From: Julien Jerphanion Date: Thu, 28 Jul 2022 10:00:20 +0200 Subject: [PATCH 10/10] MAINT Fallback on previous backend for F-contiguous array --- sklearn/cluster/_affinity_propagation.py | 2 +- sklearn/cluster/_birch.py | 3 +-- sklearn/cluster/_mean_shift.py | 2 +- sklearn/metrics/pairwise.py | 17 ++--------------- 4 files changed, 5 insertions(+), 19 deletions(-) diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index d2089bb4bcd65..c577b2856d470 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -512,7 +512,7 @@ def predict(self, X): Cluster labels. """ check_is_fitted(self) - X = self._validate_data(X, reset=False, accept_sparse="csr", order="C") + X = self._validate_data(X, reset=False, accept_sparse="csr") if not hasattr(self, "cluster_centers_"): raise ValueError( "Predict method is not supported when affinity='precomputed'." diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index fd5be63144dec..20a414d1ac56e 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -554,7 +554,6 @@ def _fit(self, X, partial): copy=self.copy, reset=first_call, dtype=[np.float64, np.float32], - order="C", ) threshold = self.threshold branching_factor = self.branching_factor @@ -690,7 +689,7 @@ def predict(self, X): Labelled data. """ check_is_fitted(self) - X = self._validate_data(X, accept_sparse="csr", reset=False, order="C") + X = self._validate_data(X, accept_sparse="csr", reset=False) return self._predict(X) def _predict(self, X): diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index f080bb05d51dd..356386eef3db7 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -518,6 +518,6 @@ def predict(self, X): Index of the cluster each sample belongs to. """ check_is_fitted(self) - X = self._validate_data(X, reset=False, order="C") + X = self._validate_data(X, reset=False) with config_context(assume_finite=True): return pairwise_distances_argmin(X, self.cluster_centers_) diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index d50e18b57ec34..bcf01bc2925a3 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -69,7 +69,6 @@ def check_pairwise_arrays( accept_sparse="csr", force_all_finite=True, copy=False, - order=None, ): """Set X and Y appropriately and checks inputs. @@ -128,15 +127,6 @@ def check_pairwise_arrays( .. versionadded:: 0.22 - order : {'F', 'C'} or None, default=None - Whether the arrays will be forced to be fortran or c-style. - When order is None (default), then if copy=False, nothing is ensured - about the memory layout of the output array; otherwise (copy=True) - the memory layout of the returned array is kept as close as possible - to the original array. - - ..versionadded:: 1.1.2 - Returns ------- safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features) @@ -160,7 +150,6 @@ def check_pairwise_arrays( copy=copy, force_all_finite=force_all_finite, estimator=estimator, - order=order, ) else: X = check_array( @@ -170,7 +159,6 @@ def check_pairwise_arrays( copy=copy, force_all_finite=force_all_finite, estimator=estimator, - order=order, ) Y = check_array( Y, @@ -179,7 +167,6 @@ def check_pairwise_arrays( copy=copy, force_all_finite=force_all_finite, estimator=estimator, - order=order, ) if precomputed: @@ -674,7 +661,7 @@ def pairwise_distances_argmin_min( pairwise_distances_argmin : Same as `pairwise_distances_argmin_min` but only returns the argmins. """ - X, Y = check_pairwise_arrays(X, Y, order="C") + X, Y = check_pairwise_arrays(X, Y) if axis == 0: X, Y = Y, X @@ -786,7 +773,7 @@ def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs if metric_kwargs is None: metric_kwargs = {} - X, Y = check_pairwise_arrays(X, Y, order="C") + X, Y = check_pairwise_arrays(X, Y) if axis == 0: X, Y = Y, X