diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index 832a4e4389b19..f27ef98dfcdb0 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -91,6 +91,10 @@ Changelog `eigen_tol="auto"` in version 1.3. :pr:`23210` by :user:`Meekail Zain <micky774>`. +- |API| The `affinity` attribute is now deprecated for + :class:`cluster.AgglomerativeClustering` and will be renamed to `metric` in v1.4. + :pr:`23470` by :user:`Meekail Zain <micky774>`. + :mod:`sklearn.datasets` ....................... diff --git a/examples/cluster/plot_agglomerative_clustering_metrics.py b/examples/cluster/plot_agglomerative_clustering_metrics.py index e022035ca2207..38fd3682d48ec 100644 --- a/examples/cluster/plot_agglomerative_clustering_metrics.py +++ b/examples/cluster/plot_agglomerative_clustering_metrics.py @@ -125,7 +125,7 @@ def sqr(x): # Plot clustering results for index, metric in enumerate(["cosine", "euclidean", "cityblock"]): model = AgglomerativeClustering( - n_clusters=n_clusters, linkage="average", affinity=metric + n_clusters=n_clusters, linkage="average", metric=metric ) model.fit(X) plt.figure() @@ -134,7 +134,7 @@ def sqr(x): plt.plot(X[model.labels_ == l].T, c=c, alpha=0.5) plt.axis("tight") plt.axis("off") - plt.suptitle("AgglomerativeClustering(affinity=%s)" % metric, size=20) + plt.suptitle("AgglomerativeClustering(metric=%s)" % metric, size=20) plt.show() diff --git a/examples/cluster/plot_cluster_comparison.py b/examples/cluster/plot_cluster_comparison.py index 8b52759c79018..a9e39267411b7 100644 --- a/examples/cluster/plot_cluster_comparison.py +++ b/examples/cluster/plot_cluster_comparison.py @@ -171,7 +171,7 @@ ) average_linkage = cluster.AgglomerativeClustering( linkage="average", - affinity="cityblock", + metric="cityblock", n_clusters=params["n_clusters"], connectivity=connectivity, ) diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index d7de31d41b325..90ee3336e0478 100644 --- a/sklearn/cluster/_agglomerative.py +++ 
b/sklearn/cluster/_agglomerative.py @@ -23,7 +23,7 @@ from ..utils import check_array from ..utils._fast_dict import IntFloatDict from ..utils.graph import _fix_connected_components -from ..utils._param_validation import Interval, StrOptions +from ..utils._param_validation import Hidden, Interval, StrOptions from ..utils.validation import check_memory # mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast' @@ -760,6 +760,19 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): If "precomputed", a distance matrix (instead of a similarity matrix) is needed as input for the fit method. + .. deprecated:: 1.2 + `affinity` was deprecated in version 1.2 and will be renamed to + `metric` in 1.4. + + metric : str or callable, default=None + Metric used to compute the linkage. Can be "euclidean", "l1", "l2", + "manhattan", "cosine", or "precomputed". If set to `None` then + "euclidean" is used. If linkage is "ward", only "euclidean" is + accepted. If "precomputed", a distance matrix is needed as input for + the fit method. + + .. versionadded:: 1.2 + memory : str or object with the joblib.Memory interface, default=None Used to cache the output of the computation of the tree. By default, no caching is done. 
If a string is given, it is the @@ -880,9 +893,15 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): _parameter_constraints = { "n_clusters": [Interval(Integral, 1, None, closed="left"), None], "affinity": [ + Hidden(StrOptions({"deprecated"})), StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable, ], + "metric": [ + StrOptions(set(_VALID_METRICS) | {"precomputed"}), + callable, + None, + ], "memory": "no_validation", # TODO "connectivity": ["array-like", callable, None], "compute_full_tree": [StrOptions({"auto"}), "boolean"], @@ -895,7 +914,8 @@ def __init__( self, n_clusters=2, *, - affinity="euclidean", + affinity="deprecated", # TODO(1.4): Remove + metric=None, # TODO(1.4): Set to "euclidean" memory=None, connectivity=None, compute_full_tree="auto", @@ -910,6 +930,7 @@ def __init__( self.compute_full_tree = compute_full_tree self.linkage = linkage self.affinity = affinity + self.metric = metric self.compute_distances = compute_distances def fit(self, X, y=None): @@ -920,7 +941,7 @@ def fit(self, X, y=None): X : array-like, shape (n_samples, n_features) or \ (n_samples, n_samples) Training instances to cluster, or distances between instances if - ``affinity='precomputed'``. + ``metric='precomputed'``. y : Ignored Not used, present here for API consistency by convention. @@ -950,6 +971,24 @@ def _fit(self, X): """ memory = check_memory(self.memory) + self._metric = self.metric + # TODO(1.4): Remove + if self.affinity != "deprecated": + if self.metric is not None: + raise ValueError( + "Both `affinity` and `metric` attributes were set. Attribute" + " `affinity` was deprecated in version 1.2 and will be removed in" + " 1.4. To avoid this error, only set the `metric` attribute." + ) + warnings.warn( + "Attribute `affinity` was deprecated in version 1.2 and will be removed" + " in 1.4. 
Use `metric` instead", + FutureWarning, + ) + self._metric = self.affinity + elif self.metric is None: + self._metric = "euclidean" + if not ((self.n_clusters is None) ^ (self.distance_threshold is None)): raise ValueError( "Exactly one of n_clusters and " @@ -962,10 +1001,10 @@ def _fit(self, X): "compute_full_tree must be True if distance_threshold is set." ) - if self.linkage == "ward" and self.affinity != "euclidean": + if self.linkage == "ward" and self._metric != "euclidean": raise ValueError( - "%s was provided as affinity. Ward can only " - "work with euclidean distances." % (self.affinity,) + f"{self._metric} was provided as metric. Ward can only " + "work with euclidean distances." ) tree_builder = _TREE_BUILDERS[self.linkage] @@ -998,7 +1037,7 @@ def _fit(self, X): kwargs = {} if self.linkage != "ward": kwargs["linkage"] = self.linkage - kwargs["affinity"] = self.affinity + kwargs["affinity"] = self._metric distance_threshold = self.distance_threshold @@ -1084,6 +1123,19 @@ class FeatureAgglomeration( If "precomputed", a distance matrix (instead of a similarity matrix) is needed as input for the fit method. + .. deprecated:: 1.2 + `affinity` was deprecated in version 1.2 and will be renamed to + `metric` in 1.4. + + metric : str or callable, default=None + Metric used to compute the linkage. Can be "euclidean", "l1", "l2", + "manhattan", "cosine", or "precomputed". If set to `None` then + "euclidean" is used. If linkage is "ward", only "euclidean" is + accepted. If "precomputed", a distance matrix is needed as input for + the fit method. + + .. versionadded:: 1.2 + memory : str or object with the joblib.Memory interface, default=None Used to cache the output of the computation of the tree. By default, no caching is done. 
If a string is given, it is the @@ -1208,8 +1260,14 @@ class FeatureAgglomeration( _parameter_constraints = { "n_clusters": [Interval(Integral, 1, None, closed="left"), None], "affinity": [ + Hidden(StrOptions({"deprecated"})), + StrOptions(set(_VALID_METRICS) | {"precomputed"}), + callable, + ], + "metric": [ StrOptions(set(_VALID_METRICS) | {"precomputed"}), callable, + None, ], "memory": "no_validation", # TODO "connectivity": ["array-like", callable, None], @@ -1224,7 +1282,8 @@ def __init__( self, n_clusters=2, *, - affinity="euclidean", + affinity="deprecated", # TODO(1.4): Remove + metric=None, # TODO(1.4): Set to "euclidean" memory=None, connectivity=None, compute_full_tree="auto", @@ -1240,6 +1299,7 @@ def __init__( compute_full_tree=compute_full_tree, linkage=linkage, affinity=affinity, + metric=metric, distance_threshold=distance_threshold, compute_distances=compute_distances, ) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 3cb5e2bb2b067..012073d1ba332 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -241,24 +241,24 @@ def test_agglomerative_clustering(): clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity.toarray(), - affinity="manhattan", + metric="manhattan", linkage="ward", ) with pytest.raises(ValueError): clustering.fit(X) # Test using another metric than euclidean works with linkage complete - for affinity in PAIRED_DISTANCES.keys(): + for metric in PAIRED_DISTANCES.keys(): # Compare our (structured) implementation to scipy clustering = AgglomerativeClustering( n_clusters=10, connectivity=np.ones((n_samples, n_samples)), - affinity=affinity, + metric=metric, linkage="complete", ) clustering.fit(X) clustering2 = AgglomerativeClustering( - n_clusters=10, connectivity=None, affinity=affinity, linkage="complete" + n_clusters=10, connectivity=None, metric=metric, linkage="complete" ) clustering2.fit(X) 
assert_almost_equal( @@ -275,7 +275,7 @@ def test_agglomerative_clustering(): clustering2 = AgglomerativeClustering( n_clusters=10, connectivity=connectivity, - affinity="precomputed", + metric="precomputed", linkage="complete", ) clustering2.fit(X_dist) @@ -289,7 +289,7 @@ def test_agglomerative_clustering_memory_mapped(): """ rng = np.random.RandomState(0) Xmm = create_memmap_backed_data(rng.randn(50, 100)) - AgglomerativeClustering(affinity="euclidean", linkage="single").fit(Xmm) + AgglomerativeClustering(metric="euclidean", linkage="single").fit(Xmm) def test_ward_agglomeration(): @@ -860,7 +860,7 @@ def test_invalid_shape_precomputed_dist_matrix(): ValueError, match=r"Distance matrix should be square, got matrix of shape \(5, 3\)", ): - AgglomerativeClustering(affinity="precomputed", linkage="complete").fit(X) + AgglomerativeClustering(metric="precomputed", linkage="complete").fit(X) def test_precomputed_connectivity_affinity_with_2_connected_components(): @@ -900,3 +900,26 @@ def test_precomputed_connectivity_affinity_with_2_connected_components(): assert_array_equal(clusterer.labels_, clusterer_precomputed.labels_) assert_array_equal(clusterer.children_, clusterer_precomputed.children_) + + +# TODO(1.4): Remove +def test_deprecate_affinity(): + rng = np.random.RandomState(42) + X = rng.randn(50, 10) + + af = AgglomerativeClustering(affinity="euclidean") + msg = ( + "Attribute `affinity` was deprecated in version 1.2 and will be removed in 1.4." + " Use `metric` instead" + ) + with pytest.warns(FutureWarning, match=msg): + af.fit(X) + with pytest.warns(FutureWarning, match=msg): + af.fit_predict(X) + + af = AgglomerativeClustering(metric="euclidean", affinity="euclidean") + msg = "Both `affinity` and `metric` attributes were set. Attribute" + with pytest.raises(ValueError, match=msg): + af.fit(X) + with pytest.raises(ValueError, match=msg): + af.fit_predict(X)