Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
2ca67a6
Reconciled changelog
Micky774 May 26, 2022
7721d51
Merge branch 'main' into deprecate_affinity
Micky774 May 26, 2022
15024ac
Completed initial deprecation
Micky774 May 26, 2022
59d7ce6
Fixed deprecation
Micky774 May 26, 2022
07bb1d9
Added changelog entry
Micky774 May 26, 2022
40d632d
Merge branch 'main' into deprecate_affinity
Micky774 May 27, 2022
8ec5499
Fixed docstring
Micky774 May 27, 2022
b624725
Corrected default param and validation
Micky774 May 27, 2022
2777335
Fixed spacing in docstring
Micky774 May 27, 2022
1bf3963
Merge branch 'main' into deprecate_affinity
Micky774 May 30, 2022
342a890
Merge branch 'main' into deprecate_affinity
Micky774 Jun 2, 2022
2242749
Addressed reviewer feedback
Micky774 Jun 2, 2022
9fab19f
Merge branch 'main' into deprecate_affinity
Micky774 Jun 6, 2022
14196dc
Update sklearn/cluster/_agglomerative.py
Micky774 Jun 6, 2022
777f97f
Merge branch 'deprecate_affinity' of https://github.com/Micky774/scik…
Micky774 Jun 6, 2022
b9986d8
Updated test
Micky774 Jun 6, 2022
be60dcb
Merge branch 'main' into deprecate_affinity
Micky774 Jun 9, 2022
f5ffe90
Added error handling and testing for edge-case
Micky774 Jun 9, 2022
f20c474
Merge remote-tracking branch 'origin/main' into pr/Micky774/23470
glemaitre Jun 30, 2022
a00b1a9
Merge branch 'main' into deprecate_affinity
Micky774 Jun 30, 2022
dbcb021
Apply suggestions from code review
Micky774 Jun 30, 2022
294946a
Streamlined test and renamed var
Micky774 Jun 30, 2022
192a6a6
Merge branch 'main' into deprecate_affinity
Micky774 Jul 4, 2022
e62ed6a
Clarified docstring per review feedback
Micky774 Jul 4, 2022
26ee23d
Merge branch 'deprecate_affinity' of https://github.com/Micky774/scik…
Micky774 Jul 4, 2022
8b914b9
Merge branch 'main' into deprecate_affinity
Micky774 Jul 4, 2022
43ff645
Updated affinity-->metric in plotting
Micky774 Jul 6, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats_new/v1.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ Changelog
`eigen_tol="auto"` in version 1.3.
:pr:`23210` by :user:`Meekail Zain <micky774>`.

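- |API| The `affinity` parameter of :class:`cluster.AgglomerativeClustering` and
:class:`cluster.FeatureAgglomeration` is deprecated in v1.2 and will be removed in
v1.4; it is replaced by the new `metric` parameter, which should be used instead.
:pr:`23470` by :user:`Meekail Zain <micky774>`.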

:mod:`sklearn.datasets`
.......................

Expand Down
4 changes: 2 additions & 2 deletions examples/cluster/plot_agglomerative_clustering_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def sqr(x):
# Plot clustering results
for index, metric in enumerate(["cosine", "euclidean", "cityblock"]):
model = AgglomerativeClustering(
n_clusters=n_clusters, linkage="average", affinity=metric
n_clusters=n_clusters, linkage="average", metric=metric
)
model.fit(X)
plt.figure()
Expand All @@ -134,7 +134,7 @@ def sqr(x):
plt.plot(X[model.labels_ == l].T, c=c, alpha=0.5)
plt.axis("tight")
plt.axis("off")
plt.suptitle("AgglomerativeClustering(affinity=%s)" % metric, size=20)
plt.suptitle("AgglomerativeClustering(metric=%s)" % metric, size=20)


plt.show()
2 changes: 1 addition & 1 deletion examples/cluster/plot_cluster_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@
)
average_linkage = cluster.AgglomerativeClustering(
linkage="average",
affinity="cityblock",
metric="cityblock",
n_clusters=params["n_clusters"],
connectivity=connectivity,
)
Expand Down
76 changes: 68 additions & 8 deletions sklearn/cluster/_agglomerative.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
from ..utils import check_array
from ..utils._fast_dict import IntFloatDict
from ..utils.graph import _fix_connected_components
from ..utils._param_validation import Interval, StrOptions
from ..utils._param_validation import Hidden, Interval, StrOptions
from ..utils.validation import check_memory

# mypy error: Module 'sklearn.cluster' has no attribute '_hierarchical_fast'
Expand Down Expand Up @@ -760,6 +760,19 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):
If "precomputed", a distance matrix (instead of a similarity matrix)
is needed as input for the fit method.

.. deprecated:: 1.2
`affinity` was deprecated in version 1.2 and will be renamed to
`metric` in 1.4.

metric : str or callable, default=None
Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
"manhattan", "cosine", or "precomputed". If set to `None` then
"euclidean" is used. If linkage is "ward", only "euclidean" is
accepted. If "precomputed", a distance matrix is needed as input for
the fit method.

.. versionadded:: 1.2

memory : str or object with the joblib.Memory interface, default=None
Used to cache the output of the computation of the tree.
By default, no caching is done. If a string is given, it is the
Expand Down Expand Up @@ -880,9 +893,15 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator):
_parameter_constraints = {
"n_clusters": [Interval(Integral, 1, None, closed="left"), None],
"affinity": [
Hidden(StrOptions({"deprecated"})),
StrOptions(set(_VALID_METRICS) | {"precomputed"}),
callable,
],
"metric": [
StrOptions(set(_VALID_METRICS) | {"precomputed"}),
callable,
None,
],
"memory": "no_validation", # TODO
"connectivity": ["array-like", callable, None],
"compute_full_tree": [StrOptions({"auto"}), "boolean"],
Expand All @@ -895,7 +914,8 @@ def __init__(
self,
n_clusters=2,
*,
affinity="euclidean",
affinity="deprecated", # TODO(1.4): Remove
metric=None, # TODO(1.4): Set to "euclidean"
memory=None,
connectivity=None,
compute_full_tree="auto",
Expand All @@ -910,6 +930,7 @@ def __init__(
self.compute_full_tree = compute_full_tree
self.linkage = linkage
self.affinity = affinity
self.metric = metric
self.compute_distances = compute_distances

def fit(self, X, y=None):
Expand All @@ -920,7 +941,7 @@ def fit(self, X, y=None):
X : array-like, shape (n_samples, n_features) or \
(n_samples, n_samples)
Training instances to cluster, or distances between instances if
``affinity='precomputed'``.
``metric='precomputed'``.

y : Ignored
Not used, present here for API consistency by convention.
Expand Down Expand Up @@ -950,6 +971,24 @@ def _fit(self, X):
"""
memory = check_memory(self.memory)

self._metric = self.metric
# TODO(1.4): Remove
if self.affinity != "deprecated":
if self.metric is not None:
raise ValueError(
"Both `affinity` and `metric` attributes were set. Attribute"
" `affinity` was deprecated in version 1.2 and will be removed in"
" 1.4. To avoid this error, only set the `metric` attribute."
)
warnings.warn(
"Attribute `affinity` was deprecated in version 1.2 and will be removed"
" in 1.4. Use `metric` instead",
FutureWarning,
)
self._metric = self.affinity
elif self.metric is None:
self._metric = "euclidean"

if not ((self.n_clusters is None) ^ (self.distance_threshold is None)):
raise ValueError(
"Exactly one of n_clusters and "
Expand All @@ -962,10 +1001,10 @@ def _fit(self, X):
"compute_full_tree must be True if distance_threshold is set."
)

if self.linkage == "ward" and self.affinity != "euclidean":
if self.linkage == "ward" and self._metric != "euclidean":
raise ValueError(
"%s was provided as affinity. Ward can only "
"work with euclidean distances." % (self.affinity,)
f"{self._metric} was provided as metric. Ward can only "
"work with euclidean distances."
)

tree_builder = _TREE_BUILDERS[self.linkage]
Expand Down Expand Up @@ -998,7 +1037,7 @@ def _fit(self, X):
kwargs = {}
if self.linkage != "ward":
kwargs["linkage"] = self.linkage
kwargs["affinity"] = self.affinity
kwargs["affinity"] = self._metric

distance_threshold = self.distance_threshold

Expand Down Expand Up @@ -1084,6 +1123,19 @@ class FeatureAgglomeration(
If "precomputed", a distance matrix (instead of a similarity matrix)
is needed as input for the fit method.

.. deprecated:: 1.2
`affinity` was deprecated in version 1.2 and will be renamed to
`metric` in 1.4.

metric : str or callable, default=None
Metric used to compute the linkage. Can be "euclidean", "l1", "l2",
"manhattan", "cosine", or "precomputed". If set to `None` then
"euclidean" is used. If linkage is "ward", only "euclidean" is
accepted. If "precomputed", a distance matrix is needed as input for
the fit method.

.. versionadded:: 1.2

memory : str or object with the joblib.Memory interface, default=None
Used to cache the output of the computation of the tree.
By default, no caching is done. If a string is given, it is the
Expand Down Expand Up @@ -1208,8 +1260,14 @@ class FeatureAgglomeration(
_parameter_constraints = {
"n_clusters": [Interval(Integral, 1, None, closed="left"), None],
"affinity": [
Hidden(StrOptions({"deprecated"})),
StrOptions(set(_VALID_METRICS) | {"precomputed"}),
callable,
],
"metric": [
StrOptions(set(_VALID_METRICS) | {"precomputed"}),
callable,
None,
],
"memory": "no_validation", # TODO
"connectivity": ["array-like", callable, None],
Expand All @@ -1224,7 +1282,8 @@ def __init__(
self,
n_clusters=2,
*,
affinity="euclidean",
affinity="deprecated", # TODO(1.4): Remove
metric=None, # TODO(1.4): Set to "euclidean"
memory=None,
connectivity=None,
compute_full_tree="auto",
Expand All @@ -1240,6 +1299,7 @@ def __init__(
compute_full_tree=compute_full_tree,
linkage=linkage,
affinity=affinity,
metric=metric,
distance_threshold=distance_threshold,
compute_distances=compute_distances,
)
Expand Down
37 changes: 30 additions & 7 deletions sklearn/cluster/tests/test_hierarchical.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,24 +241,24 @@ def test_agglomerative_clustering():
clustering = AgglomerativeClustering(
n_clusters=10,
connectivity=connectivity.toarray(),
affinity="manhattan",
metric="manhattan",
linkage="ward",
)
with pytest.raises(ValueError):
clustering.fit(X)

# Test using another metric than euclidean works with linkage complete
for affinity in PAIRED_DISTANCES.keys():
for metric in PAIRED_DISTANCES.keys():
# Compare our (structured) implementation to scipy
clustering = AgglomerativeClustering(
n_clusters=10,
connectivity=np.ones((n_samples, n_samples)),
affinity=affinity,
metric=metric,
linkage="complete",
)
clustering.fit(X)
clustering2 = AgglomerativeClustering(
n_clusters=10, connectivity=None, affinity=affinity, linkage="complete"
n_clusters=10, connectivity=None, metric=metric, linkage="complete"
)
clustering2.fit(X)
assert_almost_equal(
Expand All @@ -275,7 +275,7 @@ def test_agglomerative_clustering():
clustering2 = AgglomerativeClustering(
n_clusters=10,
connectivity=connectivity,
affinity="precomputed",
metric="precomputed",
linkage="complete",
)
clustering2.fit(X_dist)
Expand All @@ -289,7 +289,7 @@ def test_agglomerative_clustering_memory_mapped():
"""
rng = np.random.RandomState(0)
Xmm = create_memmap_backed_data(rng.randn(50, 100))
AgglomerativeClustering(affinity="euclidean", linkage="single").fit(Xmm)
AgglomerativeClustering(metric="euclidean", linkage="single").fit(Xmm)


def test_ward_agglomeration():
Expand Down Expand Up @@ -860,7 +860,7 @@ def test_invalid_shape_precomputed_dist_matrix():
ValueError,
match=r"Distance matrix should be square, got matrix of shape \(5, 3\)",
):
AgglomerativeClustering(affinity="precomputed", linkage="complete").fit(X)
AgglomerativeClustering(metric="precomputed", linkage="complete").fit(X)


def test_precomputed_connectivity_affinity_with_2_connected_components():
Expand Down Expand Up @@ -900,3 +900,26 @@ def test_precomputed_connectivity_affinity_with_2_connected_components():

assert_array_equal(clusterer.labels_, clusterer_precomputed.labels_)
assert_array_equal(clusterer.children_, clusterer_precomputed.children_)


# TODO(1.4): Remove
def test_deprecate_affinity():
    """Check the deprecation path of `affinity` in AgglomerativeClustering.

    Setting only `affinity` must emit a FutureWarning from both `fit` and
    `fit_predict`; setting `affinity` together with `metric` must raise a
    ValueError from both entry points.
    """
    data = np.random.RandomState(42).randn(50, 10)
    entry_points = ("fit", "fit_predict")

    # Only the deprecated parameter is set: expect the deprecation warning.
    deprecation_msg = (
        "Attribute `affinity` was deprecated in version 1.2 and will be removed in 1.4."
        " Use `metric` instead"
    )
    estimator = AgglomerativeClustering(affinity="euclidean")
    for method_name in entry_points:
        with pytest.warns(FutureWarning, match=deprecation_msg):
            getattr(estimator, method_name)(data)

    # Both the deprecated and the new parameter are set: expect an error.
    conflict_msg = "Both `affinity` and `metric` attributes were set. Attribute"
    estimator = AgglomerativeClustering(metric="euclidean", affinity="euclidean")
    for method_name in entry_points:
        with pytest.raises(ValueError, match=conflict_msg):
            getattr(estimator, method_name)(data)