diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index db4aa1b3250a3..fe4855eb88370 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -76,6 +76,11 @@ Support for Python 3.4 and below has been officially dropped. to set and that scales better, by :user:`Shane <espg>`, :user:`Adrin Jalali <adrinjalali>`, and :user:`Erich Schubert <kno10>`. +- |API| The ``n_components_`` attribute in :class:`cluster.AgglomerativeClustering` + and :class:`cluster.FeatureAgglomeration` has been renamed to + ``n_connected_components_``. + :issue:`13427` by :user:`Stephane Couvreur <scouvreur>`. + :mod:`sklearn.datasets` ....................... diff --git a/sklearn/cluster/hierarchical.py b/sklearn/cluster/hierarchical.py index f596bd00648f5..03c5a7d33acbf 100644 --- a/sklearn/cluster/hierarchical.py +++ b/sklearn/cluster/hierarchical.py @@ -23,11 +23,11 @@ from ._feature_agglomeration import AgglomerationTransform from ..utils.fast_dict import IntFloatDict from ..utils.fixes import _astype_copy_false +from ..utils import deprecated ############################################################################### # For non fully-connected graphs - def _fix_connectivity(X, connectivity, affinity): """ Fixes the connectivity matrix @@ -54,15 +54,15 @@ def _fix_connectivity(X, connectivity, affinity): connectivity = connectivity.tolil() # Compute the number of nodes - n_components, labels = connected_components(connectivity) + n_connected_components, labels = connected_components(connectivity) - if n_components > 1: + if n_connected_components > 1: warnings.warn("the number of connected components of the " "connectivity matrix is %d > 1. Completing it to avoid " - "stopping the tree early." % n_components, + "stopping the tree early." % n_connected_components, stacklevel=2) # XXX: Can we do without completing the matrix? 
- for i in range(n_components): + for i in range(n_connected_components): idx_i = np.where(labels == i)[0] Xi = X[idx_i] for j in range(i): @@ -75,11 +75,11 @@ def _fix_connectivity(X, connectivity, affinity): connectivity[idx_i[ii], idx_j[jj]] = True connectivity[idx_j[jj], idx_i[ii]] = True - return connectivity, n_components + return connectivity, n_connected_components def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, - n_components, return_distance): + n_connected_components, return_distance): """ Perform single linkage clustering on sparse data via the minimum spanning tree from scipy.sparse.csgraph, then using union-find to label. @@ -125,8 +125,8 @@ def _single_linkage_tree(connectivity, n_samples, n_nodes, n_clusters, if return_distance: distances = single_linkage_tree[:, 2] - return children_, n_components, n_samples, parent, distances - return children_, n_components, n_samples, parent + return children_, n_connected_components, n_samples, parent, distances + return children_, n_connected_components, n_samples, parent ############################################################################### @@ -177,7 +177,7 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False): at the i-th iteration, children[i][0] and children[i][1] are merged to form node `n_samples + i` - n_components : int + n_connected_components : int The number of connected components in the graph. 
n_leaves : int @@ -239,8 +239,9 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False): else: return children_, 1, n_samples, None - connectivity, n_components = _fix_connectivity(X, connectivity, - affinity='euclidean') + connectivity, n_connected_components = _fix_connectivity( + X, connectivity, + affinity='euclidean') if n_clusters is None: n_nodes = 2 * n_samples - 1 else: @@ -333,9 +334,9 @@ def ward_tree(X, connectivity=None, n_clusters=None, return_distance=False): if return_distance: # 2 is scaling factor to compare w/ unstructured version distances = np.sqrt(2. * distances) - return children, n_components, n_leaves, parent, distances + return children, n_connected_components, n_leaves, parent, distances else: - return children, n_components, n_leaves, parent + return children, n_connected_components, n_leaves, parent # single average and complete linkage @@ -396,7 +397,7 @@ def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete', at the i-th iteration, children[i][0] and children[i][1] are merged to form node `n_samples + i` - n_components : int + n_connected_components : int The number of connected components in the graph. 
n_leaves : int @@ -467,9 +468,9 @@ def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete', return children_, 1, n_samples, None, distances return children_, 1, n_samples, None - connectivity, n_components = _fix_connectivity(X, connectivity, - affinity=affinity) - + connectivity, n_connected_components = _fix_connectivity( + X, connectivity, + affinity=affinity) connectivity = connectivity.tocoo() # Put the diagonal to zero diag_mask = (connectivity.row != connectivity.col) @@ -497,7 +498,8 @@ def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete', if linkage == 'single': return _single_linkage_tree(connectivity, n_samples, n_nodes, - n_clusters, n_components, return_distance) + n_clusters, n_connected_components, + return_distance) if return_distance: distances = np.empty(n_nodes - n_samples) @@ -567,8 +569,8 @@ def linkage_tree(X, connectivity=None, n_clusters=None, linkage='complete', children = np.array(children)[:, ::-1] if return_distance: - return children, n_components, n_leaves, parent, distances - return children, n_components, n_leaves, parent + return children, n_connected_components, n_leaves, parent, distances + return children, n_connected_components, n_leaves, parent # Matching names to tree-building strategies @@ -717,7 +719,7 @@ class AgglomerativeClustering(BaseEstimator, ClusterMixin): n_leaves_ : int Number of leaves in the hierarchical tree. - n_components_ : int + n_connected_components_ : int The estimated number of connected components in the graph. 
children_ : array-like, shape (n_samples-1, 2) @@ -756,6 +758,13 @@ def __init__(self, n_clusters=2, affinity="euclidean", self.affinity = affinity self.pooling_func = pooling_func + @property + @deprecated("The ``n_components_`` attribute was deprecated " + "in favor of ``n_connected_components_`` in 0.21 " + "and will be removed in 0.23.") + def n_components_(self): + return self.n_connected_components_ + def fit(self, X, y=None): """Fit the hierarchical clustering on the data @@ -819,10 +828,11 @@ def fit(self, X, y=None): if self.linkage != 'ward': kwargs['linkage'] = self.linkage kwargs['affinity'] = self.affinity - self.children_, self.n_components_, self.n_leaves_, parents = \ - memory.cache(tree_builder)(X, connectivity, - n_clusters=n_clusters, - **kwargs) + (self.children_, self.n_connected_components_, self.n_leaves_, + parents) = memory.cache(tree_builder)(X, connectivity, + n_clusters=n_clusters, + **kwargs) + # Cut the tree if compute_full_tree: self.labels_ = _hc_cut(self.n_clusters, self.children_, @@ -902,7 +912,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): n_leaves_ : int Number of leaves in the hierarchical tree. - n_components_ : int + n_connected_components_ : int The estimated number of connected components in the graph. 
children_ : array-like, shape (n_nodes-1, 2) diff --git a/sklearn/cluster/tests/test_hierarchical.py b/sklearn/cluster/tests/test_hierarchical.py index 98e6ef09ce2ee..e50b40e9747cc 100644 --- a/sklearn/cluster/tests/test_hierarchical.py +++ b/sklearn/cluster/tests/test_hierarchical.py @@ -598,3 +598,17 @@ def increment(self, *args, **kwargs): linkage_tree(X, connectivity=connectivity, affinity=fa.increment) assert_equal(fa.counter, 3) + + +def test_n_components_deprecation(): + # Test that a Deprecation warning is thrown when n_components_ + # attribute is accessed + + X = np.array([[1, 2], [1, 4], [1, 0], [4, 2]]) + agc = AgglomerativeClustering().fit(X) + + match = ("``n_components_`` attribute was deprecated " + "in favor of ``n_connected_components_``") + with pytest.warns(DeprecationWarning, match=match): + n = agc.n_components_ + assert n == agc.n_connected_components_