diff --git a/maint_tools/test_docstrings.py b/maint_tools/test_docstrings.py index e36225a43d7ce..8a3512afb2b0e 100644 --- a/maint_tools/test_docstrings.py +++ b/maint_tools/test_docstrings.py @@ -9,10 +9,8 @@ # List of modules ignored when checking for numpydoc validation. DOCSTRING_IGNORE_LIST = [ - "AgglomerativeClustering", "BernoulliRBM", "Birch", - "FeatureAgglomeration", "FeatureHasher", "FeatureUnion", "FunctionTransformer", diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 3eded0b6c822c..8c84b061f0490 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -34,12 +34,38 @@ def _fix_connectivity(X, connectivity, affinity): """ - Fixes the connectivity matrix + Fixes the connectivity matrix. - - copies it - - makes it symmetric - - converts it to LIL if necessary - - completes it if necessary + The different steps are: + + - copies it + - makes it symmetric + - converts it to LIL if necessary + - completes it if necessary. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Feature matrix representing `n_samples` samples to be clustered. + + connectivity : sparse matrix, default=None + Connectivity matrix. Defines for each sample the neighboring samples + following a given structure of the data. The matrix is assumed to + be symmetric and only the upper triangular half is used. + Default is `None`, i.e, the Ward algorithm is unstructured. + + affinity : {"euclidean", "precomputed"}, default="euclidean" + Which affinity to use. At the moment `precomputed` and + ``euclidean`` are supported. `euclidean` uses the + negative squared Euclidean distance between points. + + Returns + ------- + connectivity : sparse matrix + The fixed connectivity matrix. + + n_connected_components : int + The number of connected components in the graph. 
""" n_samples = X.shape[0] if connectivity.shape[0] != n_samples or connectivity.shape[1] != n_samples: @@ -157,24 +183,25 @@ def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False): Parameters ---------- X : array-like of shape (n_samples, n_features) - feature matrix representing n_samples samples to be clustered + Feature matrix representing `n_samples` samples to be clustered. connectivity : sparse matrix, default=None - connectivity matrix. Defines for each sample the neighboring samples + Connectivity matrix. Defines for each sample the neighboring samples following a given structure of the data. The matrix is assumed to be symmetric and only the upper triangular half is used. Default is None, i.e, the Ward algorithm is unstructured. n_clusters : int, default=None - Stop early the construction of the tree at n_clusters. This is - useful to decrease computation time if the number of clusters is - not small compared to the number of samples. In this case, the - complete tree is not computed, thus the 'children' output is of - limited use, and the 'parents' output should rather be used. - This option is valid only when specifying a connectivity matrix. + `n_clusters` should be less than `n_samples`. Stop early the + construction of the tree at `n_clusters.` This is useful to decrease + computation time if the number of clusters is not small compared to the + number of samples. In this case, the complete tree is not computed, thus + the 'children' output is of limited use, and the 'parents' output should + rather be used. This option is valid only when specifying a connectivity + matrix. - return_distance : bool, default=None - If True, return the distance between the clusters. + return_distance : bool, default=False + If `True`, return the distance between the clusters. 
Returns ------- @@ -184,24 +211,24 @@ def ward_tree(X, *, connectivity=None, n_clusters=None, return_distance=False): A node `i` greater than or equal to `n_samples` is a non-leaf node and has children `children_[i - n_samples]`. Alternatively at the i-th iteration, children[i][0] and children[i][1] - are merged to form node `n_samples + i` + are merged to form node `n_samples + i`. n_connected_components : int The number of connected components in the graph. n_leaves : int - The number of leaves in the tree + The number of leaves in the tree. parents : ndarray of shape (n_nodes,) or None The parent of each node. Only returned when a connectivity matrix is specified, elsewhere 'None' is returned. distances : ndarray of shape (n_nodes-1,) - Only returned if return_distance is set to True (for compatibility). + Only returned if `return_distance` is set to `True` (for compatibility). The distances between the centers of the nodes. `distances[i]` - corresponds to a weighted euclidean distance between + corresponds to a weighted Euclidean distance between the nodes `children[i, 1]` and `children[i, 2]`. If the nodes refer to - leaves of the tree, then `distances[i]` is their unweighted euclidean + leaves of the tree, then `distances[i]` is their unweighted Euclidean distance. Distances are updated in the following way (from scipy.hierarchy.linkage): @@ -375,16 +402,16 @@ def linkage_tree( Parameters ---------- X : array-like of shape (n_samples, n_features) - feature matrix representing n_samples samples to be clustered + Feature matrix representing `n_samples` samples to be clustered. connectivity : sparse matrix, default=None - connectivity matrix. Defines for each sample the neighboring samples + Connectivity matrix. Defines for each sample the neighboring samples following a given structure of the data. The matrix is assumed to be symmetric and only the upper triangular half is used. - Default is None, i.e, the Ward algorithm is unstructured. 
+ Default is `None`, i.e., the Ward algorithm is unstructured. n_clusters : int, default=None - Stop early the construction of the tree at n_clusters. This is + Stop early the construction of the tree at `n_clusters`. This is useful to decrease computation time if the number of clusters is not small compared to the number of samples. In this case, the complete tree is not computed, thus the 'children' output is of @@ -394,16 +421,16 @@ def linkage_tree( linkage : {"average", "complete", "single"}, default="complete" Which linkage criteria to use. The linkage criterion determines which distance to use between sets of observation. - - average uses the average of the distances of each observation of - the two sets - - complete or maximum linkage uses the maximum distances between + - "average" uses the average of the distances of each observation of + the two sets. + - "complete" or maximum linkage uses the maximum distances between all observations of the two sets. - - single uses the minimum of the distances between all observations - of the two sets. + - "single" uses the minimum of the distances between all + observations of the two sets. - affinity : str or callable, default="euclidean". - which metric to use. Can be "euclidean", "manhattan", or any - distance know to paired distance (see metric.pairwise) + affinity : str or callable, default='euclidean' + Which metric to use. Can be 'euclidean', 'manhattan', or any + distance known to paired distance (see metric.pairwise). return_distance : bool, default=False whether or not to return the distances between the clusters. @@ -416,7 +443,7 @@ def linkage_tree( A node `i` greater than or equal to `n_samples` is a non-leaf node and has children `children_[i - n_samples]`. Alternatively at the i-th iteration, children[i][0] and children[i][1] - are merged to form node `n_samples + i` + are merged to form node `n_samples + i`. n_connected_components : int The number of connected components in the graph. 
@@ -429,7 +456,7 @@ def linkage_tree( is specified, elsewhere 'None' is returned. distances : ndarray of shape (n_nodes-1,) - Returned when return_distance is set to True. + Returned when `return_distance` is set to `True`. distances[i] refers to the distance between children[i][0] and children[i][1] when they are merged. @@ -673,7 +700,7 @@ def _hc_cut(n_clusters, children, n_leaves): A node `i` greater than or equal to `n_samples` is a non-leaf node and has children `children_[i - n_samples]`. Alternatively at the i-th iteration, children[i][0] and children[i][1] - are merged to form node `n_samples + i` + are merged to form node `n_samples + i`. n_leaves : int Number of leaves of the tree. @@ -681,8 +708,7 @@ def _hc_cut(n_clusters, children, n_leaves): Returns ------- labels : array [n_samples] - cluster labels for each point - + Cluster labels for each point. """ if n_clusters > n_leaves: raise ValueError( @@ -713,10 +739,9 @@ def _hc_cut(n_clusters, children, n_leaves): class AgglomerativeClustering(ClusterMixin, BaseEstimator): """ - Agglomerative Clustering + Agglomerative Clustering. - Recursively merges the pair of clusters that minimally increases - a given linkage distance. + Recursively merges pair of clusters of sample data; uses linkage distance. Read more in the :ref:`User Guide `. @@ -743,7 +768,7 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): samples following a given structure of the data. This can be a connectivity matrix itself or a callable that transforms the data into a connectivity matrix, such as derived from - kneighbors_graph. Default is ``None``, i.e, the + `kneighbors_graph`. Default is ``None``, i.e, the hierarchical clustering algorithm is unstructured. compute_full_tree : 'auto' or bool, default='auto' @@ -796,7 +821,7 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): ``n_clusters``. labels_ : ndarray of shape (n_samples) - cluster labels for each point + Cluster labels for each point. 
n_leaves_ : int Number of leaves in the hierarchical tree. @@ -824,13 +849,19 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): A node `i` greater than or equal to `n_samples` is a non-leaf node and has children `children_[i - n_samples]`. Alternatively at the i-th iteration, children[i][0] and children[i][1] - are merged to form node `n_samples + i` + are merged to form node `n_samples + i`. distances_ : array-like of shape (n_nodes-1,) Distances between nodes in the corresponding place in `children_`. Only computed if `distance_threshold` is used or `compute_distances` is set to `True`. + See Also + -------- + FeatureAgglomeration : Agglomerative clustering but for features instead of + samples. + ward_tree : Hierarchical clustering with ward linkage. + Examples -------- >>> from sklearn.cluster import AgglomerativeClustering @@ -842,7 +873,6 @@ class AgglomerativeClustering(ClusterMixin, BaseEstimator): AgglomerativeClustering() >>> clustering.labels_ array([1, 1, 1, 0, 0, 0]) - """ def __init__( @@ -871,7 +901,8 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like, shape (n_samples, n_features) or (n_samples, n_samples) + X : array-like, shape (n_samples, n_features) or \ + (n_samples, n_samples) Training instances to cluster, or distances between instances if ``affinity='precomputed'``. @@ -880,7 +911,8 @@ def fit(self, X, y=None): Returns ------- - self + self : object + Returns the fitted instance. """ X = self._validate_data(X, ensure_min_samples=2, estimator=self) memory = check_memory(self.memory) @@ -983,8 +1015,10 @@ def fit(self, X, y=None): return self def fit_predict(self, X, y=None): - """Fit the hierarchical clustering from features or distance matrix, - and return cluster labels. + """Fit and return the result of each sample's clustering assignment. + + In addition to fitting, this method also returns the result of the + clustering assignment for each sample in the training set. 
Parameters ---------- @@ -1007,8 +1041,7 @@ def fit_predict(self, X, y=None): class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): """Agglomerate features. - Similar to AgglomerativeClustering, but recursively merges features - instead of samples. + Recursively merges pair of clusters of features. Read more in the :ref:`User Guide `. @@ -1033,11 +1066,11 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): features following a given structure of the data. This can be a connectivity matrix itself or a callable that transforms the data into a connectivity matrix, such as derived from - kneighbors_graph. Default is None, i.e, the + `kneighbors_graph`. Default is `None`, i.e, the hierarchical clustering algorithm is unstructured. compute_full_tree : 'auto' or bool, default='auto' - Stop early the construction of the tree at n_clusters. This is useful + Stop early the construction of the tree at `n_clusters`. This is useful to decrease computation time if the number of clusters is not small compared to the number of features. This option is useful only when specifying a connectivity matrix. Note also that when varying the @@ -1048,17 +1081,17 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): is inferior to the maximum between 100 or `0.02 * n_samples`. Otherwise, "auto" is equivalent to `False`. - linkage : {'ward', 'complete', 'average', 'single'}, default='ward' + linkage : {"ward", "complete", "average", "single"}, default="ward" Which linkage criterion to use. The linkage criterion determines which distance to use between sets of features. The algorithm will merge the pairs of cluster that minimize this criterion. - - ward minimizes the variance of the clusters being merged. - - average uses the average of the distances of each feature of - the two sets. - - complete or maximum linkage uses the maximum distances between + - "ward" minimizes the variance of the clusters being merged. 
+ - "complete" or maximum linkage uses the maximum distances between all features of the two sets. - - single uses the minimum of the distances between all features + - "average" uses the average of the distances of each feature of + the two sets. + - "single" uses the minimum of the distances between all features of the two sets. pooling_func : callable, default=np.mean @@ -1088,7 +1121,7 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): ``n_clusters``. labels_ : array-like of (n_features,) - cluster labels for each feature. + Cluster labels for each feature. n_leaves_ : int Number of leaves in the hierarchical tree. @@ -1116,13 +1149,19 @@ class FeatureAgglomeration(AgglomerativeClustering, AgglomerationTransform): A node `i` greater than or equal to `n_features` is a non-leaf node and has children `children_[i - n_features]`. Alternatively at the i-th iteration, children[i][0] and children[i][1] - are merged to form node `n_features + i` + are merged to form node `n_features + i`. distances_ : array-like of shape (n_nodes-1,) Distances between nodes in the corresponding place in `children_`. Only computed if `distance_threshold` is used or `compute_distances` is set to `True`. + See Also + -------- + AgglomerativeClustering : Agglomerative clustering samples instead of + features. + ward_tree : Hierarchical clustering with ward linkage. + Examples -------- >>> import numpy as np @@ -1164,18 +1203,23 @@ def __init__( self.pooling_func = pooling_func def fit(self, X, y=None, **params): - """Fit the hierarchical clustering on the data + """Fit the hierarchical clustering on the data. Parameters ---------- X : array-like of shape (n_samples, n_features) - The data + The data. y : Ignored + Not used, present here for API consistency by convention. + + **params : dictionary of keyword arguments + Additional fit parameters. Returns ------- - self + self : object + Returns the transformer. 
""" X = self._validate_data( X, @@ -1192,4 +1236,5 @@ def fit(self, X, y=None, **params): @property def fit_predict(self): + """Fit and return the result of each sample's clustering assignment.""" raise AttributeError diff --git a/sklearn/cluster/_feature_agglomeration.py b/sklearn/cluster/_feature_agglomeration.py index e6e03d57651b7..457a83dd41e71 100644 --- a/sklearn/cluster/_feature_agglomeration.py +++ b/sklearn/cluster/_feature_agglomeration.py @@ -17,16 +17,17 @@ class AgglomerationTransform(TransformerMixin): """ - A class for feature agglomeration via the transform interface + A class for feature agglomeration via the transform interface. """ def transform(self, X): """ - Transform a new matrix using the built clustering + Transform a new matrix using the built clustering. Parameters ---------- - X : array-like of shape (n_samples, n_features) or (n_samples,) + X : array-like of shape (n_samples, n_features) or \ + (n_samples, n_samples) A M by N array of M observations in N dimensions or a length M array of M one-dimensional observations. @@ -55,19 +56,17 @@ def transform(self, X): def inverse_transform(self, Xred): """ - Inverse the transformation. - Return a vector of size nb_features with the values of Xred assigned - to each group of features + Inverse the transformation and return a vector of size `n_features`. Parameters ---------- Xred : array-like of shape (n_samples, n_clusters) or (n_clusters,) - The values to be assigned to each cluster of samples + The values to be assigned to each cluster of samples. Returns ------- X : ndarray of shape (n_samples, n_features) or (n_features,) - A vector of size n_samples with the values of Xred assigned to + A vector of size `n_samples` with the values of `Xred` assigned to each of the cluster of samples. """ check_is_fitted(self)