ENH propagate eigen_tol to all eigen solver (#23210)

Micky774 · Joan Massich · glemaitre · web-flow · commit 1fbf5a0e1089 · 2022-06-01T23:03:43.000+02:00
Co-authored-by: Joan Massich <sik@visor.udg.edu>
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
Co-authored-by: Jérémie du Boisberranger <34657725+jeremiedbb@users.noreply.github.com>
diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst
@@ -19,6 +19,12 @@ parameters, may produce different models from the previous version. This often
 occurs due to changes in the modelling logic (bug fixes or enhancements), or in
 random sampling procedures.
 
+- |Enhancement| The default `eigen_tol` for :class:`cluster.SpectralClustering`,
+  :class:`manifold.SpectralEmbedding`, :func:`cluster.spectral_clustering`,
+  and :func:`manifold.spectral_embedding` is now `None` when using the `'amg'`
+  or `'lobpcg'` solvers. This change improves numerical stability of the
+  solver, but may result in a different model.
+
 - |Fix| :class:`manifold.TSNE` now throws a `ValueError` when fit with
   `perplexity>=n_samples` to ensure mathematical correctness of the algorithm.
   :pr:`10805` by :user:`Mathias Andersen <MrMathias>` and
@@ -72,6 +78,13 @@ Changelog
   and both will have their defaults changed to `n_init='auto'` in 1.4.
   :pr:`23038` by :user:`Meekail Zain <micky774>`.
 
+- |Enhancement| :class:`cluster.SpectralClustering` and
+  :func:`cluster.spectral_clustering` now propagate the `eigen_tol` parameter
+  to all choices of `eigen_solver`. Includes a new option `eigen_tol="auto"`
+  and begins deprecation to change the default from `eigen_tol=0` to
+  `eigen_tol="auto"` in version 1.3.
+  :pr:`23210` by :user:`Meekail Zain <micky774>`.
+
 :mod:`sklearn.datasets`
 .......................
 
@@ -148,6 +161,14 @@ Changelog
 :mod:`sklearn.manifold`
 .......................
 
+- |Enhancement| Adds `eigen_tol` parameter to
+  :class:`manifold.SpectralEmbedding`. Both :func:`manifold.spectral_embedding`
+  and :class:`manifold.SpectralEmbedding` now propagate `eigen_tol` to all
+  choices of `eigen_solver`. Includes a new option `eigen_tol="auto"`
+  and begins deprecation to change the default from `eigen_tol=0` to
+  `eigen_tol="auto"` in version 1.3.
+  :pr:`23210` by :user:`Meekail Zain <micky774>`.
+
 - |Fix| :class:`manifold.TSNE` now throws a `ValueError` when fit with
   `perplexity>=n_samples` to ensure mathematical correctness of the algorithm.
   :pr:`10805` by :user:`Mathias Andersen <MrMathias>` and
diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py
@@ -198,7 +198,7 @@ def spectral_clustering(
     eigen_solver=None,
     random_state=None,
     n_init=10,
-    eigen_tol=0.0,
+    eigen_tol="auto",
     assign_labels="kmeans",
     verbose=False,
 ):
@@ -259,9 +259,23 @@ def spectral_clustering(
         consecutive runs in terms of inertia. Only used if
         ``assign_labels='kmeans'``.
 
-    eigen_tol : float, default=0.0
-        Stopping criterion for eigendecomposition of the Laplacian matrix
-        when using arpack eigen_solver.
+    eigen_tol : float, default="auto"
+        Stopping criterion for eigendecomposition of the Laplacian matrix.
+        If `eigen_tol="auto"` then the passed tolerance will depend on the
+        `eigen_solver`:
+
+        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
+        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
+          `eigen_tol=None` which configures the underlying `lobpcg` solver to
+          automatically resolve the value according to their heuristics. See,
+          :func:`scipy.sparse.linalg.lobpcg` for details.
+
+        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
+        values of `tol<1e-5` may lead to convergence issues and should be
+        avoided.
+
+        .. versionadded:: 1.2
+           Added 'auto' option.
 
     assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
         The strategy to use to assign labels in the embedding
@@ -461,9 +475,23 @@ class SpectralClustering(ClusterMixin, BaseEstimator):
         Number of neighbors to use when constructing the affinity matrix using
         the nearest neighbors method. Ignored for ``affinity='rbf'``.
 
-    eigen_tol : float, default=0.0
-        Stopping criterion for eigendecomposition of the Laplacian matrix
-        when ``eigen_solver='arpack'``.
+    eigen_tol : float, default="auto"
+        Stopping criterion for eigendecomposition of the Laplacian matrix.
+        If `eigen_tol="auto"` then the passed tolerance will depend on the
+        `eigen_solver`:
+
+        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
+        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
+          `eigen_tol=None` which configures the underlying `lobpcg` solver to
+          automatically resolve the value according to their heuristics. See,
+          :func:`scipy.sparse.linalg.lobpcg` for details.
+
+        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
+        values of `tol<1e-5` may lead to convergence issues and should be
+        avoided.
+
+        .. versionadded:: 1.2
+           Added 'auto' option.
 
     assign_labels : {'kmeans', 'discretize', 'cluster_qr'}, default='kmeans'
         The strategy for assigning labels in the embedding space. There are two
@@ -598,7 +626,7 @@ def __init__(
         gamma=1.0,
         affinity="rbf",
         n_neighbors=10,
-        eigen_tol=0.0,
+        eigen_tol="auto",
         assign_labels="kmeans",
         degree=3,
         coef0=1,
@@ -694,7 +722,7 @@ def fit(self, X, y=None):
             include_boundaries="left",
         )
 
-        if self.eigen_solver == "arpack":
+        if self.eigen_tol != "auto":
             check_scalar(
                 self.eigen_tol,
                 "eigen_tol",
diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py
@@ -145,7 +145,7 @@ def spectral_embedding(
     n_components=8,
     eigen_solver=None,
     random_state=None,
-    eigen_tol=0.0,
+    eigen_tol="auto",
     norm_laplacian=True,
     drop_first=True,
 ):
@@ -197,9 +197,22 @@ def spectral_embedding(
             https://github.com/pyamg/pyamg/issues/139 for further
             information.
 
-    eigen_tol : float, default=0.0
-        Stopping criterion for eigendecomposition of the Laplacian matrix
-        when using arpack eigen_solver.
+    eigen_tol : float, default="auto"
+        Stopping criterion for eigendecomposition of the Laplacian matrix.
+        If `eigen_tol="auto"` then the passed tolerance will depend on the
+        `eigen_solver`:
+
+        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
+        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
+          `eigen_tol=None` which configures the underlying `lobpcg` solver to
+          automatically resolve the value according to their heuristics. See,
+          :func:`scipy.sparse.linalg.lobpcg` for details.
+
+        Note that when using `eigen_solver="amg"` values of `tol<1e-5` may lead
+        to convergence issues and should be avoided.
+
+        .. versionadded:: 1.2
+           Added 'auto' option.
 
     norm_laplacian : bool, default=True
         If True, then compute symmetric normalized Laplacian.
@@ -293,10 +306,11 @@ def spectral_embedding(
         try:
             # We are computing the opposite of the laplacian inplace so as
             # to spare a memory allocation of a possibly very large array
+            tol = 0 if eigen_tol == "auto" else eigen_tol
             laplacian *= -1
             v0 = _init_arpack_v0(laplacian.shape[0], random_state)
             _, diffusion_map = eigsh(
-                laplacian, k=n_components, sigma=1.0, which="LM", tol=eigen_tol, v0=v0
+                laplacian, k=n_components, sigma=1.0, which="LM", tol=tol, v0=v0
             )
             embedding = diffusion_map.T[n_components::-1]
             if norm_laplacian:
@@ -338,7 +352,9 @@ def spectral_embedding(
         X = random_state.standard_normal(size=(laplacian.shape[0], n_components + 1))
         X[:, 0] = dd.ravel()
         X = X.astype(laplacian.dtype)
-        _, diffusion_map = lobpcg(laplacian, X, M=M, tol=1.0e-5, largest=False)
+
+        tol = None if eigen_tol == "auto" else eigen_tol
+        _, diffusion_map = lobpcg(laplacian, X, M=M, tol=tol, largest=False)
         embedding = diffusion_map.T
         if norm_laplacian:
             # recover u = D^-1/2 x from the eigenvector output x
@@ -371,8 +387,9 @@ def spectral_embedding(
             )
             X[:, 0] = dd.ravel()
             X = X.astype(laplacian.dtype)
+            tol = None if eigen_tol == "auto" else eigen_tol
             _, diffusion_map = lobpcg(
-                laplacian, X, tol=1e-5, largest=False, maxiter=2000
+                laplacian, X, tol=tol, largest=False, maxiter=2000
             )
             embedding = diffusion_map.T[:n_components]
             if norm_laplacian:
@@ -444,6 +461,23 @@ class SpectralEmbedding(BaseEstimator):
         to be installed. It can be faster on very large, sparse problems.
         If None, then ``'arpack'`` is used.
 
+    eigen_tol : float, default="auto"
+        Stopping criterion for eigendecomposition of the Laplacian matrix.
+        If `eigen_tol="auto"` then the passed tolerance will depend on the
+        `eigen_solver`:
+
+        - If `eigen_solver="arpack"`, then `eigen_tol=0.0`;
+        - If `eigen_solver="lobpcg"` or `eigen_solver="amg"`, then
+          `eigen_tol=None` which configures the underlying `lobpcg` solver to
+          automatically resolve the value according to their heuristics. See,
+          :func:`scipy.sparse.linalg.lobpcg` for details.
+
+        Note that when using `eigen_solver="lobpcg"` or `eigen_solver="amg"`
+        values of `tol<1e-5` may lead to convergence issues and should be
+        avoided.
+
+        .. versionadded:: 1.2
+
     n_neighbors : int, default=None
         Number of nearest neighbors for nearest_neighbors graph building.
         If None, n_neighbors will be set to max(n_samples/10, 1).
@@ -516,6 +550,7 @@ def __init__(
         gamma=None,
         random_state=None,
         eigen_solver=None,
+        eigen_tol="auto",
         n_neighbors=None,
         n_jobs=None,
     ):
@@ -524,6 +559,7 @@ def __init__(
         self.gamma = gamma
         self.random_state = random_state
         self.eigen_solver = eigen_solver
+        self.eigen_tol = eigen_tol
         self.n_neighbors = n_neighbors
         self.n_jobs = n_jobs
 
@@ -641,6 +677,7 @@ def fit(self, X, y=None):
             affinity_matrix,
             n_components=self.n_components,
             eigen_solver=self.eigen_solver,
+            eigen_tol=self.eigen_tol,
             random_state=random_state,
         )
         return self
diff --git a/sklearn/manifold/tests/test_spectral_embedding.py b/sklearn/manifold/tests/test_spectral_embedding.py
@@ -1,23 +1,26 @@
+from unittest.mock import Mock
 import pytest
 
 import numpy as np
 
 from scipy import sparse
 from scipy.sparse import csgraph
 from scipy.linalg import eigh
+from scipy.sparse.linalg import eigsh
 
-from sklearn.manifold import SpectralEmbedding
+from sklearn.manifold import SpectralEmbedding, _spectral_embedding
 from sklearn.manifold._spectral_embedding import _graph_is_connected
 from sklearn.manifold._spectral_embedding import _graph_connected_component
 from sklearn.manifold import spectral_embedding
 from sklearn.metrics.pairwise import rbf_kernel
-from sklearn.metrics import normalized_mutual_info_score
+from sklearn.metrics import normalized_mutual_info_score, pairwise_distances
 from sklearn.neighbors import NearestNeighbors
 from sklearn.cluster import KMeans
 from sklearn.datasets import make_blobs
 from sklearn.utils.extmath import _deterministic_vector_sign_flip
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import assert_array_equal
+from sklearn.utils.fixes import lobpcg
 
 try:
     from pyamg import smoothed_aggregation_solver  # noqa
@@ -480,3 +483,28 @@ def test_error_pyamg_not_available():
     err_msg = "The eigen_solver was set to 'amg', but pyamg is not available."
     with pytest.raises(ValueError, match=err_msg):
         se_precomp.fit_transform(S)
+
+
+@pytest.mark.parametrize("solver", ["arpack", "amg", "lobpcg"])
+def test_spectral_eigen_tol_auto(monkeypatch, solver):
+    """Test that `eigen_tol="auto"` is resolved correctly"""
+    if solver == "amg" and not pyamg_available:
+        pytest.skip("PyAMG is not available.")
+    X, _ = make_blobs(
+        n_samples=200, random_state=0, centers=[[1, 1], [-1, -1]], cluster_std=0.01
+    )
+    D = pairwise_distances(X)  # Distance matrix
+    S = np.max(D) - D  # Similarity matrix
+
+    solver_func = eigsh if solver == "arpack" else lobpcg
+    default_value = 0 if solver == "arpack" else None
+    if solver == "amg":
+        S = sparse.csr_matrix(S)
+
+    mocked_solver = Mock(side_effect=solver_func)
+
+    monkeypatch.setattr(_spectral_embedding, solver_func.__qualname__, mocked_solver)
+
+    spectral_embedding(S, random_state=42, eigen_solver=solver, eigen_tol="auto")
+    mocked_solver.assert_called()
+    assert mocked_solver.call_args.kwargs["tol"] == default_value