From 891792c4510a6ed9490004b1a589e4a186626b0a Mon Sep 17 00:00:00 2001 From: Niklas Smedemark-Margulies Date: Wed, 11 Dec 2019 20:23:00 -0500 Subject: [PATCH 1/8] FIX use safe_sparse_dot for callable kernel in LabelSpreading (#15866) --- sklearn/semi_supervised/_label_propagation.py | 3 +- .../tests/test_label_propagation.py | 39 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 0ec687aae7d20..665b50dcfa507 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -195,7 +195,8 @@ class labels for weight_matrix in weight_matrices]) else: weight_matrices = weight_matrices.T - probabilities = np.dot(weight_matrices, self.label_distributions_) + probabilities = safe_sparse_dot( + weight_matrices, self.label_distributions_) normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T probabilities /= normalizer return probabilities diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 7e20350b20b2f..d95dea2522a3d 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -3,10 +3,12 @@ import numpy as np import pytest +from scipy.sparse import csr_matrix from sklearn.utils._testing import assert_warns from sklearn.utils._testing import assert_no_warnings from sklearn.semi_supervised import _label_propagation as label_propagation from sklearn.metrics.pairwise import rbf_kernel +from sklearn.neighbors import NearestNeighbors from sklearn.datasets import make_classification from sklearn.exceptions import ConvergenceWarning from numpy.testing import assert_array_almost_equal @@ -152,3 +154,40 @@ def test_convergence_warning(): mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=500) assert_no_warnings(mdl.fit, X, y) + + +def test_predict_sparse_callable_kernel(): + # This is a non-regression test for #15866 + + # Custom sparse kernel (top-K RBF) + def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5): + nn = NearestNeighbors(n_neighbors=10, metric='euclidean', n_jobs=-1) + nn.fit(X) + W = -1 * nn.kneighbors_graph(Y, mode='distance').power(2) * gamma + np.exp(W.data, out=W.data) + assert isinstance(W, csr_matrix) + return W.T + + n_classes = 4 + n_samples = 500 + n_test = 10 + X, Y = make_classification(n_classes=n_classes, + n_samples=n_samples, + n_features=20, + n_informative=20, + n_redundant=0, + n_repeated=0, + random_state=0) + + Xtrain = X[:n_samples - n_test] + Ytrain = Y[:n_samples - n_test] + Xtest = X[n_samples - n_test:] + Ytest = Y[n_samples - n_test:] + + model = label_propagation.LabelSpreading(kernel=topk_rbf) + model.fit(Xtrain, Ytrain) + + Ypred = model.predict(Xtest) + n_correct = np.sum(Ypred == Ytest) + + assert n_correct >= 0.9 * n_test From 3bd7e9f0e4874db824dd20803568256580c70476 Mon Sep 17 00:00:00 2001 From: Niklas Smedemark-Margulies Date: Thu, 12 Dec 2019 11:21:29 -0500 Subject: [PATCH 2/8] FIX use safe_sparse_dot for callable kernel in LabelSpreading (#15866) --- doc/whats_new/v0.22.rst | 8 ++++++++ .../semi_supervised/tests/test_label_propagation.py | 12 ++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index af08b832e9f6f..ccf93030d1edf 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -795,6 +795,14 @@ Changelog :pr:`13925` by 
:user:`Isaac S. Robson ` and :pr:`15524` by
 :user:`Xun Tang `.
 
+:mod:`sklearn.semi_supervised`
+.............................
+
+- |Fix| :class:`semi_supervised.LabelPropagation` and
+  :class:`semi_supervised.LabelSpreading` now allow a callable kernel
+  function to return a sparse weight matrix.
+  :pr:`15868` by :user:`Niklas Smedemark-Margulies `.
+
 :mod:`sklearn.svm`
 ..................
 
diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py
index d95dea2522a3d..219457f9edcf6 100644
--- a/sklearn/semi_supervised/tests/test_label_propagation.py
+++ b/sklearn/semi_supervised/tests/test_label_propagation.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 import pytest
-from scipy.sparse import csr_matrix
+from scipy.sparse import issparse
 from sklearn.utils._testing import assert_warns
 from sklearn.utils._testing import assert_no_warnings
 from sklearn.semi_supervised import _label_propagation as label_propagation
 from sklearn.metrics.pairwise import rbf_kernel
@@ -165,7 +165,7 @@ def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5):
         nn.fit(X)
         W = -1 * nn.kneighbors_graph(Y, mode='distance').power(2) * gamma
         np.exp(W.data, out=W.data)
-        assert isinstance(W, csr_matrix)
+        assert issparse(W)
         return W.T
 
     n_classes = 4
@@ -191,3 +191,11 @@ def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5):
     n_correct = np.sum(Ypred == Ytest)
 
     assert n_correct >= 0.9 * n_test
+
+    model = label_propagation.LabelPropagation(kernel=topk_rbf)
+    model.fit(Xtrain, Ytrain)
+
+    Ypred = model.predict(Xtest)
+    n_correct = np.sum(Ypred == Ytest)
+
+    assert n_correct >= 0.9 * n_test

From b8bc7d8f323ef1bc62e600726428ecbf77709199 Mon Sep 17 00:00:00 2001
From: Niklas Smedemark-Margulies 
Date: Thu, 12 Dec 2019 11:43:03 -0500
Subject: [PATCH 3/8] FIX use safe_sparse_dot for callable kernel in
 LabelSpreading (#15866)

---
 doc/whats_new/v0.22.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index ccf93030d1edf..104a3f3adc3df 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -796,7 +796,7 @@ Changelog
 :user:`Xun Tang `.
 
 :mod:`sklearn.semi_supervised`
-.............................
+..............................
 
 - |Fix| :class:`semi_supervised.LabelPropagation` and
   :class:`semi_supervised.LabelSpreading` now allow a callable kernel

From da586279291228112f10e909da504158b028370a Mon Sep 17 00:00:00 2001
From: Niklas Smedemark-Margulies 
Date: Fri, 13 Dec 2019 15:49:45 -0500
Subject: [PATCH 4/8] WIP - sparse RBF kernel

---
 .../compare_sparse_kernels_mnist.py           | 169 ++++++++++++++++++
 sklearn/semi_supervised/_label_propagation.py |  62 ++++---
 .../tests/test_label_propagation.py           |  96 +++++++++-
 3 files changed, 304 insertions(+), 23 deletions(-)
 create mode 100644 examples/semi_supervised/compare_sparse_kernels_mnist.py

diff --git a/examples/semi_supervised/compare_sparse_kernels_mnist.py b/examples/semi_supervised/compare_sparse_kernels_mnist.py
new file mode 100644
index 0000000000000..f633102adcbf9
--- /dev/null
+++ b/examples/semi_supervised/compare_sparse_kernels_mnist.py
@@ -0,0 +1,169 @@
+"""
+=================================================
+Label Propagation MNIST: Comparing Sparse Kernels
+=================================================
+
+This example compares the runtime and performance of two sparse kernels for
+semi-supervised learning on the MNIST digit dataset.
+
+The MNIST dataset consists of 28x28 pixel grayscale images. Here, we will use a
+subset of 10K images, reserving a fraction of these for testing.
We will +compare the performance and runtime of two sparse kernels, across a range of +low-supervision scenarios. + +In each scenario, we will run each model multiple times, to increase our +confidence in the comparison between kernels. + +The models will be evaluated for their accuracy at spreading labels during +training ("transductive learning"), as well as spreading labels to unseen +points at test time ("inductive learning"). + +The first kernel option produces a binary k-Nearest Neighbors adjacency matrix. +The second produces a kernel which is also k-sparse, but contains the same +weights as used in an RBF kernel. + +Notice that the performance of the sparse-RBF kernel is very sensitive to +parameters; the parameters used here were found by a quick manual search, so +the model can likely be improved with further optimization, and using this +kernel effectively on a new dataset requires hyperparameter tuning. +""" +import numpy as np +from sklearn.datasets import fetch_openml +from sklearn.semi_supervised import LabelSpreading +from sklearn.metrics import classification_report, confusion_matrix +from sklearn.model_selection import train_test_split +from sklearn.metrics import make_scorer +import time + +Xorig, Yorig = fetch_openml('mnist_784', version=1, return_X_y=True) +Yorig = Y.astype(int) + +# For a quick demonstration, use only a subset of the data +n_total = 10000 +X = Xorig[:n_total, :] +Y = Yorig[:n_total] + +# Save test set for inductive learning +test_fraction = 0.333 +Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=test_fraction, + random_state=0) + +# Mask subset of train data for transductive learning +n_train = len(Ytrain) +#kwargs = {'gamma': 1e-9, 'n_neighbors': 50, 'n_jobs': -1, 'max_iter': 100} + +#models = [LabelSpreading(kernel='knn', **kwargs), +# LabelSpreading(kernel='sparse-rbf', **kwargs)] + +#supervision_fractions = [0.001, 0.005, 0.01, 0.05, 0.1] + +# First, we perform a grid search to optimize parameters for sparse-rbf kernel. +# For this purpose, we use a smaller subset of the data. +# Notice also that we + +class WrapLabelSpreading(LabelSpreading): + """ + In order to perform a grid search over this semi-supervised model, + we need to provide a thin wrapper that masks a subset of the data before + `fit` is called. 
+ """ + def __init__(self, supervision_fraction, kernel='sparse-rbf', gamma=20, + n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=None): + + self.supervision_fraction = supervision_fraction + + super().__init__(kernel=kernel, gamma=gamma, + n_neighbors=n_neighbors, alpha=alpha, + max_iter=max_iter, tol=tol, n_jobs=n_jobs) + + def fit(self, X, y): + # mask a random subset of labels, based on self.supervision_fraction + n_total = len(y) + n_labeled = self.supervision_fraction * n_total + + indices = np.arange(n_total) + np.random.seed(0) + np.random.shuffle(indices) + unlabeled_subset = indices[n_labeled:] + + y[unlabeled_subset] = -1 + + super().fit(X,y) + return self + + +# In all cases, we simply use max_iter=100 +sparse_rbf_model = GridSearchCV(WrapLabelSpreading(kernel='sparse-rbf'), + param_grid= { + 'gamma': np.logspace(-8, 1, 10), + 'alpha': np.linspace(0, 1, 10), + 'n_neighbors': list(range(5,55,5))}) + +knn_model = GridSearchCV(WrapLabelSpreading(kernel='knn'), + param_grid= { + 'n_neighbors': list(range(5,55,5))}, + 'alpha': np.linspace(0, 1, 10), + ) + + +# Then, we compare the performance of optimized sparse-rbf kernel to knn kernel +supervision_fractions = [0.05, 0.1] +accuracies = { + 'transduction': { 'knn':[], 'sparse-rbf':[] }, + 'induction': { 'knn':[], 'sparse-rbf':[] } +} +for supervision_fraction in supervision_fractions: + supervision_fraction = 0.05 + n_labeled = int(supervision_fraction * n_train) + indices = np.arange(n_train) + unlabeled_set = indices[n_labeled:] + + Ymasked = np.copy(Ytrain) + Ymasked[unlabeled_set] = -1 + + for kernel_name, model in zip(['knn', 'sparse-rbf'], + [knn_model, sparse_rbf_model]): + knn_acc_trans = [] + knn_acc_ind = [] + sparse_rbf_acc_trans = [] + sparse_rbf_acc_ind = [] + # Repeat each scenario 5 times to collect rough statistics + # for _ in range(5): + print("="*80) + t0 = time.time() + print(f"MODEL: {model}") + model.fit(Xtrain, Ymasked) + t1 = time.time() + + predicted_labels = model.transduction_[unlabeled_set] + true_labels = Ytrain[unlabeled_set] + acc = np.sum(predicted_labels == true_labels) / len(unlabeled_set) + print(f"accuracy: {acc}") + + + + print("-"*80) + print(f"TRANSDUCTION: {n_labeled} labeled and " + + f"{n_train - n_labeled} unlabeled points ({n_train} total)") + print("-"*80) + print("Confusion Matrix:") + print(confusion_matrix(true_labels, predicted_labels, + labels=model.classes_)) + print("-"*80) + print("Classification Report:") + print(classification_report(true_labels, predicted_labels)) + print("-"*80) + + predicted_labels = model.predict(Xtest) + t2 = time.time() + + print("-"*80) + print(f"INDUCTION: {int(test_fraction * n_total)} test points") + print("-"*80) + print("Confusion Matrix:") + print(confusion_matrix(Ytest, predicted_labels, labels=model.classes_)) + print("-"*80) + print("Classification Report:") + print(classification_report(Ytest, predicted_labels)) + print("-"*80) + print(f"Runtimes: Transduction: {t1 - t0:.2f}s. Induction: {t2 - t1:.2f}s") diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 665b50dcfa507..7fd5ec7d2d345 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -23,10 +23,10 @@ Kernel: A function which projects a vector into some higher dimensional space. This - implementation supports RBF and KNN kernels. Using the RBF kernel generates - a dense matrix of size O(N^2). 
KNN kernel will generate a sparse matrix of
-  size O(k*N) which will run much faster. See the documentation for SVMs for
-  more info on kernels.
+  implementation supports RBF, Sparse-RBF, and KNN kernels. Using the RBF
+  kernel generates a dense matrix of size O(N^2). Sparse-RBF and KNN kernels
+  will generate a sparse matrix of size O(k*N), which will run much faster.
+  See the documentation for SVMs for more info on kernels.
 
 Examples
 --------
@@ -76,17 +76,19 @@ class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
 
     Parameters
     ----------
-    kernel : {'knn', 'rbf', callable}
+    kernel : {'knn', 'rbf', 'sparse-rbf', callable}
         String identifier for kernel function to use or the kernel function
-        itself. Only 'rbf' and 'knn' strings are valid inputs. The function
-        passed should take two inputs, each of shape [n_samples, n_features],
-        and return a [n_samples, n_samples] shaped weight matrix
+        itself. Only 'rbf', 'sparse-rbf', and 'knn' strings are valid inputs;
+        'sparse-rbf' calculates RBF weights for only the closest 'n_neighbors'
+        points. The callable function passed should take two inputs, each of
+        shape [n_samples, n_features], and return a [n_samples, n_samples]
+        shaped weight matrix.
 
     gamma : float
-        Parameter for rbf kernel
+        Parameter for rbf or sparse-rbf kernel
 
     n_neighbors : integer > 0
-        Parameter for knn kernel
+        Parameter for knn or sparse-rbf kernel
 
     alpha : float
         Clamping factor
@@ -127,6 +129,17 @@ def _get_kernel(self, X, y=None):
                 return rbf_kernel(X, X, gamma=self.gamma)
             else:
                 return rbf_kernel(X, y, gamma=self.gamma)
+        elif self.kernel == "sparse-rbf":
+            self.nn_fit = NearestNeighbors(self.n_neighbors,
+                                           n_jobs=self.n_jobs).fit(X)
+            # rbf(x1, x2) = exp(-gamma * ||x1 - x2||^2)
+            W = self.nn_fit.kneighbors_graph(y, mode='distance').T.power(2)
+            W *= -1 * self.gamma
+            np.exp(W.data, out=W.data)
+            # explicitly set diagonal,
+            # since np.exp(W.data) does not modify zeros on the diagonal
+            # (training case only: in the rectangular train-by-test graph
+            # built at predict time the diagonal pairs unrelated points)
+            if y is None:
+                W.setdiag(1)
+            return W
         elif self.kernel == "knn":
             if self.nn_fit is None:
                 self.nn_fit = NearestNeighbors(self.n_neighbors,
@@ -306,17 +319,19 @@ class LabelPropagation(BaseLabelPropagation):
 
     Parameters
     ----------
-    kernel : {'knn', 'rbf', callable}
+    kernel : {'knn', 'rbf', 'sparse-rbf', callable}
         String identifier for kernel function to use or the kernel function
-        itself. Only 'rbf' and 'knn' strings are valid inputs. The function
-        passed should take two inputs, each of shape [n_samples, n_features],
-        and return a [n_samples, n_samples] shaped weight matrix.
+        itself. Only 'rbf', 'sparse-rbf', and 'knn' strings are valid inputs;
+        'sparse-rbf' calculates RBF weights for only the closest 'n_neighbors'
+        points. The callable function passed should take two inputs, each of
+        shape [n_samples, n_features], and return a [n_samples, n_samples]
+        shaped weight matrix.
 
     gamma : float
-        Parameter for rbf kernel
+        Parameter for rbf or sparse-rbf kernel
 
     n_neighbors : integer > 0
-        Parameter for knn kernel
+        Parameter for knn or sparse-rbf kernel
 
     max_iter : integer
         Change maximum number of iterations allowed
@@ -412,17 +427,20 @@ class LabelSpreading(BaseLabelPropagation):
 
     Parameters
     ----------
-    kernel : {'knn', 'rbf', callable}
+    kernel : {'knn', 'rbf', 'sparse-rbf', callable}
         String identifier for kernel function to use or the kernel function
         itself.
Only 'rbf', 'sparse-rbf', and 'knn' strings are valid inputs; + 'sparse-rbf' calculates RBF weights for only the closest 'n_neighbors' + points. The callable function passed should take two inputs, each of + shape [n_samples, n_features], and return a [n_samples, n_samples] + shaped weight matrix. gamma : float - parameter for rbf kernel + Parameter for rbf or sparse-rbf kernel n_neighbors : integer > 0 - parameter for knn kernel + Parameter for knn or sparse-rbf kernel + alpha : float Clamping factor. A value in (0, 1) that specifies the relative amount diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 219457f9edcf6..1f3879b339aea 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -17,11 +17,15 @@ ESTIMATORS = [ (label_propagation.LabelPropagation, {'kernel': 'rbf'}), (label_propagation.LabelPropagation, {'kernel': 'knn', 'n_neighbors': 2}), + (label_propagation.LabelPropagation, { + 'kernel': 'sparse-rbf', 'gamma': 1e-4, 'n_neighbors': 2}), (label_propagation.LabelPropagation, { 'kernel': lambda x, y: rbf_kernel(x, y, gamma=20) }), (label_propagation.LabelSpreading, {'kernel': 'rbf'}), (label_propagation.LabelSpreading, {'kernel': 'knn', 'n_neighbors': 2}), + (label_propagation.LabelSpreading, { + 'kernel': 'sparse-rbf', 'gamma': 1e-4, 'n_neighbors': 2}), (label_propagation.LabelSpreading, { 'kernel': lambda x, y: rbf_kernel(x, y, gamma=20) }), @@ -64,7 +68,7 @@ def test_predict_proba(): for estimator, parameters in ESTIMATORS: clf = estimator(**parameters).fit(samples, labels) assert_array_almost_equal(clf.predict_proba([[1., 1.]]), - np.array([[0.5, 0.5]])) + np.array([[0.5, 0.5]]), 4) def test_label_spreading_closed_form(): @@ -199,3 +203,93 @@ def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5): n_correct = np.sum(Ypred == Ytest) assert n_correct >= 0.9 * n_test + + +def test_sparse_rbf_kernel(): + n_classes = 4 + n_samples = 500 + n_test = 10 + X, Y = make_classification(n_classes=n_classes, + n_samples=n_samples, + n_features=20, + n_informative=20, + n_redundant=0, + n_repeated=0, + random_state=0) + + Xtrain = X[:n_samples - n_test] + Ytrain = Y[:n_samples - n_test] + Xtest = X[n_samples - n_test:] + Ytest = Y[n_samples - n_test:] + + model = label_propagation.LabelSpreading(kernel='sparse-rbf', gamma=1e-5) + model.fit(Xtrain, Ytrain) + + Ypred = model.predict(Xtest) + n_correct = np.sum(Ypred == Ytest) + + assert n_correct >= 0.9 * n_test + + model = label_propagation.LabelPropagation(kernel='sparse-rbf', gamma=1e-5) + model.fit(Xtrain, Ytrain) + + Ypred = model.predict(Xtest) + n_correct = np.sum(Ypred == Ytest) + + assert n_correct >= 0.9 * n_test + + +def test_sparse_rbf_kernel_agrees_with_dense(): + + n_classes = 4 + n_samples = 500 + X, Y = make_classification(n_classes=n_classes, + n_samples=n_samples, + n_features=20, + n_informative=20, + n_redundant=0, + n_repeated=0, + random_state=0) + + gamma = 1e-5 + n_neighbors = 10 + + # Check LabelSpreading + # Make dense RBF kernel + dense_train = (label_propagation + .LabelSpreading(kernel='rbf', gamma=gamma) + ._get_kernel(X)) + # Keep top k+1 per column. 
(k neighbors + 1 for self) + ind = np.argpartition( + dense_train, kth=-(n_neighbors+1), axis=0)[:-(n_neighbors+1), :] + np.put_along_axis(dense_train, ind, 0, axis=0) + + # Make column-sparse RBF kernel + sparse_train = (label_propagation + .LabelSpreading(kernel='sparse-rbf', + gamma=gamma, + n_neighbors=n_neighbors) + ._get_kernel(X) + .toarray()) + + assert_array_almost_equal(dense_train, sparse_train) + + # Check LabelPropagation + # Make dense RBF kernel + dense_train = (label_propagation + .LabelPropagation(kernel='rbf', gamma=gamma) + ._get_kernel(X)) + # Keep top k+1 per column. (k neighbors + 1 for self) + ind = np.argpartition( + dense_train, kth=-(n_neighbors+1), axis=0)[:-(n_neighbors+1), :] + np.put_along_axis(dense_train, ind, 0, axis=0) + + # Make column-sparse RBF kernel + sparse_train = (label_propagation + .LabelPropagation(kernel='sparse-rbf', + gamma=gamma, + n_neighbors=n_neighbors) + ._get_kernel(X) + .toarray()) + + assert_array_almost_equal(dense_train, sparse_train) From 57df7a2a016d10b7fa6488a4469a2e22bcef2112 Mon Sep 17 00:00:00 2001 From: Niklas Smedemark-Margulies Date: Fri, 13 Dec 2019 18:06:53 -0500 Subject: [PATCH 5/8] WIP - sparse RBF kernel --- .../compare_sparse_kernels_mnist.py | 228 ++++++++++-------- 1 file changed, 122 insertions(+), 106 deletions(-) diff --git a/examples/semi_supervised/compare_sparse_kernels_mnist.py b/examples/semi_supervised/compare_sparse_kernels_mnist.py index f633102adcbf9..1d0bb86f301f0 100644 --- a/examples/semi_supervised/compare_sparse_kernels_mnist.py +++ b/examples/semi_supervised/compare_sparse_kernels_mnist.py @@ -20,7 +20,7 @@ The first kernel option produces a binary k-Nearest Neighbors adjacency matrix. The second produces a kernel which is also k-sparse, but contains the same -weights as used in an RBF kernel. +weights as used in an RBF kernel. Notice that the performance of the sparse-RBF kernel is very sensitive to parameters; the parameters used here were found by a quick manual search, so @@ -28,47 +28,136 @@ kernel effectively on a new dataset requires hyperparameter tuning. """ import numpy as np +from pprint import pprint from sklearn.datasets import fetch_openml +from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.semi_supervised import LabelSpreading -from sklearn.metrics import classification_report, confusion_matrix -from sklearn.model_selection import train_test_split -from sklearn.metrics import make_scorer import time -Xorig, Yorig = fetch_openml('mnist_784', version=1, return_X_y=True) -Yorig = Y.astype(int) -# For a quick demonstration, use only a subset of the data -n_total = 10000 -X = Xorig[:n_total, :] -Y = Yorig[:n_total] +def run_comparison(): + X_orig, y_orig = fetch_openml('mnist_784', version=1, return_X_y=True) + y_orig = y_orig.astype(int) + + # First, we use a small subset of the data to tune hyperparameters + n_total = 5000 + X = X_orig[:n_total, :] + y = y_orig[:n_total] + + test_fraction = 0.333 + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_fraction, random_state=0) + + # Mask subset of train data for transductive learning + + # We perform a grid search to optimize parameters for sparse-rbf + # kernel. For this purpose, we use a smaller subset of the data. In all + # cases, we simply use max_iter=100 Notice that we are searching over the + # inductive accuracy (accuracy on the test set) rather than the + # transductive accuracy (on the masked training examples). 
This keeps + # things a bit simpler, though we could customize the score function and + # the `WrapLabelSpreading` class further to also hyperparameter search over + # the transductive accuracy. + sparse_rbf_model = GridSearchCV( + WrapLabelSpreading(kernel='sparse-rbf', supervision_fraction=0.05), + param_grid={ + 'n_jobs': [-1], + 'max_iter': [100], + 'alpha': np.linspace(0.01, 0.50, 5), + 'gamma': np.logspace(-8, 1, 20), + 'n_neighbors': list(range(5, 60, 3))}, + cv=3) + + sparse_rbf_model.fit(X, y) + sparse_rbf_params = sparse_rbf_model.best_params_ + print(f"Optimal parameters for sparse RBF kernel: {sparse_rbf_params}") + + knn_model = GridSearchCV(WrapLabelSpreading(kernel='knn', + supervision_fraction=0.05), + param_grid={ + 'n_jobs': [-1], + 'max_iter': [100], + 'alpha': np.linspace(0.01, 0.50, 5), + 'n_neighbors': list(range(5, 60, 3))}, + cv=3) + + knn_model.fit(X, y) + knn_params = knn_model.best_params_ + print(f"Optimal parameters for knn kernel: {knn_params}") + + # Next, we can compare our optimized models on a larger dataset. + n_total = 20000 + X = X_orig[:n_total, :] + y = y_orig[:n_total] + test_fraction = 0.333 + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_fraction, random_state=0) + + supervision_fractions = [0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1] + results = { + 'transduction': {'knn': [], 'sparse-rbf': []}, + 'induction': {'knn': [], 'sparse-rbf': []}, + 'runtimes': {'knn': [], 'sparse-rbf': []} + } + for supervision_fraction in supervision_fractions: + n_train = len(y_train) + n_labeled = int(supervision_fraction * n_train) + indices = np.arange(n_train) + unlabeled_set = indices[n_labeled:] + + y_masked = np.copy(y_train) + y_masked[unlabeled_set] = -1 + + for kernel_name, params in zip(['knn', 'sparse-rbf'], + [knn_params, sparse_rbf_params]): + model = LabelSpreading(kernel=kernel_name, **params) + print("="*80) + print(f"Kernel: {kernel_name}, " + + f"Supervision fraction: {supervision_fraction}") + transductive_accs = [] + inductive_accs = [] + runtimes = [] + + # Repeat each scenario several times to collect rough statistics + for _ in range(3): + t0 = time.time() + model.fit(X_train, y_masked) + + predicted_labels = model.transduction_[unlabeled_set] + true_labels = y_train[unlabeled_set] + transductive_acc = (np.sum(predicted_labels == true_labels) / + len(unlabeled_set)) + transductive_accs.append(transductive_acc) + inductive_acc = model.score(X_test, y_test) + inductive_accs.append(inductive_acc) + t1 = time.time() + runtimes.append(t1-t0) + + mean_t_acc = np.mean(transductive_accs) + mean_i_acc = np.mean(inductive_accs) + mean_runtime = np.mean(runtimes) + + print(f"Mean transductive accuracy: {100 * mean_t_acc:.2f}%, " + + f"Mean inductive accuracy: {100 * mean_i_acc:.2f}%, " + + f"Mean runtime: {mean_runtime:.2f}s") + + results['transduction'][kernel_name].append(mean_t_acc) + results['induction'][kernel_name].append(mean_i_acc) + results['runtimes'][kernel_name].append(mean_runtime) + + print("="*80) + print(f"supervision_fractions: {supervision_fractions}") + pprint(results) -# Save test set for inductive learning -test_fraction = 0.333 -Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=test_fraction, - random_state=0) - -# Mask subset of train data for transductive learning -n_train = len(Ytrain) -#kwargs = {'gamma': 1e-9, 'n_neighbors': 50, 'n_jobs': -1, 'max_iter': 100} - -#models = [LabelSpreading(kernel='knn', **kwargs), -# LabelSpreading(kernel='sparse-rbf', **kwargs)] - 
-#supervision_fractions = [0.001, 0.005, 0.01, 0.05, 0.1] - -# First, we perform a grid search to optimize parameters for sparse-rbf kernel. -# For this purpose, we use a smaller subset of the data. -# Notice also that we class WrapLabelSpreading(LabelSpreading): """ In order to perform a grid search over this semi-supervised model, - we need to provide a thin wrapper that masks a subset of the data before + we need to provide a thin wrapper that masks a subset of the data before `fit` is called. """ def __init__(self, supervision_fraction, kernel='sparse-rbf', gamma=20, - n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=None): + n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=None): self.supervision_fraction = supervision_fraction @@ -79,7 +168,7 @@ def __init__(self, supervision_fraction, kernel='sparse-rbf', gamma=20, def fit(self, X, y): # mask a random subset of labels, based on self.supervision_fraction n_total = len(y) - n_labeled = self.supervision_fraction * n_total + n_labeled = int(self.supervision_fraction * n_total) indices = np.arange(n_total) np.random.seed(0) @@ -88,82 +177,9 @@ def fit(self, X, y): y[unlabeled_subset] = -1 - super().fit(X,y) + super().fit(X, y) return self -# In all cases, we simply use max_iter=100 -sparse_rbf_model = GridSearchCV(WrapLabelSpreading(kernel='sparse-rbf'), - param_grid= { - 'gamma': np.logspace(-8, 1, 10), - 'alpha': np.linspace(0, 1, 10), - 'n_neighbors': list(range(5,55,5))}) - -knn_model = GridSearchCV(WrapLabelSpreading(kernel='knn'), - param_grid= { - 'n_neighbors': list(range(5,55,5))}, - 'alpha': np.linspace(0, 1, 10), - ) - - -# Then, we compare the performance of optimized sparse-rbf kernel to knn kernel -supervision_fractions = [0.05, 0.1] -accuracies = { - 'transduction': { 'knn':[], 'sparse-rbf':[] }, - 'induction': { 'knn':[], 'sparse-rbf':[] } -} -for supervision_fraction in supervision_fractions: - supervision_fraction = 0.05 - n_labeled = int(supervision_fraction * n_train) - indices = np.arange(n_train) - unlabeled_set = indices[n_labeled:] - - Ymasked = np.copy(Ytrain) - Ymasked[unlabeled_set] = -1 - - for kernel_name, model in zip(['knn', 'sparse-rbf'], - [knn_model, sparse_rbf_model]): - knn_acc_trans = [] - knn_acc_ind = [] - sparse_rbf_acc_trans = [] - sparse_rbf_acc_ind = [] - # Repeat each scenario 5 times to collect rough statistics - # for _ in range(5): - print("="*80) - t0 = time.time() - print(f"MODEL: {model}") - model.fit(Xtrain, Ymasked) - t1 = time.time() - - predicted_labels = model.transduction_[unlabeled_set] - true_labels = Ytrain[unlabeled_set] - acc = np.sum(predicted_labels == true_labels) / len(unlabeled_set) - print(f"accuracy: {acc}") - - - - print("-"*80) - print(f"TRANSDUCTION: {n_labeled} labeled and " + - f"{n_train - n_labeled} unlabeled points ({n_train} total)") - print("-"*80) - print("Confusion Matrix:") - print(confusion_matrix(true_labels, predicted_labels, - labels=model.classes_)) - print("-"*80) - print("Classification Report:") - print(classification_report(true_labels, predicted_labels)) - print("-"*80) - - predicted_labels = model.predict(Xtest) - t2 = time.time() - - print("-"*80) - print(f"INDUCTION: {int(test_fraction * n_total)} test points") - print("-"*80) - print("Confusion Matrix:") - print(confusion_matrix(Ytest, predicted_labels, labels=model.classes_)) - print("-"*80) - print("Classification Report:") - print(classification_report(Ytest, predicted_labels)) - print("-"*80) - print(f"Runtimes: Transduction: {t1 - t0:.2f}s. 
Induction: {t2 - t1:.2f}s")

From 741ef070d207c95350a627b9356d89659bcbe80b Mon Sep 17 00:00:00 2001
From: Niklas Smedemark-Margulies 
Date: Wed, 18 Dec 2019 14:30:07 -0500
Subject: [PATCH 6/8] WIP - sparse RBF kernel

---
 .../compare_sparse_kernels_mnist.py           | 194 ++++++++++++------
 sklearn/semi_supervised/_label_propagation.py |   2 +-
 2 files changed, 133 insertions(+), 63 deletions(-)

diff --git a/examples/semi_supervised/compare_sparse_kernels_mnist.py b/examples/semi_supervised/compare_sparse_kernels_mnist.py
index 1d0bb86f301f0..cfb7fffff4af0 100644
--- a/examples/semi_supervised/compare_sparse_kernels_mnist.py
+++ b/examples/semi_supervised/compare_sparse_kernels_mnist.py
@@ -27,6 +27,7 @@
 the model can likely be improved with further optimization, and using this
 kernel effectively on a new dataset requires hyperparameter tuning.
 """
+import matplotlib.pyplot as plt
 import numpy as np
 from pprint import pprint
 from sklearn.datasets import fetch_openml
 from sklearn.model_selection import train_test_split, GridSearchCV
 from sklearn.semi_supervised import LabelSpreading
 import time


def run_grid_search(X, y):
    """
    We perform a grid search to optimize parameters for the sparse-rbf
    kernel. For this purpose, we use a smaller subset of the data. In all
    cases, we simply use max_iter=100. Notice that we are searching over the
    inductive accuracy (accuracy on the test set) rather than the
    transductive accuracy (on the masked training examples). This keeps
    things a bit simpler, though we could customize the score function and
    the `WrapLabelSpreading` class further to also search over the
    transductive accuracy.
    """

    # First, we use a small subset of the data to tune hyperparameters
    n_total = 5000
    X = X[:n_total, :]
    y = y[:n_total]

    test_fraction = 0.333
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_fraction, random_state=0)

    # In order to use GridSearchCV, we will use a thin wrapper class
    # that masks a subset of our training labels.
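    # (Masking sketch, restating the wrapper's behavior for readers: with
    # supervision fraction f, `WrapLabelSpreading` shuffles the sample
    # indices once and sets y[i] = -1, the "unlabeled" marker, for all but
    # the first int(f * len(y)) of them before delegating to the parent fit;
    # see the class definition below.)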
sparse_rbf_model = GridSearchCV( WrapLabelSpreading(kernel='sparse-rbf', supervision_fraction=0.05), param_grid={ 'n_jobs': [-1], 'max_iter': [100], - 'alpha': np.linspace(0.01, 0.50, 5), - 'gamma': np.logspace(-8, 1, 20), - 'n_neighbors': list(range(5, 60, 3))}, + 'alpha': np.linspace(0.01, 0.99, 10), + 'gamma': np.logspace(-8, -4, 10), + 'n_neighbors': list(range(6, 30, 2))}, cv=3) sparse_rbf_model.fit(X, y) sparse_rbf_params = sparse_rbf_model.best_params_ print(f"Optimal parameters for sparse RBF kernel: {sparse_rbf_params}") - knn_model = GridSearchCV(WrapLabelSpreading(kernel='knn', - supervision_fraction=0.05), - param_grid={ - 'n_jobs': [-1], - 'max_iter': [100], - 'alpha': np.linspace(0.01, 0.50, 5), - 'n_neighbors': list(range(5, 60, 3))}, - cv=3) + knn_model = GridSearchCV( + WrapLabelSpreading(kernel='knn', supervision_fraction=0.05), + param_grid={ + 'n_jobs': [-1], + 'max_iter': [100], + 'alpha': np.linspace(0.01, 0.99, 10), + 'n_neighbors': list(range(6, 30, 2))}, + cv=3) knn_model.fit(X, y) knn_params = knn_model.best_params_ print(f"Optimal parameters for knn kernel: {knn_params}") + return n_total, sparse_rbf_params, knn_params + +def run_comparison(X, y, sparse_rbf_params, knn_params, n_skip): + print("Begin comparison...") # Next, we can compare our optimized models on a larger dataset. - n_total = 20000 - X = X_orig[:n_total, :] - y = y_orig[:n_total] + n_total = 35000 + X = X[n_skip:n_total+n_skip, :] + y = y[n_skip:n_total+n_skip] test_fraction = 0.333 + + print("Train/Test split...") X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=test_fraction, random_state=0) supervision_fractions = [0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1] results = { - 'transduction': {'knn': [], 'sparse-rbf': []}, - 'induction': {'knn': [], 'sparse-rbf': []}, - 'runtimes': {'knn': [], 'sparse-rbf': []} + 'transduction': { + 'knn': {'avg': [], 'std': []}, + 'sparse-rbf': {'avg': [], 'std': []}}, + 'induction': { + 'knn': {'avg': [], 'std': []}, + 'sparse-rbf': {'avg': [], 'std': []}}, + 'runtimes': { + 'knn': {'avg': [], 'std': []}, + 'sparse-rbf': {'avg': [], 'std': []}} } - for supervision_fraction in supervision_fractions: - n_train = len(y_train) - n_labeled = int(supervision_fraction * n_train) - indices = np.arange(n_train) - unlabeled_set = indices[n_labeled:] - - y_masked = np.copy(y_train) - y_masked[unlabeled_set] = -1 + rng = np.random.RandomState(0) + for supervision_fraction in supervision_fractions: for kernel_name, params in zip(['knn', 'sparse-rbf'], [knn_params, sparse_rbf_params]): model = LabelSpreading(kernel=kernel_name, **params) print("="*80) print(f"Kernel: {kernel_name}, " + f"Supervision fraction: {supervision_fraction}") - transductive_accs = [] - inductive_accs = [] - runtimes = [] # Repeat each scenario several times to collect rough statistics - for _ in range(3): + t_accs = [] + i_accs = [] + runtimes = [] + for _ in range(5): + n_train = len(y_train) + n_labeled = int(supervision_fraction * n_train) + indices = np.arange(n_train) + rng.shuffle(indices) + unlabeled_set = indices[n_labeled:] + + y_masked = np.copy(y_train) + y_masked[unlabeled_set] = -1 + t0 = time.time() model.fit(X_train, y_masked) predicted_labels = model.transduction_[unlabeled_set] true_labels = y_train[unlabeled_set] - transductive_acc = (np.sum(predicted_labels == true_labels) / - len(unlabeled_set)) - transductive_accs.append(transductive_acc) - inductive_acc = model.score(X_test, y_test) - inductive_accs.append(inductive_acc) + t_acc = (np.sum(predicted_labels == 
true_labels) / + len(unlabeled_set)) + t_accs.append(t_acc) + i_accs.append(model.score(X_test, y_test)) t1 = time.time() runtimes.append(t1-t0) - mean_t_acc = np.mean(transductive_accs) - mean_i_acc = np.mean(inductive_accs) - mean_runtime = np.mean(runtimes) - - print(f"Mean transductive accuracy: {100 * mean_t_acc:.2f}%, " + - f"Mean inductive accuracy: {100 * mean_i_acc:.2f}%, " + - f"Mean runtime: {mean_runtime:.2f}s") - - results['transduction'][kernel_name].append(mean_t_acc) - results['induction'][kernel_name].append(mean_i_acc) - results['runtimes'][kernel_name].append(mean_runtime) + results['transduction'][kernel_name]['avg'].append(np.mean(t_accs)) + results['transduction'][kernel_name]['std'].append(np.std(t_accs)) + results['induction'][kernel_name]['avg'].append(np.mean(i_accs)) + results['induction'][kernel_name]['std'].append(np.std(i_accs)) + results['runtimes'][kernel_name]['avg'].append(np.mean(runtimes)) + results['runtimes'][kernel_name]['std'].append(np.std(runtimes)) print("="*80) print(f"supervision_fractions: {supervision_fractions}") pprint(results) + return supervision_fractions, results + + +def plot_results(supervision_fractions, results): + fig, ax = plt.subplots(3, 1, figsize=(16, 9)) + for i, (label, ylabel) in enumerate(zip( + ['induction', 'transduction', 'runtimes'], + ['% Accuracy', '% Accuracy', 'Duration (s)'])): + + S_avg = results[label]['sparse-rbf']['avg'] + S_std = results[label]['sparse-rbf']['std'] + + K_avg = results[label]['knn']['avg'] + K_std = results[label]['knn']['std'] + + ax[i].scatter(supervision_fractions, S_avg, c='b', label='sparse-rbf') + ax[i].scatter(supervision_fractions, K_avg, c='r', label='knn') + ax[i].set_xscale('log') + ax[i].set_xlim([8e-4, 1.3e-1]) + ax[i].set_title(f'{label.capitalize()}') + ax[i].set_xlabel('Supervision Fraction') + ax[i].fill_between(supervision_fractions, + [a - b for a, b in zip(S_avg, S_std)], + [a + b for a, b in zip(S_avg, S_std)], + facecolor='b', alpha=0.2) + ax[i].fill_between(supervision_fractions, + [a + b for a, b in zip(K_avg, K_std)], + [a - b for a, b in zip(K_avg, K_std)], + facecolor='r', alpha=0.2) + + plt.tight_layout() + plt.savefig('sparse_kernel_comparison.png') class WrapLabelSpreading(LabelSpreading): """ In order to perform a grid search over this semi-supervised model, - we need to provide a thin wrapper that masks a subset of the data before + we need to provide a wrapper that masks a subset of the data before `fit` is called. 
""" def __init__(self, supervision_fraction, kernel='sparse-rbf', gamma=20, @@ -182,4 +224,32 @@ def fit(self, X, y): if __name__ == '__main__': - run_comparison() + X, y = fetch_openml('mnist_784', version=1, return_X_y=True) + y = y.astype(int) + + # Set this flag to run the grid search, which takes several hours + do_grid_search = False + + if do_grid_search: + n_skip, sparse_rbf_params, knn_params = run_grid_search(X, y) + else: + # Values found from running grid search previously + sparse_rbf_params = { + 'alpha': 0.663, + 'gamma': 2.154e-7, + 'max_iter': 100, + 'n_jobs': -1, + 'n_neighbors': 20} + knn_params = { + 'alpha': 0.772, + 'max_iter': 100, + 'n_jobs': -1, + 'n_neighbors': 6} + n_skip = 0 + + supervision_fractions, results = run_comparison( + X, y, sparse_rbf_params=sparse_rbf_params, + knn_params=knn_params, + n_skip=n_skip) + + plot_results(supervision_fractions, results) diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 7fd5ec7d2d345..5013e0a69037b 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -136,7 +136,7 @@ def _get_kernel(self, X, y=None): W = self.nn_fit.kneighbors_graph(y, mode='distance').T.power(2) W *= -1 * self.gamma np.exp(W.data, out=W.data) - # explicitly set diagonal, + # explicitly set diagonal, # since np.exp(W.data) does not modify zeros on the diagonal W.setdiag(1) return W From ef0703d46318fcc7c77abd4ad0647f960d9a998e Mon Sep 17 00:00:00 2001 From: Niklas Smedemark-Margulies Date: Wed, 18 Dec 2019 17:32:23 -0500 Subject: [PATCH 7/8] WIP - sparse RBF kernel --- .../compare_sparse_kernels_mnist.py | 93 ++++++++++++------- 1 file changed, 59 insertions(+), 34 deletions(-) diff --git a/examples/semi_supervised/compare_sparse_kernels_mnist.py b/examples/semi_supervised/compare_sparse_kernels_mnist.py index cfb7fffff4af0..67ec2a8d4f3fd 100644 --- a/examples/semi_supervised/compare_sparse_kernels_mnist.py +++ b/examples/semi_supervised/compare_sparse_kernels_mnist.py @@ -33,10 +33,11 @@ from sklearn.datasets import fetch_openml from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.semi_supervised import LabelSpreading +from torchvision.datasets import CIFAR10 import time -def run_grid_search(X, y): +def run_grid_search(X_train, X_test, y_train, y_test): """ We perform a grid search to optimize parameters for sparse-rbf kernel. For this purpose, we use a smaller subset of the data. In all @@ -47,16 +48,6 @@ def run_grid_search(X, y): the `WrapLabelSpreading` class further to also hyperparameter search over the transductive accuracy. """ - - # First, we use a small subset of the data to tune hyperparameters - n_total = 5000 - X = X[:n_total, :] - y = y[:n_total] - - test_fraction = 0.333 - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=test_fraction, random_state=0) - # In order to use GridSearchCV, we will use a thin wrapper class # that masks a subset of our training labels. 
sparse_rbf_model = GridSearchCV( @@ -69,7 +60,8 @@ def run_grid_search(X, y): 'n_neighbors': list(range(6, 30, 2))}, cv=3) - sparse_rbf_model.fit(X, y) + sparse_rbf_model.fit(np.vstack((X_train, X_test)), + np.concatenate((y_train, y_test))) sparse_rbf_params = sparse_rbf_model.best_params_ print(f"Optimal parameters for sparse RBF kernel: {sparse_rbf_params}") @@ -82,24 +74,16 @@ def run_grid_search(X, y): 'n_neighbors': list(range(6, 30, 2))}, cv=3) - knn_model.fit(X, y) + knn_model.fit(np.vstack((X_train, X_test)), + np.concatenate((y_train, y_test))) knn_params = knn_model.best_params_ print(f"Optimal parameters for knn kernel: {knn_params}") - return n_total, sparse_rbf_params, knn_params + return sparse_rbf_params, knn_params -def run_comparison(X, y, sparse_rbf_params, knn_params, n_skip): +def run_comparison(X_train, X_test, y_train, y_test, + sparse_rbf_params, knn_params): print("Begin comparison...") - # Next, we can compare our optimized models on a larger dataset. - n_total = 35000 - X = X[n_skip:n_total+n_skip, :] - y = y[n_skip:n_total+n_skip] - test_fraction = 0.333 - - print("Train/Test split...") - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=test_fraction, random_state=0) - supervision_fractions = [0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1] results = { 'transduction': { @@ -224,16 +208,54 @@ def fit(self, X, y): if __name__ == '__main__': - X, y = fetch_openml('mnist_784', version=1, return_X_y=True) - y = y.astype(int) - + # Choose the dataset + dataset = 'mnist' + # Set the fraction of data to use for hyperparam tuning + hyperp_tune_fraction = 0.1 + # Set the fraction of data to use for the final comparison + compare_fraction = 0.1 # Set this flag to run the grid search, which takes several hours do_grid_search = False + if dataset == 'mnist': + X, y = fetch_openml('mnist_784', version=1, return_X_y=True) + y = y.astype(int) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=10000, random_state=0) + + elif dataset == 'cifar10': + cifar10 = CIFAR10('.', download=True, train=True) + X_train = cifar10.data.reshape(-1, 3*32*32) + y_train = np.array(cifar10.targets) + + cifar10 = CIFAR10('.', download=True, train=False) + X_test = cifar10.data.reshape(-1, 3*32*32) + y_test = np.array(cifar10.targets) + else: + raise ValueError(f"dataset {dataset} not supported") + + print("Full dataset sizes: " + + f"\nX_train {X_train.shape}" + + f"\nX_test {X_test.shape}" + + f"\ny_train {y_train.shape}" + + f"\ny_test {y_test.shape}") + if do_grid_search: - n_skip, sparse_rbf_params, knn_params = run_grid_search(X, y) + # First, we use a small subset of the data to tune hyperparameters + tr_tune = int(hyperp_tune_fraction * len(y_train)) + te_tune = int(hyperp_tune_fraction * len(y_test)) + print("# items for hyperparam tuning:" + + f"train: {tr_tune}, test: {te_tune}") + sparse_rbf_params, knn_params = run_grid_search( + X_train[:tr_tune, :], + X_test[:te_tune, :], + y_train[:tr_tune], + y_test[:te_tune]) else: - # Values found from running grid search previously + # Values found from running grid search previously on MNIST + tr_tune = 0 + te_tune = 0 sparse_rbf_params = { 'alpha': 0.663, 'gamma': 2.154e-7, @@ -245,11 +267,14 @@ def fit(self, X, y): 'max_iter': 100, 'n_jobs': -1, 'n_neighbors': 6} - n_skip = 0 + # Skip the items used for hyperparam tuning + tr_comp = int(compare_fraction * len(y_train)) + tr_tune + te_comp = int(compare_fraction * len(y_test)) + te_tune supervision_fractions, results = run_comparison( - X, y, 
sparse_rbf_params=sparse_rbf_params, - knn_params=knn_params, - n_skip=n_skip) + X_train[tr_tune:tr_comp, :], X_test[te_tune:te_comp, :], + y_train[tr_tune:tr_comp], y_test[te_tune:te_comp], + sparse_rbf_params=sparse_rbf_params, + knn_params=knn_params) plot_results(supervision_fractions, results) From 12610aa321b88d76680858892bc6552959fdae3f Mon Sep 17 00:00:00 2001 From: Niklas Smedemark-Margulies Date: Wed, 18 Dec 2019 18:43:42 -0500 Subject: [PATCH 8/8] Fix plot legend and use 20% data --- examples/semi_supervised/compare_sparse_kernels_mnist.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/semi_supervised/compare_sparse_kernels_mnist.py b/examples/semi_supervised/compare_sparse_kernels_mnist.py index 67ec2a8d4f3fd..7898d889c2d0e 100644 --- a/examples/semi_supervised/compare_sparse_kernels_mnist.py +++ b/examples/semi_supervised/compare_sparse_kernels_mnist.py @@ -163,6 +163,8 @@ def plot_results(supervision_fractions, results): ax[i].set_xlim([8e-4, 1.3e-1]) ax[i].set_title(f'{label.capitalize()}') ax[i].set_xlabel('Supervision Fraction') + ax[i].set_ylabel(ylabel) + ax[i].legend() ax[i].fill_between(supervision_fractions, [a - b for a, b in zip(S_avg, S_std)], [a + b for a, b in zip(S_avg, S_std)], @@ -213,7 +215,7 @@ def fit(self, X, y): # Set the fraction of data to use for hyperparam tuning hyperp_tune_fraction = 0.1 # Set the fraction of data to use for the final comparison - compare_fraction = 0.1 + compare_fraction = 0.2 # Set this flag to run the grid search, which takes several hours do_grid_search = False
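
Note for reviewers: the 'sparse-rbf' kernel added by this series can be
exercised with a minimal sketch along the following lines. This is a rough
check, assuming a build of this branch; gamma and n_neighbors are
illustrative values taken from the tests above, not tuned:

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.semi_supervised import LabelSpreading

    X, y = make_classification(n_classes=4, n_samples=500, n_features=20,
                               n_informative=20, n_redundant=0,
                               random_state=0)

    # Mark all but the first 50 points as unlabeled (-1 is the convention
    # used by the semi_supervised estimators)
    y_masked = np.copy(y)
    y_masked[50:] = -1

    # 'sparse-rbf' builds a k-sparse RBF affinity instead of a dense one
    model = LabelSpreading(kernel='sparse-rbf', gamma=1e-5, n_neighbors=10)
    model.fit(X, y_masked)

    # Transductive accuracy on the points whose labels were masked
    print(np.mean(model.transduction_[50:] == y[50:]))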