From 891792c4510a6ed9490004b1a589e4a186626b0a Mon Sep 17 00:00:00 2001 From: Niklas Smedemark-Margulies Date: Wed, 11 Dec 2019 20:23:00 -0500 Subject: [PATCH 1/8] FIX use safe_sparse_dot for callable kernel in LabelSpreading (#15866) --- sklearn/semi_supervised/_label_propagation.py | 3 +- .../tests/test_label_propagation.py | 39 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 0ec687aae7d20..665b50dcfa507 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -195,7 +195,8 @@ class labels for weight_matrix in weight_matrices]) else: weight_matrices = weight_matrices.T - probabilities = np.dot(weight_matrices, self.label_distributions_) + probabilities = safe_sparse_dot( + weight_matrices, self.label_distributions_) normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T probabilities /= normalizer return probabilities diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 7e20350b20b2f..d95dea2522a3d 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -3,10 +3,12 @@ import numpy as np import pytest +from scipy.sparse import csr_matrix from sklearn.utils._testing import assert_warns from sklearn.utils._testing import assert_no_warnings from sklearn.semi_supervised import _label_propagation as label_propagation from sklearn.metrics.pairwise import rbf_kernel +from sklearn.neighbors import NearestNeighbors from sklearn.datasets import make_classification from sklearn.exceptions import ConvergenceWarning from numpy.testing import assert_array_almost_equal @@ -152,3 +154,40 @@ def test_convergence_warning(): mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=500) assert_no_warnings(mdl.fit, X, y) + + +def test_predict_sparse_callable_kernel(): + # This is a non-regression test for #15866 + + # Custom sparse kernel (top-K RBF) + def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5): + nn = NearestNeighbors(n_neighbors=10, metric='euclidean', n_jobs=-1) + nn.fit(X) + W = -1 * nn.kneighbors_graph(Y, mode='distance').power(2) * gamma + np.exp(W.data, out=W.data) + assert isinstance(W, csr_matrix) + return W.T + + n_classes = 4 + n_samples = 500 + n_test = 10 + X, Y = make_classification(n_classes=n_classes, + n_samples=n_samples, + n_features=20, + n_informative=20, + n_redundant=0, + n_repeated=0, + random_state=0) + + Xtrain = X[:n_samples - n_test] + Ytrain = Y[:n_samples - n_test] + Xtest = X[n_samples - n_test:] + Ytest = Y[n_samples - n_test:] + + model = label_propagation.LabelSpreading(kernel=topk_rbf) + model.fit(Xtrain, Ytrain) + + Ypred = model.predict(Xtest) + n_correct = np.sum(Ypred == Ytest) + + assert n_correct >= 0.9 * n_test From 3bd7e9f0e4874db824dd20803568256580c70476 Mon Sep 17 00:00:00 2001 From: Niklas Smedemark-Margulies Date: Thu, 12 Dec 2019 11:21:29 -0500 Subject: [PATCH 2/8] FIX use safe_sparse_dot for callable kernel in LabelSpreading (#15866) --- doc/whats_new/v0.22.rst | 8 ++++++++ .../semi_supervised/tests/test_label_propagation.py | 12 ++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index af08b832e9f6f..ccf93030d1edf 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -795,6 +795,14 @@ Changelog :pr:`13925` by 
:user:`Isaac S. Robson ` and :pr:`15524` by
 :user:`Xun Tang `.
 
+:mod:`sklearn.semi_supervised`
+.............................
+
+- |Fix| :class:`semi_supervised.LabelPropagation` and
+  :class:`semi_supervised.LabelSpreading` now allow a callable kernel
+  function to return a sparse weight matrix.
+  :pr:`15868` by :user:`Niklas Smedemark-Margulies `.
+
 :mod:`sklearn.svm`
 ..................
 
diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py
index d95dea2522a3d..219457f9edcf6 100644
--- a/sklearn/semi_supervised/tests/test_label_propagation.py
+++ b/sklearn/semi_supervised/tests/test_label_propagation.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 import pytest
-from scipy.sparse import csr_matrix
+from scipy.sparse import issparse
 from sklearn.utils._testing import assert_warns
 from sklearn.utils._testing import assert_no_warnings
 from sklearn.semi_supervised import _label_propagation as label_propagation
 from sklearn.metrics.pairwise import rbf_kernel
@@ -165,7 +165,7 @@ def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5):
         nn.fit(X)
         W = -1 * nn.kneighbors_graph(Y, mode='distance').power(2) * gamma
         np.exp(W.data, out=W.data)
-        assert isinstance(W, csr_matrix)
+        assert issparse(W)
         return W.T
 
     n_classes = 4
@@ -191,3 +191,11 @@ def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5):
     n_correct = np.sum(Ypred == Ytest)
 
     assert n_correct >= 0.9 * n_test
+
+    model = label_propagation.LabelPropagation(kernel=topk_rbf)
+    model.fit(Xtrain, Ytrain)
+
+    Ypred = model.predict(Xtest)
+    n_correct = np.sum(Ypred == Ytest)
+
+    assert n_correct >= 0.9 * n_test

From b8bc7d8f323ef1bc62e600726428ecbf77709199 Mon Sep 17 00:00:00 2001
From: Niklas Smedemark-Margulies 
Date: Thu, 12 Dec 2019 11:43:03 -0500
Subject: [PATCH 3/8] FIX use safe_sparse_dot for callable kernel in
 LabelSpreading (#15866)

---
 doc/whats_new/v0.22.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
index ccf93030d1edf..104a3f3adc3df 100644
--- a/doc/whats_new/v0.22.rst
+++ b/doc/whats_new/v0.22.rst
@@ -796,7 +796,7 @@ Changelog
 :user:`Xun Tang `.
 
 :mod:`sklearn.semi_supervised`
-.............................
+..............................
 
 - |Fix| :class:`semi_supervised.LabelPropagation` and
   :class:`semi_supervised.LabelSpreading` now allow a callable kernel

From da586279291228112f10e909da504158b028370a Mon Sep 17 00:00:00 2001
From: Niklas Smedemark-Margulies 
Date: Fri, 13 Dec 2019 15:49:45 -0500
Subject: [PATCH 4/8] WIP - sparse RBF kernel

---
 .../compare_sparse_kernels_mnist.py           | 169 ++++++++++++++++++
 sklearn/semi_supervised/_label_propagation.py |  62 ++++---
 .../tests/test_label_propagation.py           |  96 +++++++++-
 3 files changed, 304 insertions(+), 23 deletions(-)
 create mode 100644 examples/semi_supervised/compare_sparse_kernels_mnist.py

diff --git a/examples/semi_supervised/compare_sparse_kernels_mnist.py b/examples/semi_supervised/compare_sparse_kernels_mnist.py
new file mode 100644
index 0000000000000..f633102adcbf9
--- /dev/null
+++ b/examples/semi_supervised/compare_sparse_kernels_mnist.py
@@ -0,0 +1,169 @@
+"""
+=================================================
+Label Propagation MNIST: Comparing Sparse Kernels
+=================================================
+
+This example compares the runtime and performance of two sparse kernels for
+semi-supervised learning on the MNIST digit dataset.
+
+The MNIST dataset consists of 28x28 pixel grayscale images. Here, we will use a
+subset of 10K images, reserving a fraction of these for testing.
We will +compare the performance and runtime of two sparse kernels, across a range of +low-supervision scenarios. + +In each scenario, we will run each model multiple times, to increase our +confidence in the comparison between kernels. + +The models will be evaluated for their accuracy at spreading labels during +training ("transductive learning"), as well as spreading labels to unseen +points at test time ("inductive learning"). + +The first kernel option produces a binary k-Nearest Neighbors adjacency matrix. +The second produces a kernel which is also k-sparse, but contains the same +weights as used in an RBF kernel. + +Notice that the performance of the sparse-RBF kernel is very sensitive to +parameters; the parameters used here were found by a quick manual search, so +the model can likely be improved with further optimization, and using this +kernel effectively on a new dataset requires hyperparameter tuning. +""" +import numpy as np +from sklearn.datasets import fetch_openml +from sklearn.semi_supervised import LabelSpreading +from sklearn.metrics import classification_report, confusion_matrix +from sklearn.model_selection import train_test_split +from sklearn.metrics import make_scorer +import time + +Xorig, Yorig = fetch_openml('mnist_784', version=1, return_X_y=True) +Yorig = Y.astype(int) + +# For a quick demonstration, use only a subset of the data +n_total = 10000 +X = Xorig[:n_total, :] +Y = Yorig[:n_total] + +# Save test set for inductive learning +test_fraction = 0.333 +Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=test_fraction, + random_state=0) + +# Mask subset of train data for transductive learning +n_train = len(Ytrain) +#kwargs = {'gamma': 1e-9, 'n_neighbors': 50, 'n_jobs': -1, 'max_iter': 100} + +#models = [LabelSpreading(kernel='knn', **kwargs), +# LabelSpreading(kernel='sparse-rbf', **kwargs)] + +#supervision_fractions = [0.001, 0.005, 0.01, 0.05, 0.1] + +# First, we perform a grid search to optimize parameters for sparse-rbf kernel. +# For this purpose, we use a smaller subset of the data. +# Notice also that we + +class WrapLabelSpreading(LabelSpreading): + """ + In order to perform a grid search over this semi-supervised model, + we need to provide a thin wrapper that masks a subset of the data before + `fit` is called. 
+ """ + def __init__(self, supervision_fraction, kernel='sparse-rbf', gamma=20, + n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=None): + + self.supervision_fraction = supervision_fraction + + super().__init__(kernel=kernel, gamma=gamma, + n_neighbors=n_neighbors, alpha=alpha, + max_iter=max_iter, tol=tol, n_jobs=n_jobs) + + def fit(self, X, y): + # mask a random subset of labels, based on self.supervision_fraction + n_total = len(y) + n_labeled = self.supervision_fraction * n_total + + indices = np.arange(n_total) + np.random.seed(0) + np.random.shuffle(indices) + unlabeled_subset = indices[n_labeled:] + + y[unlabeled_subset] = -1 + + super().fit(X,y) + return self + + +# In all cases, we simply use max_iter=100 +sparse_rbf_model = GridSearchCV(WrapLabelSpreading(kernel='sparse-rbf'), + param_grid= { + 'gamma': np.logspace(-8, 1, 10), + 'alpha': np.linspace(0, 1, 10), + 'n_neighbors': list(range(5,55,5))}) + +knn_model = GridSearchCV(WrapLabelSpreading(kernel='knn'), + param_grid= { + 'n_neighbors': list(range(5,55,5))}, + 'alpha': np.linspace(0, 1, 10), + ) + + +# Then, we compare the performance of optimized sparse-rbf kernel to knn kernel +supervision_fractions = [0.05, 0.1] +accuracies = { + 'transduction': { 'knn':[], 'sparse-rbf':[] }, + 'induction': { 'knn':[], 'sparse-rbf':[] } +} +for supervision_fraction in supervision_fractions: + supervision_fraction = 0.05 + n_labeled = int(supervision_fraction * n_train) + indices = np.arange(n_train) + unlabeled_set = indices[n_labeled:] + + Ymasked = np.copy(Ytrain) + Ymasked[unlabeled_set] = -1 + + for kernel_name, model in zip(['knn', 'sparse-rbf'], + [knn_model, sparse_rbf_model]): + knn_acc_trans = [] + knn_acc_ind = [] + sparse_rbf_acc_trans = [] + sparse_rbf_acc_ind = [] + # Repeat each scenario 5 times to collect rough statistics + # for _ in range(5): + print("="*80) + t0 = time.time() + print(f"MODEL: {model}") + model.fit(Xtrain, Ymasked) + t1 = time.time() + + predicted_labels = model.transduction_[unlabeled_set] + true_labels = Ytrain[unlabeled_set] + acc = np.sum(predicted_labels == true_labels) / len(unlabeled_set) + print(f"accuracy: {acc}") + + + + print("-"*80) + print(f"TRANSDUCTION: {n_labeled} labeled and " + + f"{n_train - n_labeled} unlabeled points ({n_train} total)") + print("-"*80) + print("Confusion Matrix:") + print(confusion_matrix(true_labels, predicted_labels, + labels=model.classes_)) + print("-"*80) + print("Classification Report:") + print(classification_report(true_labels, predicted_labels)) + print("-"*80) + + predicted_labels = model.predict(Xtest) + t2 = time.time() + + print("-"*80) + print(f"INDUCTION: {int(test_fraction * n_total)} test points") + print("-"*80) + print("Confusion Matrix:") + print(confusion_matrix(Ytest, predicted_labels, labels=model.classes_)) + print("-"*80) + print("Classification Report:") + print(classification_report(Ytest, predicted_labels)) + print("-"*80) + print(f"Runtimes: Transduction: {t1 - t0:.2f}s. Induction: {t2 - t1:.2f}s") diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 665b50dcfa507..7fd5ec7d2d345 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -23,10 +23,10 @@ Kernel: A function which projects a vector into some higher dimensional space. This - implementation supports RBF and KNN kernels. Using the RBF kernel generates - a dense matrix of size O(N^2). 
KNN kernel will generate a sparse matrix of
-  size O(k*N) which will run much faster. See the documentation for SVMs for
-  more info on kernels.
+  implementation supports RBF, Sparse-RBF, and KNN kernels. Using the RBF
+  kernel generates a dense matrix of size O(N^2). Sparse-RBF and KNN kernels
+  will generate a sparse matrix of size O(k*N), which will run much faster.
+  See the documentation for SVMs for more info on kernels.
 
 Examples
 --------
@@ -76,17 +76,19 @@ class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
 
     Parameters
     ----------
-    kernel : {'knn', 'rbf', callable}
+    kernel : {'knn', 'rbf', 'sparse-rbf', callable}
         String identifier for kernel function to use or the kernel function
-        itself. Only 'rbf' and 'knn' strings are valid inputs. The function
-        passed should take two inputs, each of shape [n_samples, n_features],
-        and return a [n_samples, n_samples] shaped weight matrix
+        itself. Only 'rbf', 'sparse-rbf', and 'knn' strings are valid inputs;
+        'sparse-rbf' calculates RBF weights for only the closest 'n_neighbors'
+        points. The callable function passed should take two inputs, each of
+        shape [n_samples, n_features], and return a [n_samples, n_samples]
+        shaped weight matrix.
 
     gamma : float
-        Parameter for rbf kernel
+        Parameter for rbf or sparse-rbf kernel
 
     n_neighbors : integer > 0
-        Parameter for knn kernel
+        Parameter for knn or sparse-rbf kernel
 
     alpha : float
         Clamping factor
@@ -127,6 +129,17 @@ def _get_kernel(self, X, y=None):
                 return rbf_kernel(X, X, gamma=self.gamma)
             else:
                 return rbf_kernel(X, y, gamma=self.gamma)
+        elif self.kernel == "sparse-rbf":
+            self.nn_fit = NearestNeighbors(self.n_neighbors,
+                                           n_jobs=self.n_jobs).fit(X)
+            # rbf(x1, x2) = exp(-gamma * ||x1 - x2||^2)
+            W = self.nn_fit.kneighbors_graph(y, mode='distance').T.power(2)
+            W *= -1 * self.gamma
+            np.exp(W.data, out=W.data)
+            # explicitly set diagonal,
+            # since np.exp(W.data) does not modify zeros on the diagonal
+            # (training case only: in the rectangular train-by-test graph
+            # built at predict time the diagonal pairs unrelated points)
+            if y is None:
+                W.setdiag(1)
+            return W
         elif self.kernel == "knn":
             if self.nn_fit is None:
                 self.nn_fit = NearestNeighbors(self.n_neighbors,
@@ -306,17 +319,19 @@ class LabelPropagation(BaseLabelPropagation):
 
     Parameters
     ----------
-    kernel : {'knn', 'rbf', callable}
+    kernel : {'knn', 'rbf', 'sparse-rbf', callable}
         String identifier for kernel function to use or the kernel function
-        itself. Only 'rbf' and 'knn' strings are valid inputs. The function
-        passed should take two inputs, each of shape [n_samples, n_features],
-        and return a [n_samples, n_samples] shaped weight matrix.
+        itself. Only 'rbf', 'sparse-rbf', and 'knn' strings are valid inputs;
+        'sparse-rbf' calculates RBF weights for only the closest 'n_neighbors'
+        points. The callable function passed should take two inputs, each of
+        shape [n_samples, n_features], and return a [n_samples, n_samples]
+        shaped weight matrix.
 
     gamma : float
-        Parameter for rbf kernel
+        Parameter for rbf or sparse-rbf kernel
 
     n_neighbors : integer > 0
-        Parameter for knn kernel
+        Parameter for knn or sparse-rbf kernel
 
     max_iter : integer
         Change maximum number of iterations allowed
@@ -412,17 +427,20 @@ class LabelSpreading(BaseLabelPropagation):
 
     Parameters
     ----------
-    kernel : {'knn', 'rbf', callable}
+    kernel : {'knn', 'rbf', 'sparse-rbf', callable}
         String identifier for kernel function to use or the kernel function
         itself.
Only 'rbf', 'sparse-rbf', and 'knn' strings are valid inputs; + 'sparse-rbf' calculates RBF weights for only the closest 'n_neighbors' + points. The callable function passed should take two inputs, each of + shape [n_samples, n_features], and return a [n_samples, n_samples] + shaped weight matrix. gamma : float - parameter for rbf kernel + Parameter for rbf or sparse-rbf kernel n_neighbors : integer > 0 - parameter for knn kernel + Parameter for knn or sparse-rbf kernel + alpha : float Clamping factor. A value in (0, 1) that specifies the relative amount diff --git a/sklearn/semi_supervised/tests/test_label_propagation.py b/sklearn/semi_supervised/tests/test_label_propagation.py index 219457f9edcf6..1f3879b339aea 100644 --- a/sklearn/semi_supervised/tests/test_label_propagation.py +++ b/sklearn/semi_supervised/tests/test_label_propagation.py @@ -17,11 +17,15 @@ ESTIMATORS = [ (label_propagation.LabelPropagation, {'kernel': 'rbf'}), (label_propagation.LabelPropagation, {'kernel': 'knn', 'n_neighbors': 2}), + (label_propagation.LabelPropagation, { + 'kernel': 'sparse-rbf', 'gamma': 1e-4, 'n_neighbors': 2}), (label_propagation.LabelPropagation, { 'kernel': lambda x, y: rbf_kernel(x, y, gamma=20) }), (label_propagation.LabelSpreading, {'kernel': 'rbf'}), (label_propagation.LabelSpreading, {'kernel': 'knn', 'n_neighbors': 2}), + (label_propagation.LabelSpreading, { + 'kernel': 'sparse-rbf', 'gamma': 1e-4, 'n_neighbors': 2}), (label_propagation.LabelSpreading, { 'kernel': lambda x, y: rbf_kernel(x, y, gamma=20) }), @@ -64,7 +68,7 @@ def test_predict_proba(): for estimator, parameters in ESTIMATORS: clf = estimator(**parameters).fit(samples, labels) assert_array_almost_equal(clf.predict_proba([[1., 1.]]), - np.array([[0.5, 0.5]])) + np.array([[0.5, 0.5]]), 4) def test_label_spreading_closed_form(): @@ -199,3 +203,93 @@ def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5): n_correct = np.sum(Ypred == Ytest) assert n_correct >= 0.9 * n_test + + +def test_sparse_rbf_kernel(): + n_classes = 4 + n_samples = 500 + n_test = 10 + X, Y = make_classification(n_classes=n_classes, + n_samples=n_samples, + n_features=20, + n_informative=20, + n_redundant=0, + n_repeated=0, + random_state=0) + + Xtrain = X[:n_samples - n_test] + Ytrain = Y[:n_samples - n_test] + Xtest = X[n_samples - n_test:] + Ytest = Y[n_samples - n_test:] + + model = label_propagation.LabelSpreading(kernel='sparse-rbf', gamma=1e-5) + model.fit(Xtrain, Ytrain) + + Ypred = model.predict(Xtest) + n_correct = np.sum(Ypred == Ytest) + + assert n_correct >= 0.9 * n_test + + model = label_propagation.LabelPropagation(kernel='sparse-rbf', gamma=1e-5) + model.fit(Xtrain, Ytrain) + + Ypred = model.predict(Xtest) + n_correct = np.sum(Ypred == Ytest) + + assert n_correct >= 0.9 * n_test + + +def test_sparse_rbf_kernel_agrees_with_dense(): + + n_classes = 4 + n_samples = 500 + X, Y = make_classification(n_classes=n_classes, + n_samples=n_samples, + n_features=20, + n_informative=20, + n_redundant=0, + n_repeated=0, + random_state=0) + + gamma = 1e-5 + n_neighbors = 10 + + # Check LabelSpreading + # Make dense RBF kernel + dense_train = (label_propagation + .LabelSpreading(kernel='rbf', gamma=gamma) + ._get_kernel(X)) + # Keep top k+1 per column. 
(k neighbors + 1 for self) + ind = np.argpartition( + dense_train, kth=-(n_neighbors+1), axis=0)[:-(n_neighbors+1), :] + np.put_along_axis(dense_train, ind, 0, axis=0) + + # Make column-sparse RBF kernel + sparse_train = (label_propagation + .LabelSpreading(kernel='sparse-rbf', + gamma=gamma, + n_neighbors=n_neighbors) + ._get_kernel(X) + .toarray()) + + assert_array_almost_equal(dense_train, sparse_train) + + # Check LabelPropagation + # Make dense RBF kernel + dense_train = (label_propagation + .LabelPropagation(kernel='rbf', gamma=gamma) + ._get_kernel(X)) + # Keep top k+1 per column. (k neighbors + 1 for self) + ind = np.argpartition( + dense_train, kth=-(n_neighbors+1), axis=0)[:-(n_neighbors+1), :] + np.put_along_axis(dense_train, ind, 0, axis=0) + + # Make column-sparse RBF kernel + sparse_train = (label_propagation + .LabelPropagation(kernel='sparse-rbf', + gamma=gamma, + n_neighbors=n_neighbors) + ._get_kernel(X) + .toarray()) + + assert_array_almost_equal(dense_train, sparse_train) From 57df7a2a016d10b7fa6488a4469a2e22bcef2112 Mon Sep 17 00:00:00 2001 From: Niklas Smedemark-Margulies Date: Fri, 13 Dec 2019 18:06:53 -0500 Subject: [PATCH 5/8] WIP - sparse RBF kernel --- .../compare_sparse_kernels_mnist.py | 228 ++++++++++-------- 1 file changed, 122 insertions(+), 106 deletions(-) diff --git a/examples/semi_supervised/compare_sparse_kernels_mnist.py b/examples/semi_supervised/compare_sparse_kernels_mnist.py index f633102adcbf9..1d0bb86f301f0 100644 --- a/examples/semi_supervised/compare_sparse_kernels_mnist.py +++ b/examples/semi_supervised/compare_sparse_kernels_mnist.py @@ -20,7 +20,7 @@ The first kernel option produces a binary k-Nearest Neighbors adjacency matrix. The second produces a kernel which is also k-sparse, but contains the same -weights as used in an RBF kernel. +weights as used in an RBF kernel. Notice that the performance of the sparse-RBF kernel is very sensitive to parameters; the parameters used here were found by a quick manual search, so @@ -28,47 +28,136 @@ kernel effectively on a new dataset requires hyperparameter tuning. """ import numpy as np +from pprint import pprint from sklearn.datasets import fetch_openml +from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.semi_supervised import LabelSpreading -from sklearn.metrics import classification_report, confusion_matrix -from sklearn.model_selection import train_test_split -from sklearn.metrics import make_scorer import time -Xorig, Yorig = fetch_openml('mnist_784', version=1, return_X_y=True) -Yorig = Y.astype(int) -# For a quick demonstration, use only a subset of the data -n_total = 10000 -X = Xorig[:n_total, :] -Y = Yorig[:n_total] +def run_comparison(): + X_orig, y_orig = fetch_openml('mnist_784', version=1, return_X_y=True) + y_orig = y_orig.astype(int) + + # First, we use a small subset of the data to tune hyperparameters + n_total = 5000 + X = X_orig[:n_total, :] + y = y_orig[:n_total] + + test_fraction = 0.333 + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_fraction, random_state=0) + + # Mask subset of train data for transductive learning + + # We perform a grid search to optimize parameters for sparse-rbf + # kernel. For this purpose, we use a smaller subset of the data. In all + # cases, we simply use max_iter=100 Notice that we are searching over the + # inductive accuracy (accuracy on the test set) rather than the + # transductive accuracy (on the masked training examples). 
This keeps + # things a bit simpler, though we could customize the score function and + # the `WrapLabelSpreading` class further to also hyperparameter search over + # the transductive accuracy. + sparse_rbf_model = GridSearchCV( + WrapLabelSpreading(kernel='sparse-rbf', supervision_fraction=0.05), + param_grid={ + 'n_jobs': [-1], + 'max_iter': [100], + 'alpha': np.linspace(0.01, 0.50, 5), + 'gamma': np.logspace(-8, 1, 20), + 'n_neighbors': list(range(5, 60, 3))}, + cv=3) + + sparse_rbf_model.fit(X, y) + sparse_rbf_params = sparse_rbf_model.best_params_ + print(f"Optimal parameters for sparse RBF kernel: {sparse_rbf_params}") + + knn_model = GridSearchCV(WrapLabelSpreading(kernel='knn', + supervision_fraction=0.05), + param_grid={ + 'n_jobs': [-1], + 'max_iter': [100], + 'alpha': np.linspace(0.01, 0.50, 5), + 'n_neighbors': list(range(5, 60, 3))}, + cv=3) + + knn_model.fit(X, y) + knn_params = knn_model.best_params_ + print(f"Optimal parameters for knn kernel: {knn_params}") + + # Next, we can compare our optimized models on a larger dataset. + n_total = 20000 + X = X_orig[:n_total, :] + y = y_orig[:n_total] + test_fraction = 0.333 + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_fraction, random_state=0) + + supervision_fractions = [0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1] + results = { + 'transduction': {'knn': [], 'sparse-rbf': []}, + 'induction': {'knn': [], 'sparse-rbf': []}, + 'runtimes': {'knn': [], 'sparse-rbf': []} + } + for supervision_fraction in supervision_fractions: + n_train = len(y_train) + n_labeled = int(supervision_fraction * n_train) + indices = np.arange(n_train) + unlabeled_set = indices[n_labeled:] + + y_masked = np.copy(y_train) + y_masked[unlabeled_set] = -1 + + for kernel_name, params in zip(['knn', 'sparse-rbf'], + [knn_params, sparse_rbf_params]): + model = LabelSpreading(kernel=kernel_name, **params) + print("="*80) + print(f"Kernel: {kernel_name}, " + + f"Supervision fraction: {supervision_fraction}") + transductive_accs = [] + inductive_accs = [] + runtimes = [] + + # Repeat each scenario several times to collect rough statistics + for _ in range(3): + t0 = time.time() + model.fit(X_train, y_masked) + + predicted_labels = model.transduction_[unlabeled_set] + true_labels = y_train[unlabeled_set] + transductive_acc = (np.sum(predicted_labels == true_labels) / + len(unlabeled_set)) + transductive_accs.append(transductive_acc) + inductive_acc = model.score(X_test, y_test) + inductive_accs.append(inductive_acc) + t1 = time.time() + runtimes.append(t1-t0) + + mean_t_acc = np.mean(transductive_accs) + mean_i_acc = np.mean(inductive_accs) + mean_runtime = np.mean(runtimes) + + print(f"Mean transductive accuracy: {100 * mean_t_acc:.2f}%, " + + f"Mean inductive accuracy: {100 * mean_i_acc:.2f}%, " + + f"Mean runtime: {mean_runtime:.2f}s") + + results['transduction'][kernel_name].append(mean_t_acc) + results['induction'][kernel_name].append(mean_i_acc) + results['runtimes'][kernel_name].append(mean_runtime) + + print("="*80) + print(f"supervision_fractions: {supervision_fractions}") + pprint(results) -# Save test set for inductive learning -test_fraction = 0.333 -Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=test_fraction, - random_state=0) - -# Mask subset of train data for transductive learning -n_train = len(Ytrain) -#kwargs = {'gamma': 1e-9, 'n_neighbors': 50, 'n_jobs': -1, 'max_iter': 100} - -#models = [LabelSpreading(kernel='knn', **kwargs), -# LabelSpreading(kernel='sparse-rbf', **kwargs)] - 
-#supervision_fractions = [0.001, 0.005, 0.01, 0.05, 0.1] - -# First, we perform a grid search to optimize parameters for sparse-rbf kernel. -# For this purpose, we use a smaller subset of the data. -# Notice also that we class WrapLabelSpreading(LabelSpreading): """ In order to perform a grid search over this semi-supervised model, - we need to provide a thin wrapper that masks a subset of the data before + we need to provide a thin wrapper that masks a subset of the data before `fit` is called. """ def __init__(self, supervision_fraction, kernel='sparse-rbf', gamma=20, - n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=None): + n_neighbors=7, alpha=0.2, max_iter=30, tol=1e-3, n_jobs=None): self.supervision_fraction = supervision_fraction @@ -79,7 +168,7 @@ def __init__(self, supervision_fraction, kernel='sparse-rbf', gamma=20, def fit(self, X, y): # mask a random subset of labels, based on self.supervision_fraction n_total = len(y) - n_labeled = self.supervision_fraction * n_total + n_labeled = int(self.supervision_fraction * n_total) indices = np.arange(n_total) np.random.seed(0) @@ -88,82 +177,9 @@ def fit(self, X, y): y[unlabeled_subset] = -1 - super().fit(X,y) + super().fit(X, y) return self -# In all cases, we simply use max_iter=100 -sparse_rbf_model = GridSearchCV(WrapLabelSpreading(kernel='sparse-rbf'), - param_grid= { - 'gamma': np.logspace(-8, 1, 10), - 'alpha': np.linspace(0, 1, 10), - 'n_neighbors': list(range(5,55,5))}) - -knn_model = GridSearchCV(WrapLabelSpreading(kernel='knn'), - param_grid= { - 'n_neighbors': list(range(5,55,5))}, - 'alpha': np.linspace(0, 1, 10), - ) - - -# Then, we compare the performance of optimized sparse-rbf kernel to knn kernel -supervision_fractions = [0.05, 0.1] -accuracies = { - 'transduction': { 'knn':[], 'sparse-rbf':[] }, - 'induction': { 'knn':[], 'sparse-rbf':[] } -} -for supervision_fraction in supervision_fractions: - supervision_fraction = 0.05 - n_labeled = int(supervision_fraction * n_train) - indices = np.arange(n_train) - unlabeled_set = indices[n_labeled:] - - Ymasked = np.copy(Ytrain) - Ymasked[unlabeled_set] = -1 - - for kernel_name, model in zip(['knn', 'sparse-rbf'], - [knn_model, sparse_rbf_model]): - knn_acc_trans = [] - knn_acc_ind = [] - sparse_rbf_acc_trans = [] - sparse_rbf_acc_ind = [] - # Repeat each scenario 5 times to collect rough statistics - # for _ in range(5): - print("="*80) - t0 = time.time() - print(f"MODEL: {model}") - model.fit(Xtrain, Ymasked) - t1 = time.time() - - predicted_labels = model.transduction_[unlabeled_set] - true_labels = Ytrain[unlabeled_set] - acc = np.sum(predicted_labels == true_labels) / len(unlabeled_set) - print(f"accuracy: {acc}") - - - - print("-"*80) - print(f"TRANSDUCTION: {n_labeled} labeled and " + - f"{n_train - n_labeled} unlabeled points ({n_train} total)") - print("-"*80) - print("Confusion Matrix:") - print(confusion_matrix(true_labels, predicted_labels, - labels=model.classes_)) - print("-"*80) - print("Classification Report:") - print(classification_report(true_labels, predicted_labels)) - print("-"*80) - - predicted_labels = model.predict(Xtest) - t2 = time.time() - - print("-"*80) - print(f"INDUCTION: {int(test_fraction * n_total)} test points") - print("-"*80) - print("Confusion Matrix:") - print(confusion_matrix(Ytest, predicted_labels, labels=model.classes_)) - print("-"*80) - print("Classification Report:") - print(classification_report(Ytest, predicted_labels)) - print("-"*80) - print(f"Runtimes: Transduction: {t1 - t0:.2f}s. 
Induction: {t2 - t1:.2f}s")

From 741ef070d207c95350a627b9356d89659bcbe80b Mon Sep 17 00:00:00 2001
From: Niklas Smedemark-Margulies 
Date: Wed, 18 Dec 2019 14:30:07 -0500
Subject: [PATCH 6/8] WIP - sparse RBF kernel

---
 .../compare_sparse_kernels_mnist.py           | 194 ++++++++++++------
 sklearn/semi_supervised/_label_propagation.py |   2 +-
 2 files changed, 133 insertions(+), 63 deletions(-)

diff --git a/examples/semi_supervised/compare_sparse_kernels_mnist.py b/examples/semi_supervised/compare_sparse_kernels_mnist.py
index 1d0bb86f301f0..cfb7fffff4af0 100644
--- a/examples/semi_supervised/compare_sparse_kernels_mnist.py
+++ b/examples/semi_supervised/compare_sparse_kernels_mnist.py
@@ -27,6 +27,7 @@
 the model can likely be improved with further optimization, and using this
 kernel effectively on a new dataset requires hyperparameter tuning.
 """
+import matplotlib.pyplot as plt
 import numpy as np
 from pprint import pprint
 from sklearn.datasets import fetch_openml
 from sklearn.model_selection import train_test_split, GridSearchCV
 from sklearn.semi_supervised import LabelSpreading
 import time


def run_grid_search(X, y):
    """
    We perform a grid search to optimize parameters for the sparse-rbf
    kernel. For this purpose, we use a smaller subset of the data. In all
    cases, we simply use max_iter=100. Notice that we are searching over the
    inductive accuracy (accuracy on the test set) rather than the
    transductive accuracy (on the masked training examples). This keeps
    things a bit simpler, though we could customize the score function and
    the `WrapLabelSpreading` class further to also search over the
    transductive accuracy.
    """

    # First, we use a small subset of the data to tune hyperparameters
    n_total = 5000
    X = X[:n_total, :]
    y = y[:n_total]

    test_fraction = 0.333
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_fraction, random_state=0)

    # In order to use GridSearchCV, we will use a thin wrapper class
    # that masks a subset of our training labels.
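    # (Masking sketch, restating the wrapper's behavior for readers: with
    # supervision fraction f, `WrapLabelSpreading` shuffles the sample
    # indices once and sets y[i] = -1, the "unlabeled" marker, for all but
    # the first int(f * len(y)) of them before delegating to the parent fit;
    # see the class definition below.)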
sparse_rbf_model = GridSearchCV( WrapLabelSpreading(kernel='sparse-rbf', supervision_fraction=0.05), param_grid={ 'n_jobs': [-1], 'max_iter': [100], - 'alpha': np.linspace(0.01, 0.50, 5), - 'gamma': np.logspace(-8, 1, 20), - 'n_neighbors': list(range(5, 60, 3))}, + 'alpha': np.linspace(0.01, 0.99, 10), + 'gamma': np.logspace(-8, -4, 10), + 'n_neighbors': list(range(6, 30, 2))}, cv=3) sparse_rbf_model.fit(X, y) sparse_rbf_params = sparse_rbf_model.best_params_ print(f"Optimal parameters for sparse RBF kernel: {sparse_rbf_params}") - knn_model = GridSearchCV(WrapLabelSpreading(kernel='knn', - supervision_fraction=0.05), - param_grid={ - 'n_jobs': [-1], - 'max_iter': [100], - 'alpha': np.linspace(0.01, 0.50, 5), - 'n_neighbors': list(range(5, 60, 3))}, - cv=3) + knn_model = GridSearchCV( + WrapLabelSpreading(kernel='knn', supervision_fraction=0.05), + param_grid={ + 'n_jobs': [-1], + 'max_iter': [100], + 'alpha': np.linspace(0.01, 0.99, 10), + 'n_neighbors': list(range(6, 30, 2))}, + cv=3) knn_model.fit(X, y) knn_params = knn_model.best_params_ print(f"Optimal parameters for knn kernel: {knn_params}") + return n_total, sparse_rbf_params, knn_params + +def run_comparison(X, y, sparse_rbf_params, knn_params, n_skip): + print("Begin comparison...") # Next, we can compare our optimized models on a larger dataset. - n_total = 20000 - X = X_orig[:n_total, :] - y = y_orig[:n_total] + n_total = 35000 + X = X[n_skip:n_total+n_skip, :] + y = y[n_skip:n_total+n_skip] test_fraction = 0.333 + + print("Train/Test split...") X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=test_fraction, random_state=0) supervision_fractions = [0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1] results = { - 'transduction': {'knn': [], 'sparse-rbf': []}, - 'induction': {'knn': [], 'sparse-rbf': []}, - 'runtimes': {'knn': [], 'sparse-rbf': []} + 'transduction': { + 'knn': {'avg': [], 'std': []}, + 'sparse-rbf': {'avg': [], 'std': []}}, + 'induction': { + 'knn': {'avg': [], 'std': []}, + 'sparse-rbf': {'avg': [], 'std': []}}, + 'runtimes': { + 'knn': {'avg': [], 'std': []}, + 'sparse-rbf': {'avg': [], 'std': []}} } - for supervision_fraction in supervision_fractions: - n_train = len(y_train) - n_labeled = int(supervision_fraction * n_train) - indices = np.arange(n_train) - unlabeled_set = indices[n_labeled:] - - y_masked = np.copy(y_train) - y_masked[unlabeled_set] = -1 + rng = np.random.RandomState(0) + for supervision_fraction in supervision_fractions: for kernel_name, params in zip(['knn', 'sparse-rbf'], [knn_params, sparse_rbf_params]): model = LabelSpreading(kernel=kernel_name, **params) print("="*80) print(f"Kernel: {kernel_name}, " + f"Supervision fraction: {supervision_fraction}") - transductive_accs = [] - inductive_accs = [] - runtimes = [] # Repeat each scenario several times to collect rough statistics - for _ in range(3): + t_accs = [] + i_accs = [] + runtimes = [] + for _ in range(5): + n_train = len(y_train) + n_labeled = int(supervision_fraction * n_train) + indices = np.arange(n_train) + rng.shuffle(indices) + unlabeled_set = indices[n_labeled:] + + y_masked = np.copy(y_train) + y_masked[unlabeled_set] = -1 + t0 = time.time() model.fit(X_train, y_masked) predicted_labels = model.transduction_[unlabeled_set] true_labels = y_train[unlabeled_set] - transductive_acc = (np.sum(predicted_labels == true_labels) / - len(unlabeled_set)) - transductive_accs.append(transductive_acc) - inductive_acc = model.score(X_test, y_test) - inductive_accs.append(inductive_acc) + t_acc = (np.sum(predicted_labels == 
true_labels) / + len(unlabeled_set)) + t_accs.append(t_acc) + i_accs.append(model.score(X_test, y_test)) t1 = time.time() runtimes.append(t1-t0) - mean_t_acc = np.mean(transductive_accs) - mean_i_acc = np.mean(inductive_accs) - mean_runtime = np.mean(runtimes) - - print(f"Mean transductive accuracy: {100 * mean_t_acc:.2f}%, " + - f"Mean inductive accuracy: {100 * mean_i_acc:.2f}%, " + - f"Mean runtime: {mean_runtime:.2f}s") - - results['transduction'][kernel_name].append(mean_t_acc) - results['induction'][kernel_name].append(mean_i_acc) - results['runtimes'][kernel_name].append(mean_runtime) + results['transduction'][kernel_name]['avg'].append(np.mean(t_accs)) + results['transduction'][kernel_name]['std'].append(np.std(t_accs)) + results['induction'][kernel_name]['avg'].append(np.mean(i_accs)) + results['induction'][kernel_name]['std'].append(np.std(i_accs)) + results['runtimes'][kernel_name]['avg'].append(np.mean(runtimes)) + results['runtimes'][kernel_name]['std'].append(np.std(runtimes)) print("="*80) print(f"supervision_fractions: {supervision_fractions}") pprint(results) + return supervision_fractions, results + + +def plot_results(supervision_fractions, results): + fig, ax = plt.subplots(3, 1, figsize=(16, 9)) + for i, (label, ylabel) in enumerate(zip( + ['induction', 'transduction', 'runtimes'], + ['% Accuracy', '% Accuracy', 'Duration (s)'])): + + S_avg = results[label]['sparse-rbf']['avg'] + S_std = results[label]['sparse-rbf']['std'] + + K_avg = results[label]['knn']['avg'] + K_std = results[label]['knn']['std'] + + ax[i].scatter(supervision_fractions, S_avg, c='b', label='sparse-rbf') + ax[i].scatter(supervision_fractions, K_avg, c='r', label='knn') + ax[i].set_xscale('log') + ax[i].set_xlim([8e-4, 1.3e-1]) + ax[i].set_title(f'{label.capitalize()}') + ax[i].set_xlabel('Supervision Fraction') + ax[i].fill_between(supervision_fractions, + [a - b for a, b in zip(S_avg, S_std)], + [a + b for a, b in zip(S_avg, S_std)], + facecolor='b', alpha=0.2) + ax[i].fill_between(supervision_fractions, + [a + b for a, b in zip(K_avg, K_std)], + [a - b for a, b in zip(K_avg, K_std)], + facecolor='r', alpha=0.2) + + plt.tight_layout() + plt.savefig('sparse_kernel_comparison.png') class WrapLabelSpreading(LabelSpreading): """ In order to perform a grid search over this semi-supervised model, - we need to provide a thin wrapper that masks a subset of the data before + we need to provide a wrapper that masks a subset of the data before `fit` is called. 
""" def __init__(self, supervision_fraction, kernel='sparse-rbf', gamma=20, @@ -182,4 +224,32 @@ def fit(self, X, y): if __name__ == '__main__': - run_comparison() + X, y = fetch_openml('mnist_784', version=1, return_X_y=True) + y = y.astype(int) + + # Set this flag to run the grid search, which takes several hours + do_grid_search = False + + if do_grid_search: + n_skip, sparse_rbf_params, knn_params = run_grid_search(X, y) + else: + # Values found from running grid search previously + sparse_rbf_params = { + 'alpha': 0.663, + 'gamma': 2.154e-7, + 'max_iter': 100, + 'n_jobs': -1, + 'n_neighbors': 20} + knn_params = { + 'alpha': 0.772, + 'max_iter': 100, + 'n_jobs': -1, + 'n_neighbors': 6} + n_skip = 0 + + supervision_fractions, results = run_comparison( + X, y, sparse_rbf_params=sparse_rbf_params, + knn_params=knn_params, + n_skip=n_skip) + + plot_results(supervision_fractions, results) diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 7fd5ec7d2d345..5013e0a69037b 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -136,7 +136,7 @@ def _get_kernel(self, X, y=None): W = self.nn_fit.kneighbors_graph(y, mode='distance').T.power(2) W *= -1 * self.gamma np.exp(W.data, out=W.data) - # explicitly set diagonal, + # explicitly set diagonal, # since np.exp(W.data) does not modify zeros on the diagonal W.setdiag(1) return W From ef0703d46318fcc7c77abd4ad0647f960d9a998e Mon Sep 17 00:00:00 2001 From: Niklas Smedemark-Margulies Date: Wed, 18 Dec 2019 17:32:23 -0500 Subject: [PATCH 7/8] WIP - sparse RBF kernel --- .../compare_sparse_kernels_mnist.py | 93 ++++++++++++------- 1 file changed, 59 insertions(+), 34 deletions(-) diff --git a/examples/semi_supervised/compare_sparse_kernels_mnist.py b/examples/semi_supervised/compare_sparse_kernels_mnist.py index cfb7fffff4af0..67ec2a8d4f3fd 100644 --- a/examples/semi_supervised/compare_sparse_kernels_mnist.py +++ b/examples/semi_supervised/compare_sparse_kernels_mnist.py @@ -33,10 +33,11 @@ from sklearn.datasets import fetch_openml from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.semi_supervised import LabelSpreading +from torchvision.datasets import CIFAR10 import time -def run_grid_search(X, y): +def run_grid_search(X_train, X_test, y_train, y_test): """ We perform a grid search to optimize parameters for sparse-rbf kernel. For this purpose, we use a smaller subset of the data. In all @@ -47,16 +48,6 @@ def run_grid_search(X, y): the `WrapLabelSpreading` class further to also hyperparameter search over the transductive accuracy. """ - - # First, we use a small subset of the data to tune hyperparameters - n_total = 5000 - X = X[:n_total, :] - y = y[:n_total] - - test_fraction = 0.333 - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=test_fraction, random_state=0) - # In order to use GridSearchCV, we will use a thin wrapper class # that masks a subset of our training labels. 
sparse_rbf_model = GridSearchCV( @@ -69,7 +60,8 @@ def run_grid_search(X, y): 'n_neighbors': list(range(6, 30, 2))}, cv=3) - sparse_rbf_model.fit(X, y) + sparse_rbf_model.fit(np.vstack((X_train, X_test)), + np.concatenate((y_train, y_test))) sparse_rbf_params = sparse_rbf_model.best_params_ print(f"Optimal parameters for sparse RBF kernel: {sparse_rbf_params}") @@ -82,24 +74,16 @@ def run_grid_search(X, y): 'n_neighbors': list(range(6, 30, 2))}, cv=3) - knn_model.fit(X, y) + knn_model.fit(np.vstack((X_train, X_test)), + np.concatenate((y_train, y_test))) knn_params = knn_model.best_params_ print(f"Optimal parameters for knn kernel: {knn_params}") - return n_total, sparse_rbf_params, knn_params + return sparse_rbf_params, knn_params -def run_comparison(X, y, sparse_rbf_params, knn_params, n_skip): +def run_comparison(X_train, X_test, y_train, y_test, + sparse_rbf_params, knn_params): print("Begin comparison...") - # Next, we can compare our optimized models on a larger dataset. - n_total = 35000 - X = X[n_skip:n_total+n_skip, :] - y = y[n_skip:n_total+n_skip] - test_fraction = 0.333 - - print("Train/Test split...") - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=test_fraction, random_state=0) - supervision_fractions = [0.001, 0.003, 0.005, 0.01, 0.03, 0.05, 0.1] results = { 'transduction': { @@ -224,16 +208,54 @@ def fit(self, X, y): if __name__ == '__main__': - X, y = fetch_openml('mnist_784', version=1, return_X_y=True) - y = y.astype(int) - + # Choose the dataset + dataset = 'mnist' + # Set the fraction of data to use for hyperparam tuning + hyperp_tune_fraction = 0.1 + # Set the fraction of data to use for the final comparison + compare_fraction = 0.1 # Set this flag to run the grid search, which takes several hours do_grid_search = False + if dataset == 'mnist': + X, y = fetch_openml('mnist_784', version=1, return_X_y=True) + y = y.astype(int) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=10000, random_state=0) + + elif dataset == 'cifar10': + cifar10 = CIFAR10('.', download=True, train=True) + X_train = cifar10.data.reshape(-1, 3*32*32) + y_train = np.array(cifar10.targets) + + cifar10 = CIFAR10('.', download=True, train=False) + X_test = cifar10.data.reshape(-1, 3*32*32) + y_test = np.array(cifar10.targets) + else: + raise ValueError(f"dataset {dataset} not supported") + + print("Full dataset sizes: " + + f"\nX_train {X_train.shape}" + + f"\nX_test {X_test.shape}" + + f"\ny_train {y_train.shape}" + + f"\ny_test {y_test.shape}") + if do_grid_search: - n_skip, sparse_rbf_params, knn_params = run_grid_search(X, y) + # First, we use a small subset of the data to tune hyperparameters + tr_tune = int(hyperp_tune_fraction * len(y_train)) + te_tune = int(hyperp_tune_fraction * len(y_test)) + print("# items for hyperparam tuning:" + + f"train: {tr_tune}, test: {te_tune}") + sparse_rbf_params, knn_params = run_grid_search( + X_train[:tr_tune, :], + X_test[:te_tune, :], + y_train[:tr_tune], + y_test[:te_tune]) else: - # Values found from running grid search previously + # Values found from running grid search previously on MNIST + tr_tune = 0 + te_tune = 0 sparse_rbf_params = { 'alpha': 0.663, 'gamma': 2.154e-7, @@ -245,11 +267,14 @@ def fit(self, X, y): 'max_iter': 100, 'n_jobs': -1, 'n_neighbors': 6} - n_skip = 0 + # Skip the items used for hyperparam tuning + tr_comp = int(compare_fraction * len(y_train)) + tr_tune + te_comp = int(compare_fraction * len(y_test)) + te_tune supervision_fractions, results = run_comparison( - X, y, 
sparse_rbf_params=sparse_rbf_params, - knn_params=knn_params, - n_skip=n_skip) + X_train[tr_tune:tr_comp, :], X_test[te_tune:te_comp, :], + y_train[tr_tune:tr_comp], y_test[te_tune:te_comp], + sparse_rbf_params=sparse_rbf_params, + knn_params=knn_params) plot_results(supervision_fractions, results) From 12610aa321b88d76680858892bc6552959fdae3f Mon Sep 17 00:00:00 2001 From: Niklas Smedemark-Margulies Date: Wed, 18 Dec 2019 18:43:42 -0500 Subject: [PATCH 8/8] Fix plot legend and use 20% data --- examples/semi_supervised/compare_sparse_kernels_mnist.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/semi_supervised/compare_sparse_kernels_mnist.py b/examples/semi_supervised/compare_sparse_kernels_mnist.py index 67ec2a8d4f3fd..7898d889c2d0e 100644 --- a/examples/semi_supervised/compare_sparse_kernels_mnist.py +++ b/examples/semi_supervised/compare_sparse_kernels_mnist.py @@ -163,6 +163,8 @@ def plot_results(supervision_fractions, results): ax[i].set_xlim([8e-4, 1.3e-1]) ax[i].set_title(f'{label.capitalize()}') ax[i].set_xlabel('Supervision Fraction') + ax[i].set_ylabel(ylabel) + ax[i].legend() ax[i].fill_between(supervision_fractions, [a - b for a, b in zip(S_avg, S_std)], [a + b for a, b in zip(S_avg, S_std)], @@ -213,7 +215,7 @@ def fit(self, X, y): # Set the fraction of data to use for hyperparam tuning hyperp_tune_fraction = 0.1 # Set the fraction of data to use for the final comparison - compare_fraction = 0.1 + compare_fraction = 0.2 # Set this flag to run the grid search, which takes several hours do_grid_search = False
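
Note for reviewers: the 'sparse-rbf' kernel added by this series can be
exercised with a minimal sketch along the following lines. This is a rough
check, assuming a build of this branch; gamma and n_neighbors are
illustrative values taken from the tests above, not tuned:

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.semi_supervised import LabelSpreading

    X, y = make_classification(n_classes=4, n_samples=500, n_features=20,
                               n_informative=20, n_redundant=0,
                               random_state=0)

    # Mark all but the first 50 points as unlabeled (-1 is the convention
    # used by the semi_supervised estimators)
    y_masked = np.copy(y)
    y_masked[50:] = -1

    # 'sparse-rbf' builds a k-sparse RBF affinity instead of a dense one
    model = LabelSpreading(kernel='sparse-rbf', gamma=1e-5, n_neighbors=10)
    model.fit(X, y_masked)

    # Transductive accuracy on the points whose labels were masked
    print(np.mean(model.transduction_[50:] == y[50:]))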