diff --git a/doc/modules/neural_networks_unsupervised.rst b/doc/modules/neural_networks_unsupervised.rst
index 9924e76eed600..3b68445fc071b 100644
--- a/doc/modules/neural_networks_unsupervised.rst
+++ b/doc/modules/neural_networks_unsupervised.rst
@@ -7,6 +7,132 @@ Neural network models (unsupervised)
 .. currentmodule:: sklearn.neural_network

+.. _random_basis_function:
+
+Random basis function
+=====================
+
+The random basis function :math:`f(X): R^{d} \rightarrow R^{k}` maps the matrix
+:math:`X` into another feature space whose number of features can be lower than,
+equal to, or higher than that of the original feature space. The output matrix
+:math:`H` is computed as follows:
+
+.. math::
+
+    H = g(Xw + b)
+
+where :math:`g(\cdot): R \rightarrow R` is the activation function, :math:`w`
+is the weight matrix, and :math:`b` is the intercept vector.
+
+:math:`w \in R^{d \times k}` and :math:`b \in R^{k}` are drawn from a uniform
+distribution scaled between two values set by the user.
+
+The example code below illustrates using this transformer::
+
+    >>> from sklearn.neural_network import RandomBasisFunction
+    >>> X = [[0, 0], [1, 1]]
+    >>> fe = RandomBasisFunction(random_state=1, n_outputs=2)
+    >>> fe.fit(X)
+    RandomBasisFunction(activation='tanh', intercept=True, n_outputs=2,
+              random_state=1, weight_scale='auto')
+    >>> fe.transform(X)
+    array([[-0.69896184, -0.76098975],
+           [-0.97981807, -0.73662692]])
+
+This transformer can be used to construct the hidden layer of a single-hidden
+layer feedforward network.
+
+Randomly weighted single-hidden layer feedforward network
+=========================================================
+
+A randomly weighted neural network (RW-NN) is a supervised learning model that
+trains a single-hidden layer feedforward network (SLFN) with the help of
+randomization. It computes :math:`w_1 \in R^{d \times k}`,
+:math:`w_2 \in R^{k \times o}`, and :math:`b \in R^{k}` such that:
+
+.. math::
+
+    g(Xw_1 + b)w_2 \approx y
+
+where :math:`g(\cdot): R \rightarrow R` is the activation function;
+:math:`w_1 \in R^{d \times k}` is the weight matrix between the input layer of
+the network and the hidden layer; :math:`w_2 \in R^{k \times o}` is the weight
+matrix between the hidden layer of the network and the output layer; and
+:math:`b \in R^{k}` is the intercept vector for the hidden layer. Figure 1
+shows an example of such a network.
+
+.. figure:: ../auto_examples/neural_networks/images/plot_slfn_001.png
+   :target: ../auto_examples/neural_networks/plot_slfn.html
+   :align: center
+   :scale: 100%
+
+The algorithm takes the following steps:
+
+  * generate the matrices :math:`w_1 \in R^{d \times k}` and :math:`b \in R^{k}` with random values drawn from the uniform distribution;
+  * compute :math:`H = g(Xw_1 + b)`; and
+  * solve for :math:`w_2` using a linear model, such as ridge regression, whose closed-form solution is :math:`w_2 = (H^T H + (1 / C) I)^{-1} H^T y`, where :math:`C` is the regularization term.
+
+:math:`k` is the number of hidden neurons. A larger :math:`k` gives the model a
+higher capacity to learn complex functions. :math:`H`, the matrix of hidden
+neuron values, represents randomly weighted combinations of the training set
+features. This technique provides an approximation of the solution returned by
+training an SLFN with backpropagation, because, unlike backpropagation, it does
+not propagate the errors resulting from solving :math:`w_2` back to the
+previous layer.
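+
+As a rough illustration (not part of the estimator API), the three steps above
+can be sketched directly with NumPy and :class:`~sklearn.linear_model.Ridge`;
+the toy data and the choices of :math:`k` and :math:`C` below are arbitrary::
+
+    import numpy as np
+    from sklearn.linear_model import Ridge
+
+    rng = np.random.RandomState(1)
+    X = rng.rand(20, 5)            # n_samples=20, d=5 input features
+    y = X.sum(axis=1)              # arbitrary regression target
+    d, k, C = X.shape[1], 10, 1.0  # k hidden neurons, regularization term C
+
+    # step 1: random input-to-hidden weights and intercept (uniform distribution)
+    w1 = rng.uniform(-1, 1, size=(d, k))
+    b = rng.uniform(-1, 1, size=k)
+
+    # step 2: hidden activations H = g(X w1 + b), with g = tanh
+    H = np.tanh(np.dot(X, w1) + b)
+
+    # step 3: hidden-to-output weights, w2 = (H^T H + (1 / C) I)^{-1} H^T y,
+    # obtained here through ridge regression with alpha = 1 / C
+    w2 = Ridge(alpha=1. / C, fit_intercept=False).fit(H, y).coef_
+
+    y_pred = np.dot(H, w2)         # approximates y on the training data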
+
+For classification, one can use a pipeline comprising :class:`RandomBasisFunction` and :class:`RidgeClassifier` as
+shown in the following example::
+
+    >>> from sklearn.neural_network import RandomBasisFunction
+    >>> from sklearn.linear_model import RidgeClassifier
+    >>> from sklearn.pipeline import make_pipeline
+
+    >>> X = [[0, 0], [1, 1]]
+    >>> y = [0, 1]
+
+    >>> clf = make_pipeline(RandomBasisFunction(random_state=1), RidgeClassifier(alpha=0))
+    >>> clf.fit(X, y)
+    Pipeline(steps=[('randombasisfunction', RandomBasisFunction(activation='tanh', intercept=True, n_outputs=10,
+              random_state=1, weight_scale='auto')), ('ridgeclassifier', RidgeClassifier(alpha=0, class_weight=None, copy_X=True, fit_intercept=True,
+              max_iter=None, normalize=False, solver='auto', tol=0.001))])
+
+    >>> clf.predict(X)
+    array([0, 1])
+
+For regression, one can use a pipeline comprising :class:`RandomBasisFunction` and :class:`Ridge` as
+shown in the following example::
+
+    >>> from sklearn.neural_network import RandomBasisFunction
+    >>> from sklearn.linear_model import Ridge
+    >>> from sklearn.pipeline import make_pipeline
+
+    >>> X = [[0, 0], [1, 1]]
+    >>> y = [0.5, 0.2]
+
+    >>> reg = make_pipeline(RandomBasisFunction(random_state=1), Ridge(alpha=0))
+    >>> reg.fit(X, y)
+    Pipeline(steps=[('randombasisfunction', RandomBasisFunction(activation='tanh', intercept=True, n_outputs=10,
+              random_state=1, weight_scale='auto')), ('ridge', Ridge(alpha=0, copy_X=True, fit_intercept=True, max_iter=None,
+              normalize=False, solver='auto', tol=0.001))])
+
+    >>> reg.predict(X)
+    array([ 0.5, 0.2])
+
+The example below shows how tuning some of the hyper-parameters of the pipeline
+affects the resulting decision function:
+
+  * :ref:`example_neural_networks_plot_random_neural_network.py`
+
+
+.. topic:: References:
+
+    * Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of
+      training deep feedforward neural networks." International Conference on
+      Artificial Intelligence and Statistics, 2010.
+
+    * Schmidt, Wouter F., Martin A. Kraaijveld, and Robert P.W. Duin.
+      "Feedforward neural networks with random weights." 11th IAPR
+      International Conference on Pattern Recognition, 1992.
+
+
 .. _rbm:

 Restricted Boltzmann machines
diff --git a/examples/neural_networks/plot_random_neural_network.py b/examples/neural_networks/plot_random_neural_network.py
new file mode 100644
index 0000000000000..bce9beeb3d81f
--- /dev/null
+++ b/examples/neural_networks/plot_random_neural_network.py
@@ -0,0 +1,163 @@
+"""
+===========================================
+Effect of parameters in RandomBasisFunction
+===========================================
+
+This example generates plots that illustrate the impact of varying the parameters of the
+random neural network pipeline (the Ridge regularization term alpha, and the
+RandomBasisFunction parameters weight_scale and n_outputs) on its decision function.
+
+Three plots are generated, each corresponding to varying a single parameter: alpha,
+weight_scale, and n_outputs, respectively.
+
+If the model has high bias, which can lead to a high training error, then decreasing alpha,
+increasing weight_scale, and/or increasing n_outputs decreases bias and therefore reduces
+underfitting. Similarly, if the model has high variance, which is when the training error
+poorly approximates the testing error, then increasing alpha, decreasing weight_scale,
+and/or decreasing n_outputs decreases variance and therefore reduces overfitting.
+
+One way to find a balance between bias and variance when tuning these parameters is by
+testing a range of values using cross-validation, as seen in this example.
+
+"""
+print(__doc__)
+
+
+# Author: Issam H. Laradji
+# License: BSD 3 clause
+
+import numpy as np
+
+from matplotlib import pyplot as plt
+from matplotlib.colors import ListedColormap
+
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.datasets import make_moons, make_circles, make_classification
+from sklearn.neural_network import RandomBasisFunction
+from sklearn.linear_model import Ridge
+from sklearn.pipeline import make_pipeline
+from sklearn.utils.fixes import expit as logistic_sigmoid
+
+
+# To be removed (no predict_proba in Ridge)
+def predict_proba(clf, x):
+    return logistic_sigmoid(clf.predict(x))
+
+h = .02  # step size in the mesh
+rng = np.random.RandomState(1)
+
+alpha_list = np.logspace(-4, 4, 5)
+weight_scale_list = np.logspace(-2, 2, 5)
+n_outputs_list = [2, 10, 100, 200, 500]
+
+
+def plot(names, classifiers, title):
+    X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
+                               random_state=rng, n_clusters_per_class=1)
+
+    linearly_separable = (X, y)
+
+    datasets = [make_moons(noise=1., random_state=rng),
+                make_circles(noise=0.2, factor=0.5, random_state=rng),
+                linearly_separable]
+
+    figure = plt.figure(figsize=(17, 9))
+    figure.suptitle(title)
+    i = 1
+    # iterate over datasets
+    for X, y in datasets:
+        # initialize standard scaler
+        scaler = StandardScaler()
+
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4,
+                                                            random_state=1)
+        # Compute the mean and standard deviation of each feature of the
+        # training set and scale the training set
+        X_train = scaler.fit_transform(X_train)
+
+        # Using the same mean and standard deviation, scale the testing set
+        X_test = scaler.transform(X_test)
+
+        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
+        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
+        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
+                             np.arange(y_min, y_max, h))
+
+        # just plot the dataset first
+        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
+        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
+        # Plot the training points
+        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
+        # and testing points
+        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
+                   alpha=0.6)
+        ax.set_xlim(xx.min(), xx.max())
+        ax.set_ylim(yy.min(), yy.max())
+        ax.set_xticks(())
+        ax.set_yticks(())
+        i += 1
+
+        # iterate over classifiers
+        for name, clf in zip(names, classifiers):
+            ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
+            clf.fit(X_train, y_train)
+            score = clf.score(X_test, y_test)
+
+            # Plot the decision boundary.
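+            # Note: Ridge has no predict_proba; the predict_proba helper above
+            # squashes the real-valued ridge predictions through a logistic
+            # sigmoid purely so the decision surface can be drawn as a smooth map.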
+            Z = predict_proba(clf, np.c_[xx.ravel(), yy.ravel()])
+
+            # Put the result into a color plot
+            Z = Z.reshape(xx.shape)
+
+            ax.contourf(xx, yy, Z, cmap=plt.cm.RdBu, alpha=.8)
+
+            # Plot also the training points
+            ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
+            # and testing points
+            ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
+                       alpha=0.6)
+
+            ax.set_xlim(xx.min(), xx.max())
+            ax.set_ylim(yy.min(), yy.max())
+            ax.set_xticks(())
+            ax.set_yticks(())
+            ax.set_title(name)
+            ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
+                    size=15, horizontalalignment='right')
+            i += 1
+
+classifiers = []
+names = []
+for alpha in alpha_list:
+    clf = make_pipeline(RandomBasisFunction(weight_scale=1.), Ridge(alpha=alpha))
+
+    classifiers.append(clf)
+    names.append("alpha = " + str(alpha))
+
+title = "Effect of varying alpha for fixed weight_scale=1"
+plot(names, classifiers, title)
+
+classifiers = []
+names = []
+for weight_scale in weight_scale_list:
+    clf = make_pipeline(RandomBasisFunction(weight_scale=weight_scale), Ridge(alpha=1.))
+
+    classifiers.append(clf)
+    names.append("weight_scale = " + str(weight_scale))
+
+title = "Effect of varying weight_scale for fixed alpha=1"
+plot(names, classifiers, title)
+
+classifiers = []
+names = []
+for n_outputs in n_outputs_list:
+    clf = make_pipeline(RandomBasisFunction(n_outputs=n_outputs), Ridge(alpha=1.))
+
+    classifiers.append(clf)
+    names.append("n_outputs = " + str(n_outputs))
+
+title = "Effect of varying n_outputs in RandomBasisFunction"
+plot(names, classifiers, title)
+
+plt.show()
diff --git a/examples/neural_networks/plot_random_nn_overfitting.py b/examples/neural_networks/plot_random_nn_overfitting.py
new file mode 100644
index 0000000000000..f343de0471f1b
--- /dev/null
+++ b/examples/neural_networks/plot_random_nn_overfitting.py
@@ -0,0 +1,71 @@
+"""
+===========================================================================
+Impact of increasing the number of hidden neurons in random neural networks
+===========================================================================
+
+This example illustrates how the random neural network behaves when the number
+of hidden neurons is increased. A larger number of hidden neurons increases the
+training score, but might reduce the testing score as a result of overfitting.
+
+The example generates a plot showing how the training and testing scores change
+with the number of hidden neurons on a small dataset.
+
+"""
+print(__doc__)
+
+
+# Author: Issam H. Laradji
+# License: BSD 3 clause
+
+import numpy as np
+
+from sklearn.neural_network import RandomBasisFunction
+from sklearn.linear_model import Ridge
+from sklearn.pipeline import make_pipeline
+from sklearn.model_selection import validation_curve
+
+###############################################################################
+# Generate sample data
+n_samples = 150
+n_features = 50
+
+np.random.seed(0)
+
+coef = np.random.randn(n_features)
+X = np.random.randn(n_samples, n_features)
+y = np.dot(X, coef)
+
+###############################################################################
+# Compute train and test scores
+n_hidden_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
+
+rnn = make_pipeline(RandomBasisFunction(), Ridge(alpha=0))
+
+train_scores, test_scores = validation_curve(
+    rnn, X, y, param_name="randombasisfunction__n_outputs",
+    param_range=n_hidden_list, scoring='r2')
+
+train_scores_mean = np.mean(train_scores, axis=1)
+test_scores_mean = np.mean(test_scores, axis=1)
+
+
+###############################################################################
+# Plot results
+
+import matplotlib.pyplot as plt
+
+plt.plot(n_hidden_list, train_scores_mean, label='Train')
+plt.plot(n_hidden_list, test_scores_mean, label='Test')
+
+plt.legend(loc='lower left')
+plt.title("Random neural network: training vs. testing scores")
+plt.xlabel('Number of neurons in the hidden layer')
+plt.ylabel('$R^2$ score')
+
+plt.ylim([0.1, 1.01])
+
+plt.show()
diff --git a/examples/neural_networks/plot_slfn.py b/examples/neural_networks/plot_slfn.py
new file mode 100644
index 0000000000000..4e63be10a8b49
--- /dev/null
+++ b/examples/neural_networks/plot_slfn.py
@@ -0,0 +1,86 @@
+"""
+============================================
+Plot single-hidden layer feedforward network
+============================================
+
+This example plots a single-hidden layer feedforward network. It also allows
+users to generate networks with different numbers of layers and neurons by
+changing the two variables 'symbols' and 'n_neurons'.
+"""
+print(__doc__)
+
+# Author: Issam H. Laradji
+# License: BSD 3 clause
+
+import pylab as pl
+from matplotlib.patches import Circle, Arrow
+
+
+def create_layer(ax, symbol, n_neurons, bias, x, y):
+    patches = []
+    if bias is True:
+        patches.append(Circle((x, y + 3 * radius), radius, zorder=1,
+                              fc='#CCCCCC'))
+        pl.text(x, y + 3 * radius, "$+1$", ha='center', va='center',
+                fontsize=fontsize)
+
+    for i in range(n_neurons):
+        patches.append(Circle((x, y - i * (3 * radius)), radius,
+                              zorder=1, fc='#CCCCCC'))
+
+        neuron_symbol = symbol
+        if n_neurons != 1:
+            neuron_symbol += "$_" + str(i + 1) + "$"
+
+        pl.text(x, y - i * (3 * radius), neuron_symbol,
+                ha='center', va='center', fontsize=fontsize)
+
+    for p in patches:
+        ax.add_patch(p)
+
+    return patches
+
+
+def create_arrows(ax, prev_patches, curr_patches):
+    for prev in prev_patches:
+        for curr in curr_patches:
+            dx = curr.center[0] - prev.center[0] - 2 * radius
+            dy = curr.center[1] - prev.center[1]
+            ax.add_patch(Arrow(prev.center[0] + prev.radius, prev.center[1],
+                               dx, dy, antialiased=True, fc='#88CCFF',
+                               width=0.05))
+
+# Change 'symbols' and 'n_neurons' for generating different networks
+symbols = ["$x$", "$h$", "$f(x)$"]
+n_neurons = [3, 3, 1]
+
+assert len(symbols) == len(n_neurons)
+
+radius = 0.6
+fontsize = 50 * radius
+
+n_layers = len(symbols)
+x_size = n_layers * radius * 3.5
+y_size = n_neurons[0] * radius * 4
+
+fig = pl.figure(figsize=(x_size, y_size), facecolor='w')
+ax = pl.axes((0, 0, 1, 1), xticks=[], yticks=[], frameon=False)
+ax.set_xlim(0, x_size)
+ax.set_ylim(0, y_size)
+
+start_x = radius * 2
+start_y = y_size - radius * 4.5
+
+rows = start_y + 3 * radius
+prev_patches = create_layer(ax, symbols[0], n_neurons[0], True, start_x,
+                            start_y)
+for i in range(1, n_layers):
+    start_y = rows - (rows - (n_neurons[i] * 2 * radius)) / 2
+    curr_patches = create_layer(ax, symbols[i], n_neurons[i], False,
+                                start_x + 3 * radius, start_y)
+    create_arrows(ax, prev_patches, curr_patches)
+
+    prev_patches = curr_patches
+    start_x += 3 * radius
+
+pl.show()
diff --git a/sklearn/neural_network/__init__.py b/sklearn/neural_network/__init__.py
index 60fadcec1902d..4f6bb3425c952 100644
--- a/sklearn/neural_network/__init__.py
+++ b/sklearn/neural_network/__init__.py
@@ -10,6 +10,9 @@
 from .multilayer_perceptron import MLPClassifier
 from .multilayer_perceptron import MLPRegressor

+from .random_basis_function import RandomBasisFunction
+
 __all__ = ["BernoulliRBM",
            "MLPClassifier",
-           "MLPRegressor"]
+           "MLPRegressor",
+           "RandomBasisFunction"]
diff --git a/sklearn/neural_network/random_basis_function.py b/sklearn/neural_network/random_basis_function.py
new file mode 100644
index 0000000000000..78d71e5ee160b
--- /dev/null
+++ b/sklearn/neural_network/random_basis_function.py
@@ -0,0 +1,158 @@
+import numpy as np
+
+from ._base import ACTIVATIONS
+
+from ..base import BaseEstimator, TransformerMixin
+from ..utils import check_random_state
+from ..utils.extmath import safe_sparse_dot
+from ..utils.validation import check_array
+
+
+class RandomBasisFunction(BaseEstimator, TransformerMixin):
+    """Random basis activation.
+
+    The algorithm uses the number of features of the input data to randomly
+    generate coefficient parameters based on the uniform probability
+    distribution.
+
+    Using these coefficient parameters, the algorithm can transform
+    the data into a different dimensional space.
+
+    Parameters
+    ----------
+    n_outputs : int, default 10
+        The number of output features to generate.
+
+    weight_scale : float, default 'auto'
+        If 'auto', `coef_` and `intercept_` get values ranging between
+        plus/minus 'sqrt(6. / (n_features + n_outputs))' based on
+        the uniform distribution; otherwise, between +weight_scale and
+        -weight_scale.
+
+    activation : {'logistic', 'tanh', 'relu'}, default 'tanh'
+        Activation function for the output features.
+
+         - 'logistic', the logistic sigmoid function,
+           returns f(x) = 1 / (1 + exp(-x)).
+
+         - 'tanh', the hyperbolic tan function,
+           returns f(x) = tanh(x).
+
+         - 'relu', the rectified linear unit function,
+           returns f(x) = max(0, x).
+
+    intercept : boolean, default True
+        Whether to randomly generate an intercept.
+
+    random_state : int or RandomState, optional, default None
+        State of or seed for the random number generator.
+
+    Attributes
+    ----------
+    `coef_` : array-like, shape (n_features, n_outputs)
+        The coefficient parameters used to generate the output features.
+
+    `intercept_` : array-like, shape (n_outputs,)
+        An intercept parameter added to the output features.
+
+    References
+    ----------
+    Glorot, Xavier, and Yoshua Bengio. "Understanding the difficulty of
+    training deep feedforward neural networks." International Conference
+    on Artificial Intelligence and Statistics. 2010.
+
+    Schmidt, Wouter F., Martin A. Kraaijveld, and Robert PW Duin.
+    "Feedforward neural networks with random weights." Pattern Recognition,
+    1992. Vol. II. Conference B: Pattern Recognition Methodology and Systems,
+    Proceedings., 11th IAPR International Conference on. IEEE, 1992.
+
+    See also
+    --------
+    `sklearn.random_projection` and `sklearn.kernel_approximation` contain
+    algorithms that are similar to `RandomBasisFunction` in that they
+    transform the input features to another dimensional space. However,
+    `RandomBasisFunction` is more general in that the user defines the number
+    of features to generate and the function to apply to these output
+    features.
+
+    """
+    def __init__(self, n_outputs=10, weight_scale='auto',
+                 activation='tanh', intercept=True, random_state=None):
+        self.n_outputs = n_outputs
+        self.weight_scale = weight_scale
+        self.activation = activation
+        self.intercept = intercept
+        self.random_state = random_state
+
+    def fit(self, X, y=None):
+        """
+        Generate random parameters based on the number of features the input
+        data has.
+
+        Parameters
+        ----------
+        X : numpy array or scipy.sparse of shape (n_samples, n_features).
+
+        y : is not used: placeholder to allow for usage in a Pipeline.
+
+        Returns
+        -------
+        self
+
+        """
+        # Sanity checks
+        if self.n_outputs <= 0:
+            raise ValueError("n_outputs must be > 0, got %s." %
+                             self.n_outputs)
+
+        if self.activation not in ACTIVATIONS:
+            raise ValueError("The activation %s is not supported. Supported "
+                             "activations are %s." % (self.activation,
+                                                      ACTIVATIONS))
+
+        X = check_array(X, accept_sparse=['csr', 'csc'])
+
+        n_samples, n_features = X.shape
+
+        rng = check_random_state(self.random_state)
+
+        if self.weight_scale == 'auto':
+            weight_init_bound = np.sqrt(6. / (n_features + self.n_outputs))
+        else:
+            weight_init_bound = self.weight_scale
+
+        self.coef_ = rng.uniform(-weight_init_bound, weight_init_bound,
+                                 (n_features, self.n_outputs))
+
+        if self.intercept:
+            self.intercept_ = rng.uniform(-weight_init_bound,
+                                          weight_init_bound,
+                                          self.n_outputs)
+
+        self.activation_function = ACTIVATIONS[self.activation]
+
+        return self
+
+    def transform(self, X, y=None):
+        """
+        Transform the input data to another space using the randomly
+        generated parameters.
+
+        Parameters
+        ----------
+        X : numpy array or scipy.sparse of shape (n_samples, n_features).
+
+        y : is not used: placeholder to allow for usage in a Pipeline.
+
+        Returns
+        -------
+        X_new : numpy array or scipy sparse of shape (n_samples, n_outputs)
+            Projected array.
+
+        """
+        X = check_array(X, accept_sparse=['csr', 'csc'])
+
+        X_new = safe_sparse_dot(X, self.coef_)
+
+        if self.intercept:
+            X_new += self.intercept_
+
+        return self.activation_function(X_new)