Commit c93b7a8

committed: iterate, improve. Nesterov's momentum.
1 parent 7f70965 commit c93b7a8

12 files changed: +663 −418 lines
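The commit message refers to Nesterov's momentum, which shows up below as the new ``nesterovs_momentum`` parameter in the ``MLPClassifier`` reprs and in the SGD-strategies example. As a rough orientation only, here is a hedged sketch of the two momentum variants such a flag typically toggles; the helper name ``sgd_step`` and the exact update forms are assumptions for illustration, not the code added by this commit::

    import numpy as np

    def sgd_step(w, grad, velocity, lr=0.1, momentum=0.9, nesterov=True):
        """One illustrative momentum update on parameters ``w``."""
        velocity = momentum * velocity - lr * grad
        if nesterov:
            # "look-ahead" variant: apply the momentum term once more
            w = w + momentum * velocity - lr * grad
        else:
            # classical (heavy-ball) momentum
            w = w + velocity
        return w, velocity

    # toy usage on the quadratic loss f(w) = 0.5 * ||w||^2, whose gradient is w
    w, v = np.array([1.0, -2.0]), np.zeros(2)
    for _ in range(100):
        w, v = sgd_step(w, grad=w, velocity=v)
    print(w)  # should end up close to the minimum at [0, 0]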

benchmarks/bench_mnist.py

Lines changed: 11 additions & 12 deletions
@@ -9,20 +9,19 @@
 covertype dataset, the feature space is homogenous.
 
 Example of output :
-
 [..]
+
 Classification performance:
 ===========================
-    Classifier               train-time   test-time   error-rat
+    Classifier               train-time   test-time   error-rate
 ------------------------------------------------------------
-    MultilayerPerceptron      1308.66s       0.31s       0.0184
-    Nystroem-SVM               105.07s       0.91s       0.0227
-    ExtraTrees                  48.20s       1.22s       0.0288
-    RandomForest                47.17s       1.21s       0.0304
-    SampledRBF-SVM             140.45s       0.84s       0.0486
-    CART                        22.84s       0.16s       0.1214
-    dummy                        0.01s       0.02s       0.8973
-
+    MultilayerPerceptron       475.76s       1.31s       0.0201
+    Nystroem-SVM               218.38s      17.86s       0.0229
+    ExtraTrees                  45.54s       0.52s       0.0288
+    RandomForest                44.79s       0.32s       0.0304
+    SampledRBF-SVM             265.64s      19.78s       0.0488
+    CART                        21.13s       0.01s       0.1214
+    dummy                        0.01s       0.01s       0.8973
 """
 from __future__ import division, print_function
 
@@ -48,7 +47,7 @@
 from sklearn.svm import LinearSVC
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.utils import check_array
-from sklearn.neural_network import MultilayerPerceptronClassifier
+from sklearn.neural_network import MLPClassifier
 
 # Memoize the data extraction and memory map the resulting
 # train / test splits in readonly mode
@@ -89,7 +88,7 @@ def load_data(dtype=np.float32, order='F'):
         make_pipeline(Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100)),
     'SampledRBF-SVM':
         make_pipeline(RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100)),
-    'MultilayerPerceptron': MultilayerPerceptronClassifier(
+    'MultilayerPerceptron': MLPClassifier(
         hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
         algorithm='sgd', learning_rate_init=0.5, momentum=0.9, verbose=1,
         tol=1e-4, random_state=1)
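The table in the module docstring above reports wall-clock training time, prediction time, and test error for each estimator. A minimal, hypothetical sketch of that timing pattern is shown below; it uses the small digits dataset rather than MNIST so it finishes in seconds, and the estimator settings here are illustrative rather than the benchmark's (imports follow the ones used elsewhere in this branch)::

    from time import time

    from sklearn import datasets
    from sklearn.cross_validation import train_test_split
    from sklearn.neural_network import MLPClassifier

    digits = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        digits.data / 16., digits.target, random_state=0)

    clf = MLPClassifier(hidden_layer_sizes=(100,), algorithm='sgd',
                        learning_rate_init=0.1, max_iter=50, random_state=1)

    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0

    t0 = time()
    error_rate = 1. - clf.score(X_test, y_test)
    test_time = time() - t0

    print("train-time: %.2fs  test-time: %.2fs  error-rate: %.4f"
          % (train_time, test_time, error_rate))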

doc/modules/classes.rst

Lines changed: 2 additions & 2 deletions
@@ -1007,8 +1007,8 @@ See the :ref:`metrics` section of the user guide for further details.
    :template: class.rst
 
    neural_network.BernoulliRBM
-   neural_network.MultilayerPerceptronClassifier
-   neural_network.MultilayerPerceptronRegressor
+   neural_network.MLPClassifier
+   neural_network.MLPRegressor
 
 
 .. _calibration_ref:

doc/modules/neural_networks_supervised.rst

Lines changed: 47 additions & 34 deletions
@@ -33,10 +33,10 @@ layer transforms the values from the previous layer by a weighted linear summation
 :math:`g(\cdot):R \rightarrow R` - like the hyperbolic tan function. The output layer
 receives the values from the last hidden layer and transforms them into output values.
 
-The module contains the public attributes ``layers_coef_`` and ``layers_intercept_``.
-``layers_coef_`` is a list of weight matrices, where weight matrix at index
+The module contains the public attributes ``coefs_`` and ``intercepts_``.
+``coefs_`` is a list of weight matrices, where weight matrix at index
 :math:`i` represents the weights between layer :math:`i` and layer
-:math:`i+1`. ``layers_intercept_`` is a list of bias vectors, where the vector
+:math:`i+1`. ``intercepts_`` is a list of bias vectors, where the vector
 at index :math:`i` represents the bias values added to layer :math:`i+1`.
 
 The advantages of Multi-layer Perceptron are:
@@ -68,45 +68,45 @@ some of these disadvantages.
 Classification
 ==============
 
-Class :class:`MultilayerPerceptronClassifier` implements
+Class :class:`MLPClassifier` implements
 a multi layer perceptron (MLP) algorithm that trains using Backpropagation.
 
 MLP trains on two arrays: array X of size (n_samples, n_features), which holds
 the training samples represented as floating point feature vectors; and array
 y of size (n_samples,), which holds the target values (class labels) for the
 training samples::
 
-    >>> from sklearn.neural_network import MultilayerPerceptronClassifier
+    >>> from sklearn.neural_network import MLPClassifier
     >>> X = [[0., 0.], [1., 1.]]
     >>> y = [0, 1]
-    >>> clf = MultilayerPerceptronClassifier(hidden_layer_sizes=(5, 2), random_state=1)
-    >>> clf.fit(X, y)
-    MultilayerPerceptronClassifier(activation='relu', algorithm='l-bfgs',
-           alpha=1e-05, batch_size=200, hidden_layer_sizes=(5, 2),
-           learning_rate='constant', learning_rate_init=0.5,
-           max_iter=200, power_t=0.5, random_state=1, shuffle=False,
-           tol=1e-05, verbose=False, warm_start=False)
+    >>> clf = MLPClassifier(hidden_layer_sizes=(5, 2), random_state=1)
+    >>> clf.fit(X, y)  # doctest: +NORMALIZE_WHITESPACE
+    MLPClassifier(activation='relu', algorithm='l-bfgs', alpha=1e-05,
+           batch_size=200, early_stopping=False, hidden_layer_sizes=(5, 2),
+           learning_rate='constant', learning_rate_init=0.2, max_iter=200,
+           momentum=0.9, nesterovs_momentum=True, power_t=0.5, random_state=1,
+           shuffle=True, tol=0.0001, verbose=False, warm_start=False)
 
 After fitting (training), the model can predict labels for new samples::
 
     >>> clf.predict([[2., 2.], [-1., -2.]])
     array([1, 0])
 
-MLP can fit a non-linear model to the training data. ``clf.layers_coef_``
+MLP can fit a non-linear model to the training data. ``clf.coefs_``
 contains the weight matrices that constitute the model parameters::
 
-    >>> [coef.shape for coef in clf.layers_coef_]
+    >>> [coef.shape for coef in clf.coefs_]
     [(2, 5), (5, 2), (2, 1)]
 
 To get the raw values before applying the output activation function, run the
 following command,
 
-use :meth:`MultilayerPerceptronClassifier.decision_function`::
+use :meth:`MLPClassifier.decision_function`::
 
     >>> clf.decision_function([[2., 2.], [1., 2.]])  # doctest: +ELLIPSIS
-    array([ 11.55...,  11.55...])
+    array([ 47.6...,  47.6...])
 
-Currently, :class:`MultilayerPerceptronClassifier` supports only the
+Currently, :class:`MLPClassifier` supports only the
 Cross-Entropy loss function, which allows probability estimates by running the
 ``predict_proba`` method.
 
@@ -115,36 +115,36 @@ Cross-Entropy loss function, giving a vector of probability estimates
 :math:`P(y|x)` per sample :math:`x`::
 
     >>> clf.predict_proba([[2., 2.], [1., 2.]])  # doctest: +ELLIPSIS
-    array([[ 9.5...e-06,   9.99...e-01],
-           [ 9.5...e-06,   9.99...e-01]])
+    array([[ 0.,  1.],
+           [ 0.,  1.]])
 
-:class:`MultilayerPerceptronClassifier` supports multi-class classification by
+:class:`MLPClassifier` supports multi-class classification by
 applying `Softmax <http://en.wikipedia.org/wiki/Softmax_activation_function>`_
 as the output function.
 
 Further, the algorithm supports :ref:`multi-label classification <multiclass>`
 in which a sample can belong to more than one class. For each class, the output
-of :meth:`MultilayerPerceptronClassifier.decision_function` passes through the
+of :meth:`MLPClassifier.decision_function` passes through the
 logistic function. Values larger or equal to `0.5` are rounded to `1`,
 otherwise to `0`. For a predicted output of a sample, the indices where the
 value is `1` represents the assigned classes of that samples::
 
     >>> X = [[0., 0.], [1., 1.]]
-    >>> y = [[0, 1], [1]]
-    >>> clf = MultilayerPerceptronClassifier(hidden_layer_sizes=(15,), random_state=1)
+    >>> y = [[0, 1], [1, 1]]
+    >>> clf = MLPClassifier(hidden_layer_sizes=(15,), random_state=1)
     >>> clf.fit(X, y)
-    MultilayerPerceptronClassifier(activation='relu', algorithm='l-bfgs',
-           alpha=1e-05, batch_size=200, hidden_layer_sizes=(15,),
-           learning_rate='constant', learning_rate_init=0.5,
-           max_iter=200, power_t=0.5, random_state=1, shuffle=False,
-           tol=1e-05, verbose=False, warm_start=False)
+    MLPClassifier(activation='relu', algorithm='l-bfgs', alpha=1e-05,
+           batch_size=200, early_stopping=False, hidden_layer_sizes=(15,),
+           learning_rate='constant', learning_rate_init=0.2, max_iter=200,
+           momentum=0.9, nesterovs_momentum=True, power_t=0.5, random_state=1,
+           shuffle=True, tol=0.0001, verbose=False, warm_start=False)
     >>> clf.predict([1., 2.])
-    [(1,)]
+    array([[1, 1]])
     >>> clf.predict([0., 0.])
-    [(0, 1)]
+    array([[0, 1]])
 
 See the examples below and the doc string of
-:meth:`MultilayerPerceptronClassifier.fit` for further information.
+:meth:`MLPClassifier.fit` for further information.
 
 .. topic:: Examples:
 
@@ -155,12 +155,12 @@ See the examples below and the doc string of
 Regression
 ==========
 
-Class :class:`MultilayerPerceptronRegressor` implements
+Class :class:`MLPRegressor` implements
 a multi layer perceptron (MLP) that trains using backpropagation with no
 activation function in the output layer. Therefore, it uses the square error as
 the loss function, and the output is a set of continuous values.
 
-:class:`MultilayerPerceptronRegressor` also supports multi-output regression, in
+:class:`MLPRegressor` also supports multi-output regression, in
 which a sample can have more than one target.
 
 
@@ -308,9 +308,22 @@ Tips on Practical Use
   * Empirically, we observed that `L-BFGS` converges faster and
     with better solutions than `SGD`. Therefore, if mini-batch
    and online learning are unnecessary, it is best advised
-    to set :meth:`MultilayerPerceptronClassifier.algorithm` as
+    to set :meth:`MLPClassifier.algorithm` as
     'l-bfgs'.
 
+More control with warm_start
+============================
+If you want more control over stopping criteria or learning rate in SGD,
+or want to do additional monitoring, using ``warm_start=True`` and
+``max_iter=1`` and iterating yourself can be helpful::
+
+    >>> X = [[0., 0.], [1., 1.]]
+    >>> y = [0, 1]
+    >>> clf = MLPClassifier(hidden_layer_sizes=(15,), random_state=1, max_iter=1)
+    >>> for i in range(10):
+    ...     clf.fit(X, y)
+    ...     # additional monitoring / inspection  # doctest: +ELLIPSIS
+    MLPClassifier(...
 
 .. topic:: References:
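The renamed ``coefs_`` and ``intercepts_`` attributes documented above hold one weight matrix and one bias vector per layer. As a hedged illustration of that structure (assuming the default 'relu' hidden activation shown in the reprs, and leaving the output layer un-activated, matching the "raw values" wording), a manual forward pass over those attributes should reproduce what ``decision_function`` returns on this branch; ``manual_decision_function`` below is a made-up helper, not part of the library::

    import numpy as np
    from sklearn.neural_network import MLPClassifier

    X = [[0., 0.], [1., 1.]]
    y = [0, 1]
    clf = MLPClassifier(hidden_layer_sizes=(5, 2), random_state=1).fit(X, y)

    def manual_decision_function(clf, X):
        activation = np.asarray(X, dtype=float)
        n_layers = len(clf.coefs_)
        for i, (W, b) in enumerate(zip(clf.coefs_, clf.intercepts_)):
            activation = activation.dot(W) + b
            if i < n_layers - 1:
                # hidden layers: assumed 'relu' activation
                activation = np.maximum(activation, 0)
        return activation  # raw output, no final activation applied

    # values should agree; the library may flatten the binary case to 1-D
    print(manual_decision_function(clf, [[2., 2.], [1., 2.]]))
    print(clf.decision_function([[2., 2.], [1., 2.]]))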

examples/classification/plot_classifier_comparison.py

Lines changed: 3 additions & 1 deletion
@@ -34,6 +34,7 @@
 from sklearn.cross_validation import train_test_split
 from sklearn.preprocessing import StandardScaler
 from sklearn.datasets import make_moons, make_circles, make_classification
+from sklearn.neural_network import MLPClassifier
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
@@ -45,13 +46,14 @@
 h = .02  # step size in the mesh
 
 names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
-         "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
+         "Random Forest", "Neural Net", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
 classifiers = [
     KNeighborsClassifier(3),
     SVC(kernel="linear", C=0.025),
     SVC(gamma=2, C=1),
     DecisionTreeClassifier(max_depth=5),
     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
+    MLPClassifier(alpha=1),
     AdaBoostClassifier(),
     GaussianNB(),
     LDA(),

examples/neural_networks/plot_mlp_alpha.py

Lines changed: 3 additions & 3 deletions
@@ -28,18 +28,18 @@
 from sklearn.cross_validation import train_test_split
 from sklearn.preprocessing import StandardScaler
 from sklearn.datasets import make_moons, make_circles, make_classification
-from sklearn.neural_network import MultilayerPerceptronClassifier
+from sklearn.neural_network import MLPClassifier
 
 h = .02  # step size in the mesh
 
-alphas = np.logspace(-4, 4, 5)
+alphas = np.logspace(-5, 3, 5)
 names = []
 for i in alphas:
     names.append('alpha ' + str(i))
 
 classifiers = []
 for i in alphas:
-    classifiers.append(MultilayerPerceptronClassifier(alpha=i, random_state=1))
+    classifiers.append(MLPClassifier(alpha=i, random_state=1))
 
 X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                            random_state=0, n_clusters_per_class=1)

examples/neural_networks/plot_mlp_nonlinear.py

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@
 
 import numpy as np
 from matplotlib import pyplot as plt
-from sklearn.neural_network import MultilayerPerceptronClassifier
+from sklearn.neural_network import MLPClassifier
 
 # generate datapoints
 xx, yy = np.meshgrid(np.linspace(-3, 3, 500),
@@ -27,7 +27,7 @@
 Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
 
 # fit the model
-clf = MultilayerPerceptronClassifier()
+clf = MLPClassifier()
 clf.fit(X, Y)
 
 # plot the decision function for each datapoint on the grid
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+"""
+==================================================
+Compare SGD learning strategies for MLPClassifier
+==================================================
+
+This example visualizes some training loss curves for different SGD mini-batch
+learning strategies. Because of time-constraints, we use several small
+datasets, for which L-BFGS might be more suitable. The general trend shown in
+these examples seems to carry over to larger datasets, however.
+"""
+print(__doc__)
+import matplotlib.pyplot as plt
+from sklearn.neural_network import MLPClassifier
+from sklearn.preprocessing import MinMaxScaler
+from sklearn import datasets
+
+# different learning rate schedules and momentum parameters
+params = [{'learning_rate': 'constant', 'momentum': 0},
+          {'learning_rate': 'constant', 'momentum': .9, 'nesterovs_momentum': False},
+          {'learning_rate': 'constant', 'momentum': .9, 'nesterovs_momentum': True},
+          {'learning_rate': 'invscaling', 'momentum': 0},
+          {'learning_rate': 'invscaling', 'momentum': .9, 'nesterovs_momentum': True},
+          {'learning_rate': 'invscaling', 'momentum': .9, 'nesterovs_momentum': False}]
+
+labels = ["constant learning-rate", "constant with momentum",
+          "constant with Nesterov's momentum",
+          "inv-scaling learning-rate", "inv-scaling with momentum",
+          "inv-scaling with Nesterov's momentum"]
+
+plot_args = [{'c': 'red', 'linestyle': '-'},
+             {'c': 'green', 'linestyle': '-'},
+             {'c': 'blue', 'linestyle': '-'},
+             {'c': 'red', 'linestyle': '--'},
+             {'c': 'green', 'linestyle': '--'},
+             {'c': 'blue', 'linestyle': '--'}]
+
+
+def plot_on_dataset(X, y, ax, name):
+    # for each dataset, plot learning for each learning strategy
+    print("\nlearning on dataset %s" % name)
+    ax.set_title(name)
+    X = MinMaxScaler().fit_transform(X)
+    mlps = []
+    if name == "digits":
+        # digits is larger but converges fairly quickly
+        max_iter = 15
+    else:
+        max_iter = 400
+
+    for label, param in zip(labels, params):
+        print("training: %s" % label)
+        mlp = MLPClassifier(verbose=0, algorithm='sgd', random_state=0,
+                            max_iter=max_iter, **param)
+        mlp.fit(X, y)
+        mlps.append(mlp)
+        print("Training set score: %f" % mlp.score(X, y))
+        print("Training set loss: %f" % mlp.loss_)
+    for mlp, label, args in zip(mlps, labels, plot_args):
+        ax.plot(mlp.loss_curve_, label=label, **args)
+
+
+fig, axes = plt.subplots(2, 2, figsize=(15, 10))
+# load / generate some toy datasets
+iris = datasets.load_iris()
+digits = datasets.load_digits()
+data_sets = [(iris.data, iris.target),
+             (digits.data, digits.target),
+             datasets.make_circles(noise=0.2, factor=0.5, random_state=1),
+             datasets.make_moons(noise=0.3, random_state=0)]
+
+for ax, data, name in zip(axes.ravel(), data_sets, ['iris', 'digits',
+                                                    'circles', 'moons']):
+    plot_on_dataset(*data, ax=ax, name=name)
+
+fig.legend(ax.get_lines(), labels=labels, ncol=3, loc="upper center")
+plt.show()
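The new example above sweeps the 'constant' and 'invscaling' learning-rate schedules, each with and without (Nesterov's) momentum. As a rough sketch of what those schedule names are assumed to mean for the per-iteration step size (the exact bookkeeping lives inside the SGD optimizer, so treat this as illustrative only)::

    def effective_learning_rate(schedule, t, learning_rate_init=0.2, power_t=0.5):
        """Assumed step size at iteration t (t >= 1) under the given schedule."""
        if schedule == 'constant':
            return learning_rate_init
        elif schedule == 'invscaling':
            # gradually shrinks the step size as training progresses
            return learning_rate_init / (t ** power_t)
        raise ValueError(schedule)

    for t in (1, 10, 100):
        print(t, effective_learning_rate('constant', t),
              effective_learning_rate('invscaling', t))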
