Commit c93b7a8

committed: iterate, improve. Nesterov's momentum.
1 parent 7f70965 commit c93b7a8

12 files changed: +663 −418 lines
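The commit message refers to Nesterov's momentum, which shows up below as the new ``nesterovs_momentum`` parameter in the ``MLPClassifier`` reprs and in the SGD-strategies example. As a rough orientation only, here is a hedged sketch of the two momentum variants such a flag typically toggles; the helper name ``sgd_step`` and the exact update forms are assumptions for illustration, not the code added by this commit::

    import numpy as np

    def sgd_step(w, grad, velocity, lr=0.1, momentum=0.9, nesterov=True):
        """One illustrative momentum update on parameters ``w``."""
        velocity = momentum * velocity - lr * grad
        if nesterov:
            # "look-ahead" variant: apply the momentum term once more
            w = w + momentum * velocity - lr * grad
        else:
            # classical (heavy-ball) momentum
            w = w + velocity
        return w, velocity

    # toy usage on the quadratic loss f(w) = 0.5 * ||w||^2, whose gradient is w
    w, v = np.array([1.0, -2.0]), np.zeros(2)
    for _ in range(100):
        w, v = sgd_step(w, grad=w, velocity=v)
    print(w)  # should end up close to the minimum at [0, 0]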

benchmarks/bench_mnist.py

Lines changed: 11 additions & 12 deletions
@@ -9,20 +9,19 @@
 covertype dataset, the feature space is homogenous.
 
 Example of output :
-
 [..]
+
 Classification performance:
 ===========================
-    Classifier               train-time   test-time   error-rat
+    Classifier               train-time   test-time   error-rate
 ------------------------------------------------------------
-    MultilayerPerceptron      1308.66s       0.31s       0.0184
-    Nystroem-SVM               105.07s       0.91s       0.0227
-    ExtraTrees                  48.20s       1.22s       0.0288
-    RandomForest                47.17s       1.21s       0.0304
-    SampledRBF-SVM             140.45s       0.84s       0.0486
-    CART                        22.84s       0.16s       0.1214
-    dummy                        0.01s       0.02s       0.8973
-
+    MultilayerPerceptron       475.76s       1.31s       0.0201
+    Nystroem-SVM               218.38s      17.86s       0.0229
+    ExtraTrees                  45.54s       0.52s       0.0288
+    RandomForest                44.79s       0.32s       0.0304
+    SampledRBF-SVM             265.64s      19.78s       0.0488
+    CART                        21.13s       0.01s       0.1214
+    dummy                        0.01s       0.01s       0.8973
 """
 from __future__ import division, print_function
 
@@ -48,7 +47,7 @@
 from sklearn.svm import LinearSVC
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.utils import check_array
-from sklearn.neural_network import MultilayerPerceptronClassifier
+from sklearn.neural_network import MLPClassifier
 
 # Memoize the data extraction and memory map the resulting
 # train / test splits in readonly mode
@@ -89,7 +88,7 @@ def load_data(dtype=np.float32, order='F'):
         make_pipeline(Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100)),
     'SampledRBF-SVM':
         make_pipeline(RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100)),
-    'MultilayerPerceptron': MultilayerPerceptronClassifier(
+    'MultilayerPerceptron': MLPClassifier(
         hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
         algorithm='sgd', learning_rate_init=0.5, momentum=0.9, verbose=1,
         tol=1e-4, random_state=1)
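The table in the module docstring above reports wall-clock training time, prediction time, and test error for each estimator. A minimal, hypothetical sketch of that timing pattern is shown below; it uses the small digits dataset rather than MNIST so it finishes in seconds, and the estimator settings here are illustrative rather than the benchmark's (imports follow the ones used elsewhere in this branch)::

    from time import time

    from sklearn import datasets
    from sklearn.cross_validation import train_test_split
    from sklearn.neural_network import MLPClassifier

    digits = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        digits.data / 16., digits.target, random_state=0)

    clf = MLPClassifier(hidden_layer_sizes=(100,), algorithm='sgd',
                        learning_rate_init=0.1, max_iter=50, random_state=1)

    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0

    t0 = time()
    error_rate = 1. - clf.score(X_test, y_test)
    test_time = time() - t0

    print("train-time: %.2fs  test-time: %.2fs  error-rate: %.4f"
          % (train_time, test_time, error_rate))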

doc/modules/classes.rst

Lines changed: 2 additions & 2 deletions
@@ -1007,8 +1007,8 @@ See the :ref:`metrics` section of the user guide for further details.
    :template: class.rst
 
    neural_network.BernoulliRBM
-   neural_network.MultilayerPerceptronClassifier
-   neural_network.MultilayerPerceptronRegressor
+   neural_network.MLPClassifier
+   neural_network.MLPRegressor
 
 
 .. _calibration_ref:

doc/modules/neural_networks_supervised.rst

Lines changed: 47 additions & 34 deletions
@@ -33,10 +33,10 @@ layer transforms the values from the previous layer by a weighted linear summation
 :math:`g(\cdot):R \rightarrow R` - like the hyperbolic tan function. The output layer
 receives the values from the last hidden layer and transforms them into output values.
 
-The module contains the public attributes ``layers_coef_`` and ``layers_intercept_``.
-``layers_coef_`` is a list of weight matrices, where weight matrix at index
+The module contains the public attributes ``coefs_`` and ``intercepts_``.
+``coefs_`` is a list of weight matrices, where weight matrix at index
 :math:`i` represents the weights between layer :math:`i` and layer
-:math:`i+1`. ``layers_intercept_`` is a list of bias vectors, where the vector
+:math:`i+1`. ``intercepts_`` is a list of bias vectors, where the vector
 at index :math:`i` represents the bias values added to layer :math:`i+1`.
 
 The advantages of Multi-layer Perceptron are:
@@ -68,45 +68,45 @@ some of these disadvantages.
 Classification
 ==============
 
-Class :class:`MultilayerPerceptronClassifier` implements
+Class :class:`MLPClassifier` implements
 a multi layer perceptron (MLP) algorithm that trains using Backpropagation.
 
 MLP trains on two arrays: array X of size (n_samples, n_features), which holds
 the training samples represented as floating point feature vectors; and array
 y of size (n_samples,), which holds the target values (class labels) for the
 training samples::
 
-    >>> from sklearn.neural_network import MultilayerPerceptronClassifier
+    >>> from sklearn.neural_network import MLPClassifier
     >>> X = [[0., 0.], [1., 1.]]
     >>> y = [0, 1]
-    >>> clf = MultilayerPerceptronClassifier(hidden_layer_sizes=(5, 2), random_state=1)
-    >>> clf.fit(X, y)
-    MultilayerPerceptronClassifier(activation='relu', algorithm='l-bfgs',
-           alpha=1e-05, batch_size=200, hidden_layer_sizes=(5, 2),
-           learning_rate='constant', learning_rate_init=0.5,
-           max_iter=200, power_t=0.5, random_state=1, shuffle=False,
-           tol=1e-05, verbose=False, warm_start=False)
+    >>> clf = MLPClassifier(hidden_layer_sizes=(5, 2), random_state=1)
+    >>> clf.fit(X, y)  # doctest: +NORMALIZE_WHITESPACE
+    MLPClassifier(activation='relu', algorithm='l-bfgs', alpha=1e-05,
+           batch_size=200, early_stopping=False, hidden_layer_sizes=(5, 2),
+           learning_rate='constant', learning_rate_init=0.2, max_iter=200,
+           momentum=0.9, nesterovs_momentum=True, power_t=0.5, random_state=1,
+           shuffle=True, tol=0.0001, verbose=False, warm_start=False)
 
 After fitting (training), the model can predict labels for new samples::
 
     >>> clf.predict([[2., 2.], [-1., -2.]])
     array([1, 0])
 
-MLP can fit a non-linear model to the training data. ``clf.layers_coef_``
+MLP can fit a non-linear model to the training data. ``clf.coefs_``
 contains the weight matrices that constitute the model parameters::
 
-    >>> [coef.shape for coef in clf.layers_coef_]
+    >>> [coef.shape for coef in clf.coefs_]
     [(2, 5), (5, 2), (2, 1)]
 
 To get the raw values before applying the output activation function, run the
 following command,
 
-use :meth:`MultilayerPerceptronClassifier.decision_function`::
+use :meth:`MLPClassifier.decision_function`::
 
     >>> clf.decision_function([[2., 2.], [1., 2.]])  # doctest: +ELLIPSIS
-    array([ 11.55...,  11.55...])
+    array([ 47.6...,  47.6...])
 
-Currently, :class:`MultilayerPerceptronClassifier` supports only the
+Currently, :class:`MLPClassifier` supports only the
 Cross-Entropy loss function, which allows probability estimates by running the
 ``predict_proba`` method.
 
@@ -115,36 +115,36 @@ Cross-Entropy loss function, giving a vector of probability estimates
 :math:`P(y|x)` per sample :math:`x`::
 
     >>> clf.predict_proba([[2., 2.], [1., 2.]])  # doctest: +ELLIPSIS
-    array([[ 9.5...e-06,   9.99...e-01],
-           [ 9.5...e-06,   9.99...e-01]])
+    array([[ 0.,  1.],
+           [ 0.,  1.]])
 
-:class:`MultilayerPerceptronClassifier` supports multi-class classification by
+:class:`MLPClassifier` supports multi-class classification by
 applying `Softmax <http://en.wikipedia.org/wiki/Softmax_activation_function>`_
 as the output function.
 
 Further, the algorithm supports :ref:`multi-label classification <multiclass>`
 in which a sample can belong to more than one class. For each class, the output
-of :meth:`MultilayerPerceptronClassifier.decision_function` passes through the
+of :meth:`MLPClassifier.decision_function` passes through the
 logistic function. Values larger or equal to `0.5` are rounded to `1`,
 otherwise to `0`. For a predicted output of a sample, the indices where the
 value is `1` represents the assigned classes of that samples::
 
     >>> X = [[0., 0.], [1., 1.]]
-    >>> y = [[0, 1], [1]]
-    >>> clf = MultilayerPerceptronClassifier(hidden_layer_sizes=(15,), random_state=1)
+    >>> y = [[0, 1], [1, 1]]
+    >>> clf = MLPClassifier(hidden_layer_sizes=(15,), random_state=1)
     >>> clf.fit(X, y)
-    MultilayerPerceptronClassifier(activation='relu', algorithm='l-bfgs',
-           alpha=1e-05, batch_size=200, hidden_layer_sizes=(15,),
-           learning_rate='constant', learning_rate_init=0.5,
-           max_iter=200, power_t=0.5, random_state=1, shuffle=False,
-           tol=1e-05, verbose=False, warm_start=False)
+    MLPClassifier(activation='relu', algorithm='l-bfgs', alpha=1e-05,
+           batch_size=200, early_stopping=False, hidden_layer_sizes=(15,),
+           learning_rate='constant', learning_rate_init=0.2, max_iter=200,
+           momentum=0.9, nesterovs_momentum=True, power_t=0.5, random_state=1,
+           shuffle=True, tol=0.0001, verbose=False, warm_start=False)
     >>> clf.predict([1., 2.])
-    [(1,)]
+    array([[1, 1]])
     >>> clf.predict([0., 0.])
-    [(0, 1)]
+    array([[0, 1]])
 
 See the examples below and the doc string of
-:meth:`MultilayerPerceptronClassifier.fit` for further information.
+:meth:`MLPClassifier.fit` for further information.
 
 .. topic:: Examples:
 
@@ -155,12 +155,12 @@ See the examples below and the doc string of
 Regression
 ==========
 
-Class :class:`MultilayerPerceptronRegressor` implements
+Class :class:`MLPRegressor` implements
 a multi layer perceptron (MLP) that trains using backpropagation with no
 activation function in the output layer. Therefore, it uses the square error as
 the loss function, and the output is a set of continuous values.
 
-:class:`MultilayerPerceptronRegressor` also supports multi-output regression, in
+:class:`MLPRegressor` also supports multi-output regression, in
 which a sample can have more than one target.
 
 
@@ -308,9 +308,22 @@ Tips on Practical Use
   * Empirically, we observed that `L-BFGS` converges faster and
     with better solutions than `SGD`. Therefore, if mini-batch
    and online learning are unnecessary, it is best advised
-    to set :meth:`MultilayerPerceptronClassifier.algorithm` as
+    to set :meth:`MLPClassifier.algorithm` as
     'l-bfgs'.
 
+More control with warm_start
+============================
+If you want more control over stopping criteria or learning rate in SGD,
+or want to do additional monitoring, using ``warm_start=True`` and
+``max_iter=1`` and iterating yourself can be helpful::
+
+    >>> X = [[0., 0.], [1., 1.]]
+    >>> y = [0, 1]
+    >>> clf = MLPClassifier(hidden_layer_sizes=(15,), random_state=1, max_iter=1)
+    >>> for i in range(10):
+    ...     clf.fit(X, y)
+    ...     # additional monitoring / inspection  # doctest: +ELLIPSIS
+    MLPClassifier(...
 
 .. topic:: References:
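The renamed ``coefs_`` and ``intercepts_`` attributes documented above hold one weight matrix and one bias vector per layer. As a hedged illustration of that structure (assuming the default 'relu' hidden activation shown in the reprs, and leaving the output layer un-activated, matching the "raw values" wording), a manual forward pass over those attributes should reproduce what ``decision_function`` returns on this branch; ``manual_decision_function`` below is a made-up helper, not part of the library::

    import numpy as np
    from sklearn.neural_network import MLPClassifier

    X = [[0., 0.], [1., 1.]]
    y = [0, 1]
    clf = MLPClassifier(hidden_layer_sizes=(5, 2), random_state=1).fit(X, y)

    def manual_decision_function(clf, X):
        activation = np.asarray(X, dtype=float)
        n_layers = len(clf.coefs_)
        for i, (W, b) in enumerate(zip(clf.coefs_, clf.intercepts_)):
            activation = activation.dot(W) + b
            if i < n_layers - 1:
                # hidden layers: assumed 'relu' activation
                activation = np.maximum(activation, 0)
        return activation  # raw output, no final activation applied

    # values should agree; the library may flatten the binary case to 1-D
    print(manual_decision_function(clf, [[2., 2.], [1., 2.]]))
    print(clf.decision_function([[2., 2.], [1., 2.]]))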

examples/classification/plot_classifier_comparison.py

Lines changed: 3 additions & 1 deletion
@@ -34,6 +34,7 @@
 from sklearn.cross_validation import train_test_split
 from sklearn.preprocessing import StandardScaler
 from sklearn.datasets import make_moons, make_circles, make_classification
+from sklearn.neural_network import MLPClassifier
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.svm import SVC
 from sklearn.tree import DecisionTreeClassifier
@@ -45,13 +46,14 @@
 h = .02  # step size in the mesh
 
 names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
-         "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
+         "Random Forest", "Neural Net", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
 classifiers = [
     KNeighborsClassifier(3),
     SVC(kernel="linear", C=0.025),
     SVC(gamma=2, C=1),
     DecisionTreeClassifier(max_depth=5),
     RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
+    MLPClassifier(alpha=1),
     AdaBoostClassifier(),
     GaussianNB(),
     LDA(),

examples/neural_networks/plot_mlp_alpha.py

Lines changed: 3 additions & 3 deletions
@@ -28,18 +28,18 @@
 from sklearn.cross_validation import train_test_split
 from sklearn.preprocessing import StandardScaler
 from sklearn.datasets import make_moons, make_circles, make_classification
-from sklearn.neural_network import MultilayerPerceptronClassifier
+from sklearn.neural_network import MLPClassifier
 
 h = .02  # step size in the mesh
 
-alphas = np.logspace(-4, 4, 5)
+alphas = np.logspace(-5, 3, 5)
 names = []
 for i in alphas:
     names.append('alpha ' + str(i))
 
 classifiers = []
 for i in alphas:
-    classifiers.append(MultilayerPerceptronClassifier(alpha=i, random_state=1))
+    classifiers.append(MLPClassifier(alpha=i, random_state=1))
 
 X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                            random_state=0, n_clusters_per_class=1)

examples/neural_networks/plot_mlp_nonlinear.py

Lines changed: 2 additions & 2 deletions
@@ -17,7 +17,7 @@
 
 import numpy as np
 from matplotlib import pyplot as plt
-from sklearn.neural_network import MultilayerPerceptronClassifier
+from sklearn.neural_network import MLPClassifier
 
 # generate datapoints
 xx, yy = np.meshgrid(np.linspace(-3, 3, 500),
@@ -27,7 +27,7 @@
 Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)
 
 # fit the model
-clf = MultilayerPerceptronClassifier()
+clf = MLPClassifier()
 clf.fit(X, Y)
 
 # plot the decision function for each datapoint on the grid
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+"""
+==================================================
+Compare SGD learning strategies for MLPClassifier
+==================================================
+
+This example visualizes some training loss curves for different SGD mini-batch
+learning strategies. Because of time-constraints, we use several small
+datasets, for which L-BFGS might be more suitable. The general trend shown in
+these examples seems to carry over to larger datasets, however.
+"""
+print(__doc__)
+import matplotlib.pyplot as plt
+from sklearn.neural_network import MLPClassifier
+from sklearn.preprocessing import MinMaxScaler
+from sklearn import datasets
+
+# different learning rate schedules and momentum parameters
+params = [{'learning_rate': 'constant', 'momentum': 0},
+          {'learning_rate': 'constant', 'momentum': .9, 'nesterovs_momentum': False},
+          {'learning_rate': 'constant', 'momentum': .9, 'nesterovs_momentum': True},
+          {'learning_rate': 'invscaling', 'momentum': 0},
+          {'learning_rate': 'invscaling', 'momentum': .9, 'nesterovs_momentum': True},
+          {'learning_rate': 'invscaling', 'momentum': .9, 'nesterovs_momentum': False}]
+
+labels = ["constant learning-rate", "constant with momentum",
+          "constant with Nesterov's momentum",
+          "inv-scaling learning-rate", "inv-scaling with momentum",
+          "inv-scaling with Nesterov's momentum"]
+
+plot_args = [{'c': 'red', 'linestyle': '-'},
+             {'c': 'green', 'linestyle': '-'},
+             {'c': 'blue', 'linestyle': '-'},
+             {'c': 'red', 'linestyle': '--'},
+             {'c': 'green', 'linestyle': '--'},
+             {'c': 'blue', 'linestyle': '--'}]
+
+
+def plot_on_dataset(X, y, ax, name):
+    # for each dataset, plot learning for each learning strategy
+    print("\nlearning on dataset %s" % name)
+    ax.set_title(name)
+    X = MinMaxScaler().fit_transform(X)
+    mlps = []
+    if name == "digits":
+        # digits is larger but converges fairly quickly
+        max_iter = 15
+    else:
+        max_iter = 400
+
+    for label, param in zip(labels, params):
+        print("training: %s" % label)
+        mlp = MLPClassifier(verbose=0, algorithm='sgd', random_state=0,
+                            max_iter=max_iter, **param)
+        mlp.fit(X, y)
+        mlps.append(mlp)
+        print("Training set score: %f" % mlp.score(X, y))
+        print("Training set loss: %f" % mlp.loss_)
+    for mlp, label, args in zip(mlps, labels, plot_args):
+        ax.plot(mlp.loss_curve_, label=label, **args)
+
+
+fig, axes = plt.subplots(2, 2, figsize=(15, 10))
+# load / generate some toy datasets
+iris = datasets.load_iris()
+digits = datasets.load_digits()
+data_sets = [(iris.data, iris.target),
+             (digits.data, digits.target),
+             datasets.make_circles(noise=0.2, factor=0.5, random_state=1),
+             datasets.make_moons(noise=0.3, random_state=0)]
+
+for ax, data, name in zip(axes.ravel(), data_sets, ['iris', 'digits',
+                                                    'circles', 'moons']):
+    plot_on_dataset(*data, ax=ax, name=name)
+
+fig.legend(ax.get_lines(), labels=labels, ncol=3, loc="upper center")
+plt.show()
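The new example above sweeps the 'constant' and 'invscaling' learning-rate schedules, each with and without (Nesterov's) momentum. As a rough sketch of what those schedule names are assumed to mean for the per-iteration step size (the exact bookkeeping lives inside the SGD optimizer, so treat this as illustrative only)::

    def effective_learning_rate(schedule, t, learning_rate_init=0.2, power_t=0.5):
        """Assumed step size at iteration t (t >= 1) under the given schedule."""
        if schedule == 'constant':
            return learning_rate_init
        elif schedule == 'invscaling':
            # gradually shrinks the step size as training progresses
            return learning_rate_init / (t ** power_t)
        raise ValueError(schedule)

    for t in (1, 10, 100):
        print(t, effective_learning_rate('constant', t),
              effective_learning_rate('invscaling', t))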
