Commit bc67720

refactor mlp optimization methods into _fit_lbfgs and _fit_sgd, add Ctrl+C stop option for SGD
1 parent 8ee1b30 commit bc67720

File tree

2 files changed: +95 −62


benchmarks/bench_mnist.py

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@ def benchmark(clf):
     err = metrics.zero_one_loss(y_test, pred)
     return err, train_time, test_time

-# Train MultilayerPerceptron model
+# Train Logistic Regression model
 classifiers['LogisticRegression'] = LogisticRegression()

 # Train MultilayerPerceptron model

sklearn/neural_network/multilayer_perceptron.py

Lines changed: 94 additions & 61 deletions
@@ -38,7 +38,7 @@ class BaseMultilayerPerceptron(six.with_metaclass(ABCMeta, BaseEstimator)):
     def __init__(self, hidden_layer_sizes, activation, algorithm,
                  alpha, batch_size, learning_rate, learning_rate_init, power_t,
                  max_iter, loss, shuffle, random_state, tol, verbose,
-                 warm_start):
+                 warm_start, momentum):
         self.activation = activation
         self.algorithm = algorithm
         self.alpha = alpha
@@ -54,6 +54,7 @@ def __init__(self, hidden_layer_sizes, activation, algorithm,
         self.tol = tol
         self.verbose = verbose
         self.warm_start = warm_start
+        self.momentum = momentum

         self.layers_coef_ = None
         self.layers_intercept_ = None
@@ -373,9 +374,65 @@ def _fit(self, X, y, incremental=False):

         # Run the Stochastic Gradient Descent algorithm
         if self.algorithm == 'sgd':
-            prev_cost = np.inf
-            cost_increase_count = 0
+            self._fit_sgd(X, y, activations, deltas, coef_grads,
+                          intercept_grads, layer_units, incremental)

+        # Run the LBFGS algorithm
+        elif self.algorithm == 'l-bfgs':
+            self._fit_lbfgs(X, y, activations, deltas, coef_grads,
+                            intercept_grads, layer_units)
+        return self
+
+    def _fit_lbfgs(self, X, y, activations, deltas, coef_grads, intercept_grads,
+                   layer_units):
+        # Store meta information for the parameters
+        self._coef_indptr = []
+        self._intercept_indptr = []
+        start = 0
+
+        # Save sizes and indices of coefficients for faster unpacking
+        for i in range(self.n_layers_ - 1):
+            n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1]
+
+            end = start + (n_fan_in * n_fan_out)
+            self._coef_indptr.append((start, end, (n_fan_in, n_fan_out)))
+            start = end
+
+        # Save sizes and indices of intercepts for faster unpacking
+        for i in range(self.n_layers_ - 1):
+            end = start + layer_units[i + 1]
+            self._intercept_indptr.append((start, end))
+            start = end
+
+        # Run LBFGS
+        packed_coef_inter = _pack(self.layers_coef_,
+                                  self.layers_intercept_)
+
+        if self.verbose is True or self.verbose >= 1:
+            iprint = 1
+        else:
+            iprint = -1
+
+        optimal_parameters, self.cost_, d = fmin_l_bfgs_b(
+            x0=packed_coef_inter,
+            func=self._cost_grad_lbfgs,
+            maxfun=self.max_iter,
+            iprint=iprint,
+            pgtol=self.tol,
+            args=(X, y, activations, deltas, coef_grads, intercept_grads))
+
+        self._unpack(optimal_parameters)
+
+    def _fit_sgd(self, X, y, activations, deltas, coef_grads, intercept_grads,
+                 layer_units, incremental):
+        prev_cost = np.inf
+        cost_increase_count = 0
+        n_samples = X.shape[0]
+        batch_size = np.clip(self.batch_size, 1, n_samples)
+        intercept_update_prev = [np.zeros_like(grads) for grads in intercept_grads]
+        coef_update_prev = [np.zeros_like(grads) for grads in coef_grads]
+
+        try:
             for i in range(self.max_iter):
                 for batch_slice in gen_batches(n_samples, batch_size):
                     activations[0] = X[batch_slice]
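
The new `_fit_lbfgs` flattens every weight matrix and intercept vector into a single 1-D vector for `fmin_l_bfgs_b`, recording `(start, end, shape)` pointers so the optimizer's flat result can be reshaped back into per-layer arrays. A minimal standalone sketch of that bookkeeping with toy layer sizes; the names below are illustrative and this is not the module's actual `_pack`/`_unpack` code:

    import numpy as np

    # Toy sizes: n_features=4, one hidden layer of 3 units, 2 outputs.
    layer_units = [4, 3, 2]
    coefs = [np.ones((4, 3)), np.ones((3, 2))]
    intercepts = [np.zeros(3), np.zeros(2)]

    # Record (start, end[, shape]) of each block in the flattened vector.
    coef_indptr, intercept_indptr, start = [], [], 0
    for n_in, n_out in zip(layer_units[:-1], layer_units[1:]):
        end = start + n_in * n_out
        coef_indptr.append((start, end, (n_in, n_out)))
        start = end
    for n_out in layer_units[1:]:
        intercept_indptr.append((start, start + n_out))
        start += n_out

    # Pack: one flat vector of length 12 + 6 + 3 + 2 = 23.
    packed = np.hstack([a.ravel() for a in coefs + intercepts])

    # Unpack: reverse the bookkeeping, e.g. for the first weight matrix.
    s, e, shape = coef_indptr[0]
    assert packed[s:e].reshape(shape).shape == (4, 3)
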
@@ -386,10 +443,13 @@ def _fit(self, X, y, incremental=False):

                     # update weights
                     for i in range(self.n_layers_ - 1):
-                        self.layers_coef_[i] -= (self.learning_rate_ *
-                                                 coef_grads[i])
-                        self.layers_intercept_[i] -= (self.learning_rate_ *
-                                                      intercept_grads[i])
+                        coef_update_prev[i] = ((1 - self.momentum) * coef_grads[i]
+                                               + self.momentum * coef_update_prev[i])
+                        self.layers_coef_[i] -= self.learning_rate_ * coef_update_prev[i]
+
+                        intercept_update_prev[i] = ((1 - self.momentum) * intercept_grads[i]
+                                                    + self.momentum * intercept_update_prev[i])
+                        self.layers_intercept_[i] -= self.learning_rate_ * intercept_update_prev[i]

                     if self.learning_rate == 'invscaling':
                         self.learning_rate_ = self.learning_rate_init / \
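
The replaced update above keeps an exponentially weighted average of recent gradients instead of applying each minibatch gradient directly; with momentum=0 it reduces exactly to the old plain SGD step. A minimal NumPy sketch of that rule, with illustrative names (`w`, `grad`, `velocity`) and toy numbers:

    import numpy as np

    def momentum_step(w, grad, velocity, learning_rate, momentum):
        # Convex combination of the new gradient and the previous update,
        # mirroring coef_update_prev / intercept_update_prev in the diff above.
        velocity = (1 - momentum) * grad + momentum * velocity
        return w - learning_rate * velocity, velocity

    w, velocity = np.array([1.0]), np.zeros(1)
    w, velocity = momentum_step(w, np.array([0.5]), velocity, 0.1, 0.9)   # velocity = 0.05
    w, velocity = momentum_step(w, np.array([-0.5]), velocity, 0.1, 0.9)  # velocity = -0.005
    # The sign flip in the second gradient is damped by the carried-over
    # velocity, which is the point of the momentum term. With momentum=0
    # each call is exactly w -= learning_rate * grad.
    print(w, velocity)
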
@@ -423,47 +483,8 @@ def _fit(self, X, y, incremental=False):
                 warnings.warn('SGD: Maximum iterations have reached and'
                               ' the optimization hasn\'t converged yet.'
                               % (), ConvergenceWarning)
-        # Run the LBFGS algorithm
-        elif self.algorithm == 'l-bfgs':
-            # Store meta information for the parameters
-            self._coef_indptr = []
-            self._intercept_indptr = []
-            start = 0
-
-            # Save sizes and indices of coefficients for faster unpacking
-            for i in range(self.n_layers_ - 1):
-                n_fan_in, n_fan_out = layer_units[i], layer_units[i + 1]
-
-                end = start + (n_fan_in * n_fan_out)
-                self._coef_indptr.append((start, end, (n_fan_in, n_fan_out)))
-                start = end
-
-            # Save sizes and indices of intercepts for faster unpacking
-            for i in range(self.n_layers_ - 1):
-                end = start + layer_units[i + 1]
-                self._intercept_indptr.append((start, end))
-                start = end
-
-            # Run LBFGS
-            packed_coef_inter = _pack(self.layers_coef_,
-                                      self.layers_intercept_)
-
-            if self.verbose is True or self.verbose >= 1:
-                iprint = 1
-            else:
-                iprint = -1
-
-            optimal_parameters, self.cost_, d = fmin_l_bfgs_b(
-                x0=packed_coef_inter,
-                func=self._cost_grad_lbfgs,
-                maxfun=self.max_iter,
-                iprint=iprint,
-                pgtol=self.tol,
-                args=(X, y, activations, deltas, coef_grads, intercept_grads))
-
-            self._unpack(optimal_parameters)
-
-        return self
+        except KeyboardInterrupt:
+            pass

     def fit(self, X, y):
         """Fit the model to the data X and target y.
@@ -592,6 +613,8 @@ class MultilayerPerceptronClassifier(BaseMultilayerPerceptron,
         each time step 't' using an inverse scaling exponent of 'power_t'.
         learning_rate_ = learning_rate_init / pow(t, power_t)

+        Only used when algorithm='sgd'.
+
     max_iter : int, optional, default 200
         Maximum number of iterations. The algorithm
         iterates until convergence (determined by 'tol') or
@@ -600,7 +623,7 @@ class MultilayerPerceptronClassifier(BaseMultilayerPerceptron,
     random_state : int or RandomState, optional, default None
         State of or seed for random number generator.

-    shuffle : bool, optional, default False
+    shuffle : bool, optional, default True
         Whether to shuffle samples in each iteration before extracting
         minibatches.
@@ -611,12 +634,12 @@ class MultilayerPerceptronClassifier(BaseMultilayerPerceptron,

     learning_rate_init : double, optional, default 0.5
         The initial learning rate used. It controls the step-size
-        in updating the weights.
+        in updating the weights. Only used when algorithm='sgd'.

     power_t : double, optional, default 0.5
         The exponent for inverse scaling learning rate.
         It is used in updating learning_rate_init when the learning_rate
-        is set to 'invscaling'.
+        is set to 'invscaling'. Only used when algorithm='sgd'.

     verbose : bool, optional, default False
         Whether to print progress messages to stdout.
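
For reference, the 'invscaling' schedule these parameters feed into is the one quoted earlier in this docstring, learning_rate_ = learning_rate_init / pow(t, power_t). A quick worked example with the classifier defaults (learning_rate_init=0.5, power_t=0.5), purely illustrative:

    learning_rate_init, power_t = 0.5, 0.5
    for t in (1, 4, 100, 10000):
        print(t, learning_rate_init / pow(t, power_t))
    # 1 -> 0.5, 4 -> 0.25, 100 -> 0.05, 10000 -> 0.005
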
@@ -626,6 +649,10 @@ class MultilayerPerceptronClassifier(BaseMultilayerPerceptron,
         call to fit as initialization, otherwise, just erase the
         previous solution.

+    momentum : float, default 0
+        Momentum for gradient descent update. Should be between 0 and 1. Only
+        used when algorithm='sgd'.
+
     Attributes
     ----------
     `classes_` : array or list of array of shape (n_classes,)
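
The new momentum parameter documented above only takes effect together with algorithm='sgd'. A hedged usage sketch against the API as it appears in this diff; the toy data is illustrative, and the import path assumes the class is exported from sklearn.neural_network on this branch:

    import numpy as np
    from sklearn.neural_network import MultilayerPerceptronClassifier  # branch API, assumed

    # Toy XOR-style data, purely illustrative.
    X = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
    y = np.array([0, 1, 1, 0])

    # momentum is ignored by the default 'l-bfgs' algorithm, so pair it with 'sgd'.
    clf = MultilayerPerceptronClassifier(hidden_layer_sizes=(5,), algorithm='sgd',
                                         learning_rate_init=0.5, momentum=0.9,
                                         max_iter=200, random_state=0)
    clf.fit(X, y)
    print(clf.predict(X))
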
@@ -686,8 +713,8 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu",
                  algorithm='l-bfgs', alpha=0.00001,
                  batch_size=200, learning_rate="constant",
                  learning_rate_init=0.5, power_t=0.5, max_iter=200,
-                 shuffle=False, random_state=None, tol=1e-5,
-                 verbose=False, warm_start=False):
+                 shuffle=True, random_state=None, tol=1e-5,
+                 verbose=False, warm_start=False, momentum=0):

         sup = super(MultilayerPerceptronClassifier, self)
         sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
@@ -696,7 +723,7 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu",
                      learning_rate_init=learning_rate_init, power_t=power_t,
                      max_iter=max_iter, loss='log_loss', shuffle=shuffle,
                      random_state=random_state, tol=tol,
-                     verbose=verbose, warm_start=warm_start)
+                     verbose=verbose, warm_start=warm_start, momentum=momentum)

         self.label_binarizer_ = LabelBinarizer()

@@ -859,6 +886,8 @@ class MultilayerPerceptronRegressor(BaseMultilayerPerceptron, RegressorMixin):
         each time step 't' using an inverse scaling exponent of 'power_t'.
         learning_rate_ = learning_rate_init / pow(t, power_t)

+        Only used when algorithm='sgd'.
+
     max_iter : int, optional, default 200
         Maximum number of iterations. The algorithm
         iterates until convergence (determined by 'tol') or
@@ -867,9 +896,9 @@ class MultilayerPerceptronRegressor(BaseMultilayerPerceptron, RegressorMixin):
     random_state : int or RandomState, optional, default None
         State of or seed for random number generator.

-    shuffle : bool, optional, default False
+    shuffle : bool, optional, default True
         Whether to shuffle samples in each iteration before extracting
-        minibatches.
+        minibatches. Only used when algorithm='sgd'.

     tol : float, optional, default 1e-5
         Tolerance for the optimization. When the loss at iteration i+1 differs
@@ -878,12 +907,12 @@ class MultilayerPerceptronRegressor(BaseMultilayerPerceptron, RegressorMixin):

     learning_rate_init : double, optional, default 0.5
         The initial learning rate used. It controls the step-size
-        in updating the weights.
+        in updating the weights. Only used when algorithm='sgd'.

     power_t : double, optional, default 0.5
         The exponent for inverse scaling learning rate.
         It is used for updating learning_rate_ when it
-        is set to 'invscaling'.
+        is set to 'invscaling'. Only used when algorithm='sgd'.

     verbose : bool, optional, default False
         Whether to print progress messages to stdout.
@@ -893,6 +922,10 @@ class MultilayerPerceptronRegressor(BaseMultilayerPerceptron, RegressorMixin):
         call to fit as initialization, otherwise, just erase the
         previous solution.

+    momentum : float, default 0
+        Momentum for gradient descent update. Should be between 0 and 1. Only
+        used when algorithm='sgd'.
+
     Attributes
     ----------
     `cost_` : float
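
The regressor gains the same momentum parameter with the same caveat; a brief sketch along the lines of the classifier example above (toy data, import path assumed from this branch):

    import numpy as np
    from sklearn.neural_network import MultilayerPerceptronRegressor  # branch API, assumed

    X = np.linspace(0, 1, 20).reshape(-1, 1)   # toy 1-D inputs
    y = np.sin(2 * np.pi * X).ravel()          # toy regression target

    reg = MultilayerPerceptronRegressor(hidden_layer_sizes=(10,), algorithm='sgd',
                                        learning_rate_init=0.1, momentum=0.5,
                                        max_iter=100, random_state=0)
    reg.fit(X, y)
    print(reg.predict(X[:3]))
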
@@ -947,9 +980,9 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu",
                  algorithm='l-bfgs', alpha=0.00001,
                  batch_size=200, learning_rate="constant",
                  learning_rate_init=0.1,
-                 power_t=0.5, max_iter=100, shuffle=False,
+                 power_t=0.5, max_iter=100, shuffle=True,
                  random_state=None, tol=1e-5,
-                 verbose=False, warm_start=False):
+                 verbose=False, warm_start=False, momentum=0):

         sup = super(MultilayerPerceptronRegressor, self)
         sup.__init__(hidden_layer_sizes=hidden_layer_sizes,
@@ -958,7 +991,7 @@ def __init__(self, hidden_layer_sizes=(100,), activation="relu",
                      learning_rate_init=learning_rate_init, power_t=power_t,
                      max_iter=max_iter, loss='squared_loss', shuffle=shuffle,
                      random_state=random_state, tol=tol,
-                     verbose=verbose, warm_start=warm_start)
+                     verbose=verbose, warm_start=warm_start, momentum=momentum)

     def predict(self, X):
         """Predict using the multi-layer perceptron model.
